1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
57 #define SS_START_OF_LINE (1u << 0)
58 #define SS_START_OF_COMMAND (1u << 1)
60 static int segmenter_detect_command_name__ (const char *input,
61 size_t n, bool eof, int ofs);
64 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
67 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
75 mblen = u8_mbtoucr (puc, input, n);
79 return u8_mbtouc (puc, input, n);
90 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
91 bool eof, enum segment_type *type)
99 for (int ofs = 2; ; ofs++)
106 else if (input[ofs] == '\n')
108 if (input[ofs - 1] == '\r')
114 s->state = S_GENERAL;
115 s->substate = SS_START_OF_COMMAND;
125 s->state = S_GENERAL;
126 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
127 return segmenter_push (s, input, n, eof, type);
131 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
132 const char *input, size_t n, bool eof,
133 enum segment_type *type)
135 assert (s->state == S_GENERAL);
141 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
145 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
147 for (; ofs < n; ofs++)
149 if (input[ofs] == '\n')
151 else if (input[ofs] == '*')
154 return eof ? ofs + 1 : -1;
155 else if (input[ofs + 1] == '/')
159 return eof ? ofs : -1;
163 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
170 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
177 return eof ? ofs : -1;
178 else if (input[ofs + 1] != '*')
181 ofs = skip_comment (input, n, eof, ofs + 2);
185 else if (lex_uc_is_space (uc) && uc != '\n')
191 return eof ? ofs : -1;
195 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
199 else if (input[ofs] == '\n')
201 else if (input[ofs] == '\r')
205 return input[ofs + 1] == '\n';
212 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
214 ofs = skip_spaces_and_comments (input, n, eof, ofs);
218 return is_end_of_line (input, n, eof, ofs);
222 is_all_spaces (const char *input_, size_t n)
224 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
227 for (int ofs = 0; ofs < n; ofs += mblen)
230 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
231 if (!lex_uc_is_space (uc))
238 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
239 enum segment_type *type)
243 if (input[0] == '\n')
253 assert (input[0] == '\r');
254 assert (input[1] == '\n');
263 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
270 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
274 if (!lex_uc_is_space (uc) || uc == '\n')
280 return eof ? ofs : -1;
284 skip_digits (const char *input, size_t n, bool eof, int ofs)
286 for (; ofs < n; ofs++)
287 if (!c_isdigit (input[ofs]))
289 return eof ? ofs : -1;
293 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
294 bool eof, enum segment_type *type, int ofs)
296 assert (s->state == S_GENERAL);
298 ofs = skip_digits (input, n, eof, ofs);
308 if (input[ofs] == '.')
317 ofs = skip_digits (input, n, eof, ofs + 1);
324 if (input[ofs] == 'e' || input[ofs] == 'E')
331 goto expected_exponent;
334 if (input[ofs] == '+' || input[ofs] == '-')
341 goto expected_exponent;
345 if (!c_isdigit (input[ofs]))
346 goto expected_exponent;
348 ofs = skip_digits (input, n, eof, ofs);
353 if (input[ofs - 1] == '.')
355 int eol = at_end_of_line (input, n, eof, ofs);
368 *type = SEG_EXPECTED_EXPONENT;
374 is_reserved_word (const char *s, int n)
378 s0 = c_toupper (s[0]);
382 s1 = c_toupper (s[1]);
383 return ((s0 == 'B' && s1 == 'Y')
384 || (s0 == 'E' && s1 == 'Q')
385 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
386 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
387 || (s0 == 'N' && s1 == 'E')
388 || (s0 == 'O' && s1 == 'R')
389 || (s0 == 'T' && s1 == 'O'));
392 s1 = c_toupper (s[1]);
393 s2 = c_toupper (s[2]);
394 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
395 || (s1 == 'N' && s2 == 'D')))
396 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
399 s1 = c_toupper (s[1]);
400 s2 = c_toupper (s[2]);
401 s3 = c_toupper (s[3]);
402 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
410 segmenter_parse_comment_1__ (struct segmenter *s,
411 const char *input, size_t n, bool eof,
412 enum segment_type *type)
424 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
435 if (ofs > 1 && input[ofs - 1] == '\r')
439 /* Blank line ends comment command. */
440 s->state = S_GENERAL;
441 s->substate = SS_START_OF_COMMAND;
442 *type = SEG_SEPARATE_COMMANDS;
445 else if (endcmd >= 0)
447 /* '.' at end of line ends comment command. */
448 s->state = S_GENERAL;
450 *type = SEG_COMMENT_COMMAND;
455 /* Comment continues onto next line. */
456 *type = SEG_COMMENT_COMMAND;
457 s->state = S_COMMENT_2;
463 if (!lex_uc_is_space (uc))
474 s->state = S_GENERAL;
475 s->substate = SS_START_OF_COMMAND;
476 *type = SEG_SEPARATE_COMMANDS;
484 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
485 size_t n, bool eof, enum segment_type *type)
487 int ofs = segmenter_parse_newline__ (input, n, eof, type);
501 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
505 if (uc == '+' || uc == '-' || uc == '.')
507 else if (!lex_uc_is_space (uc))
510 case SEG_MODE_INTERACTIVE:
519 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
533 s->state = S_GENERAL;
534 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
537 s->state = S_COMMENT_1;
542 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
543 bool eof, enum segment_type *type)
555 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
566 if (ofs > 1 && input[ofs - 1] == '\r')
569 *type = SEG_DOCUMENT;
570 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
574 if (!lex_uc_is_space (uc))
583 *type = SEG_DOCUMENT;
584 s->state = S_DOCUMENT_3;
591 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
592 bool eof, enum segment_type *type)
596 ofs = segmenter_parse_newline__ (input, n, eof, type);
600 s->state = S_DOCUMENT_1;
605 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
607 *type = SEG_END_COMMAND;
608 s->state = S_GENERAL;
609 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
614 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
617 ofs = skip_spaces_and_comments (input, n, eof, ofs);
623 return c != '\'' && c != '"' && c != '\n';
633 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
634 bool eof, int ofs, char id[], size_t id_size)
636 struct segmenter sub;
638 assert (id_size > 0);
641 sub.state = S_GENERAL;
645 enum segment_type type;
648 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
664 if (retval < id_size)
666 memcpy (id, input + ofs, retval);
673 case SEG_QUOTED_STRING:
675 case SEG_UNICODE_STRING:
676 case SEG_UNQUOTED_STRING:
677 case SEG_RESERVED_WORD:
679 case SEG_COMMENT_COMMAND:
680 case SEG_DO_REPEAT_COMMAND:
681 case SEG_INLINE_DATA:
684 case SEG_START_DOCUMENT:
686 case SEG_START_COMMAND:
687 case SEG_SEPARATE_COMMANDS:
688 case SEG_END_COMMAND:
690 case SEG_EXPECTED_QUOTE:
691 case SEG_EXPECTED_EXPONENT:
692 case SEG_UNEXPECTED_CHAR:
700 /* Called when INPUT begins with a character that can start off an ID token. */
702 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
703 bool eof, enum segment_type *type)
709 assert (s->state == S_GENERAL);
711 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
723 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
726 else if (!lex_uc_is_idn (uc))
732 if (input[ofs - 1] == '.')
734 int eol = at_end_of_line (input, n, eof, ofs);
741 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
742 : input[0] == '!' ? SEG_MACRO_ID
745 if (s->substate & SS_START_OF_COMMAND)
747 struct substring word = ss_buffer (input, ofs);
749 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
751 s->state = S_COMMENT_1;
752 return segmenter_parse_comment_1__ (s, input, n, eof, type);
754 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
756 s->state = S_DOCUMENT_1;
757 *type = SEG_START_DOCUMENT;
760 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
762 s->state = S_DEFINE_1;
765 else if (lex_id_match (ss_cstr ("FILE"), word))
769 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
771 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
773 s->state = S_FILE_LABEL_1;
778 else if (lex_id_match (ss_cstr ("DO"), word))
782 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
784 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
786 s->state = S_DO_REPEAT_1;
791 else if (lex_id_match (ss_cstr ("BEGIN"), word))
796 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
799 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
803 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
809 else if (input[ofs2] == '.')
811 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
816 eol = is_end_of_line (input, n, eof, ofs2);
821 if (memchr (input, '\n', ofs2))
822 s->state = S_BEGIN_DATA_1;
824 s->state = S_BEGIN_DATA_2;
837 segmenter_parse_string__ (enum segment_type string_type,
838 int ofs, struct segmenter *s,
839 const char *input, size_t n, bool eof,
840 enum segment_type *type)
842 int quote = input[ofs];
846 if (input[ofs] == quote)
851 if (input[ofs] == quote)
864 else if (input[ofs] == '\n')
875 *type = SEG_EXPECTED_QUOTE;
881 segmenter_maybe_parse_string__ (enum segment_type string_type,
883 const char *input, size_t n, bool eof,
884 enum segment_type *type)
891 else if (input[1] == '\'' || input[1] == '"')
892 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
894 return segmenter_parse_id__ (s, input, n, eof, type);
898 segmenter_parse_mid_command__ (struct segmenter *s,
899 const char *input, size_t n, bool eof,
900 enum segment_type *type)
906 assert (s->state == S_GENERAL);
907 assert (!(s->substate & SS_START_OF_LINE));
909 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
916 s->substate |= SS_START_OF_LINE;
926 else if (input[1] == '*')
928 ofs = skip_comment (input, n, eof, 2);
941 ofs = skip_spaces (input, n, eof, 1);
944 else if (c_isdigit (input[ofs]))
945 return segmenter_parse_number__ (s, input, n, eof, type, ofs);
946 else if (input[ofs] == '.')
953 else if (c_isdigit (input[ofs + 1]))
954 return segmenter_parse_number__ (s, input, n, eof, type, ofs);
957 case '(': case ')': case ',': case '=':
958 case '[': case ']': case '&': case '|': case '+':
964 if (s->substate & SS_START_OF_COMMAND)
966 /* '*' at the beginning of a command begins a comment. */
967 s->state = S_COMMENT_1;
968 return segmenter_parse_comment_1__ (s, input, n, eof, type);
971 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
974 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
977 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
980 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
988 else if (c_isdigit (input[1]))
989 return segmenter_parse_number__ (s, input, n, eof, type, 0);
991 int eol = at_end_of_line (input, n, eof, 1);
997 *type = SEG_END_COMMAND;
998 s->substate = SS_START_OF_COMMAND;
1004 case '0': case '1': case '2': case '3': case '4':
1005 case '5': case '6': case '7': case '8': case '9':
1006 return segmenter_parse_number__ (s, input, n, eof, type, 0);
1009 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
1010 s, input, n, eof, type);
1013 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1014 s, input, n, eof, type);
1016 case '\'': case '"':
1017 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1018 s, input, n, eof, type);
1021 return segmenter_parse_id__ (s, input, n, eof, type);
1024 if (lex_uc_is_space (uc))
1026 ofs = skip_spaces (input, n, eof, mblen);
1030 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1034 s->substate |= SS_START_OF_LINE;
1035 *type = SEG_NEWLINE;
1044 else if (lex_uc_is_id1 (uc))
1045 return segmenter_parse_id__ (s, input, n, eof, type);
1046 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1054 *type = SEG_UNEXPECTED_CHAR;
1062 compare_commands (const void *a_, const void *b_)
1064 const char *const *ap = a_;
1065 const char *const *bp = b_;
1066 const char *a = *ap;
1067 const char *b = *bp;
1069 return c_strcasecmp (a, b);
1072 static const char **
1073 segmenter_get_command_name_candidates (unsigned char first)
1075 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1076 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1077 static const char *commands[] =
1079 #include "language/command.def"
1082 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1088 static const char **cindex[UCHAR_MAX + 1];
1096 qsort (commands, n_commands, sizeof *commands, compare_commands);
1097 for (i = 0; i < n_commands; i++)
1099 unsigned char c = c_toupper (commands[i][0]);
1100 if (cindex[c] == NULL)
1101 cindex[c] = &commands[i];
1103 for (i = 0; i <= UCHAR_MAX; i++)
1104 if (cindex[i] == NULL)
1105 cindex[i] = &commands[n_commands];
1108 return cindex[c_toupper (first)];
1112 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1115 const char **commands;
1132 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1137 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1145 if (input[ofs - 1] == '.')
1148 for (commands = segmenter_get_command_name_candidates (input[0]);
1149 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1155 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1156 &exact, &missing_words)
1157 && missing_words <= 0)
1165 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1168 return eof ? 0 : -1;
1171 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1174 return eof ? 0 : -1;
1176 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1179 return c == '\'' || c == '"' || c == '\n';
1183 segmenter_parse_start_of_line__ (struct segmenter *s,
1184 const char *input, size_t n, bool eof,
1185 enum segment_type *type)
1191 assert (s->state == S_GENERAL);
1192 assert (s->substate & SS_START_OF_LINE);
1194 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1201 ofs = skip_spaces_and_comments (input, n, eof, 1);
1206 int is_string = is_start_of_string__ (input, n, eof, ofs);
1211 /* This is punctuation that may separate pieces of a string. */
1221 *type = SEG_START_COMMAND;
1222 s->substate = SS_START_OF_COMMAND;
1226 if (lex_uc_is_space (uc))
1228 int eol = at_end_of_line (input, n, eof, 0);
1233 s->substate = SS_START_OF_COMMAND;
1234 *type = SEG_SEPARATE_COMMANDS;
1240 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1242 else if (s->mode == SEG_MODE_AUTO)
1244 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1251 assert (s->mode == SEG_MODE_BATCH);
1253 s->substate = SS_START_OF_COMMAND;
1254 *type = SEG_START_COMMAND;
1258 s->substate = SS_START_OF_COMMAND;
1259 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1263 segmenter_parse_file_label_1__ (struct segmenter *s,
1264 const char *input, size_t n, bool eof,
1265 enum segment_type *type)
1267 struct segmenter sub;
1271 sub.state = S_GENERAL;
1272 ofs = segmenter_push (&sub, input, n, eof, type);
1276 else if (*type == SEG_IDENTIFIER)
1280 assert (lex_id_match (ss_cstr ("LABEL"),
1281 ss_buffer ((char *) input, ofs)));
1282 result = segmenter_unquoted (input, n, eof, ofs);
1288 s->state = S_FILE_LABEL_2;
1296 s->substate = sub.substate;
1302 segmenter_parse_file_label_2__ (struct segmenter *s,
1303 const char *input, size_t n, bool eof,
1304 enum segment_type *type)
1308 ofs = skip_spaces (input, n, eof, 0);
1311 s->state = S_FILE_LABEL_3;
1317 segmenter_parse_file_label_3__ (struct segmenter *s,
1318 const char *input, size_t n, bool eof,
1319 enum segment_type *type)
1331 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1345 if (!lex_uc_is_space (uc))
1356 s->state = S_GENERAL;
1358 *type = SEG_UNQUOTED_STRING;
1359 return endcmd >= 0 ? endcmd : ofs;
1366 segmenter_subparse (struct segmenter *s,
1367 const char *input, size_t n, bool eof,
1368 enum segment_type *type)
1370 struct segmenter sub;
1374 sub.state = S_GENERAL;
1375 sub.substate = s->substate;
1376 ofs = segmenter_push (&sub, input, n, eof, type);
1377 s->substate = sub.substate;
1381 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1382 defines the stand-in variables (the head) before the lines of syntax to be
1383 repeated (the body). */
1385 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1386 const char *input, size_t n, bool eof,
1387 enum segment_type *type)
1389 int ofs = segmenter_subparse (s, input, n, eof, type);
1393 if (*type == SEG_SEPARATE_COMMANDS)
1395 /* We reached a blank line that separates the head from the body. */
1396 s->state = S_DO_REPEAT_2;
1398 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1400 /* We reached the body. */
1401 s->state = S_DO_REPEAT_3;
1408 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1409 separates the head from the body. */
1411 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1412 const char *input, size_t n, bool eof,
1413 enum segment_type *type)
1415 int ofs = segmenter_subparse (s, input, n, eof, type);
1419 if (*type == SEG_NEWLINE)
1421 /* We reached the body. */
1422 s->state = S_DO_REPEAT_3;
1430 check_repeat_command (struct segmenter *s,
1431 const char *input, size_t n, bool eof)
1438 if (input[ofs] == '+' || input[ofs] == '-')
1441 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1444 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1446 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1451 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1455 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1456 s->substate += direction;
1461 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1462 enum segment_type *type)
1464 const char *newline = memchr (input, '\n', n);
1466 return eof ? n : -1;
1468 ptrdiff_t ofs = newline - input;
1469 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1471 *type = SEG_NEWLINE;
1475 return ofs - (input[ofs - 1] == '\r');
1478 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1479 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1481 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1482 the lines we're segmenting. s->substate counts the nesting level, starting
1485 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1486 const char *input, size_t n, bool eof,
1487 enum segment_type *type)
1491 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1492 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1494 else if (!check_repeat_command (s, input, n, eof) && !eof)
1496 else if (s->substate == 0)
1498 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1500 s->state = S_GENERAL;
1501 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1502 return segmenter_push (s, input, n, eof, type);
1506 *type = SEG_DO_REPEAT_COMMAND;
1511 /* We are segmenting a DEFINE command, which consists of:
1513 - The DEFINE keyword.
1517 - "(" followed by a sequence of tokens possibly including balanced parentheses
1520 - A sequence of any number of lines, one string per line, ending with
1521 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1522 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1523 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1527 segmenter_parse_define_1__ (struct segmenter *s,
1528 const char *input, size_t n, bool eof,
1529 enum segment_type *type)
1531 int ofs = segmenter_subparse (s, input, n, eof, type);
1535 if (*type == SEG_SEPARATE_COMMANDS
1536 || *type == SEG_END_COMMAND
1537 || *type == SEG_START_COMMAND)
1539 /* The DEFINE command is malformed because we reached its end without
1540 ever hitting a "(" token. Transition back to general parsing. */
1541 s->state = S_GENERAL;
1544 else if (*type == SEG_PUNCT && input[0] == '(')
1546 s->state = S_DEFINE_2;
1555 segmenter_parse_define_2__ (struct segmenter *s,
1556 const char *input, size_t n, bool eof,
1557 enum segment_type *type)
1559 int ofs = segmenter_subparse (s, input, n, eof, type);
1563 if (*type == SEG_SEPARATE_COMMANDS
1564 || *type == SEG_END_COMMAND
1565 || *type == SEG_START_COMMAND)
1567 /* The DEFINE command is malformed because we reached its end before
1568 closing the set of parentheses. Transition back to general
1570 s->state = S_GENERAL;
1573 else if (*type == SEG_PUNCT && input[0] == '(')
1578 else if (*type == SEG_PUNCT && input[0] == ')')
1583 s->state = S_DEFINE_3;
1593 find_enddefine (struct substring input)
1595 size_t n = input.length;
1596 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1599 /* Skip !ENDDEFINE in comment. */
1600 ofs = skip_spaces_and_comments (input.string, n, true, ofs);
1601 if (ofs + enddefine.length > n)
1604 char c = input.string[ofs];
1606 && ss_equals_case (ss_substr (input, ofs, enddefine.length),
1609 else if (c == '\'' || c == '"')
1611 /* Skip quoted !ENDDEFINE. */
1617 else if (input.string[ofs++] == c)
1626 /* We are in the body of a macro definition, looking for additional lines of
1627 the body or !ENDDEFINE. */
1629 segmenter_parse_define_3__ (struct segmenter *s,
1630 const char *input, size_t n, bool eof,
1631 enum segment_type *type)
1633 /* Gather a whole line. */
1634 const char *newline = memchr (input, '\n', n);
1635 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1641 /* Does the line contain !ENDDEFINE? */
1642 size_t end = find_enddefine (ss_buffer (input, ofs));
1643 if (end == SIZE_MAX)
1645 /* No !ENDDEFINE. We have a full line of macro body.
1647 The line might be blank, whether completely empty or just spaces and
1648 comments. That's OK: we need to report blank lines because they can
1651 However, if the first line of the macro body (the same line as the
1652 closing parenthesis in the argument definition) is blank, we just
1653 report it as spaces because it's not significant. */
1654 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1655 ? SEG_SPACES : SEG_MACRO_BODY);
1656 s->state = S_DEFINE_4;
1662 /* Macro ends at the !ENDDEFINE on this line. */
1663 s->state = S_GENERAL;
1667 /* Line starts with !ENDDEFINE. */
1668 return segmenter_push (s, input, n, eof, type);
1672 if (is_all_spaces (input, end))
1674 /* Line starts with spaces followed by !ENDDEFINE. */
1679 /* Line starts with some content followed by !ENDDEFINE. */
1680 *type = SEG_MACRO_BODY;
1688 segmenter_parse_define_4__ (struct segmenter *s,
1689 const char *input, size_t n, bool eof,
1690 enum segment_type *type)
1692 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1696 s->state = S_DEFINE_3;
1701 segmenter_parse_begin_data_1__ (struct segmenter *s,
1702 const char *input, size_t n, bool eof,
1703 enum segment_type *type)
1705 int ofs = segmenter_subparse (s, input, n, eof, type);
1709 if (*type == SEG_NEWLINE)
1710 s->state = S_BEGIN_DATA_2;
1716 segmenter_parse_begin_data_2__ (struct segmenter *s,
1717 const char *input, size_t n, bool eof,
1718 enum segment_type *type)
1720 int ofs = segmenter_subparse (s, input, n, eof, type);
1724 if (*type == SEG_NEWLINE)
1725 s->state = S_BEGIN_DATA_3;
1731 is_end_data (const char *input, size_t n)
1733 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1739 if (n < 4 || c_strncasecmp (input, "END", 3))
1743 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1744 if (!lex_uc_is_space (uc))
1748 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1755 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1762 else if (!lex_uc_is_space (uc))
1771 segmenter_parse_begin_data_3__ (struct segmenter *s,
1772 const char *input, size_t n, bool eof,
1773 enum segment_type *type)
1777 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1780 else if (is_end_data (input, ofs))
1782 s->state = S_GENERAL;
1783 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1784 return segmenter_push (s, input, n, eof, type);
1788 *type = SEG_INLINE_DATA;
1789 s->state = S_BEGIN_DATA_4;
1790 return input[ofs - 1] == '\n' ? 0 : ofs;
1795 segmenter_parse_begin_data_4__ (struct segmenter *s,
1796 const char *input, size_t n, bool eof,
1797 enum segment_type *type)
1801 ofs = segmenter_parse_newline__ (input, n, eof, type);
1805 s->state = S_BEGIN_DATA_3;
1809 /* Returns the name of segment TYPE as a string. The caller must not modify
1810 or free the returned string.
1812 This is useful only for debugging and testing. */
1814 segment_type_to_string (enum segment_type type)
1818 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1822 return "unknown segment type";
1826 /* Returns a segmenter with the given syntax MODE.
1828 If IS_SNIPPET is false, then the segmenter will parse as if it's being given
1829 a whole file. This means, for example, that it will interpret - or + at the
1830 beginning of the syntax as a separator between commands (since - or + at the
1831 beginning of a line has this meaning).
1833 If IS_SNIPPET is true, then the segmenter will parse as if it's being given
1834 an isolated piece of syntax. This means, for example, that it will
1835 interpret - or + at the beginning of the syntax as an operator token or (if
1836 followed by a digit) as part of a number.
1838 A segmenter does not contain any external references, so nothing needs to be
1839 done to destroy one. For the same reason, segmenters may be copied with
1840 plain struct assignment (or memcpy). */
1842 segmenter_init (enum segmenter_mode mode, bool is_snippet)
1844 return (struct segmenter) {
1845 .state = is_snippet ? S_GENERAL : S_SHBANG,
1850 /* Returns the mode passed to segmenter_init() for S. */
1852 segmenter_get_mode (const struct segmenter *s)
1857 /* Attempts to label a prefix of S's remaining input with a segment type. The
1858 caller supplies the first N bytes of the remaining input as INPUT, which
1859 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1860 are the entire (remainder of the) input; if EOF is false, then further input
1861 is potentially available.
1863 The input may contain '\n' or '\r\n' line ends in any combination.
1865 If successful, returns the number of bytes in the segment at the beginning
1866 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1867 into *TYPE. The next call to segmenter_push() should not include those
1868 bytes as part of INPUT, because they have (figuratively) been consumed by
1871 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1872 be determined. In this case segmenter_push() returns -1. If more input is
1873 available, the caller should obtain some more, then call again with a larger
1874 N. If this is not enough, the process might need to repeat again and again.
1875 If input is exhausted, then the caller may call again setting EOF to true.
1876 segmenter_push() will never return -1 when EOF is true.
1878 The caller must not, in a sequence of calls, supply contradictory input.
1879 That is, bytes provided as part of INPUT in one call, but not consumed, must
1880 not be provided with *different* values on subsequent calls. This is
1881 because segmenter_push() must often make decisions based on looking ahead
1882 beyond the bytes that it consumes. */
1884 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1885 enum segment_type *type)
1901 return segmenter_parse_shbang__ (s, input, n, eof, type);
1904 return (s->substate & SS_START_OF_LINE
1905 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1906 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1909 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1911 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1914 return segmenter_parse_document_1__ (s, input, n, eof, type);
1916 return segmenter_parse_document_2__ (s, input, n, eof, type);
1918 return segmenter_parse_document_3__ (s, type);
1920 case S_FILE_LABEL_1:
1921 return segmenter_parse_file_label_1__ (s, input, n, eof, type);
1922 case S_FILE_LABEL_2:
1923 return segmenter_parse_file_label_2__ (s, input, n, eof, type);
1924 case S_FILE_LABEL_3:
1925 return segmenter_parse_file_label_3__ (s, input, n, eof, type);
1928 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1930 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1932 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1935 return segmenter_parse_define_1__ (s, input, n, eof, type);
1937 return segmenter_parse_define_2__ (s, input, n, eof, type);
1939 return segmenter_parse_define_3__ (s, input, n, eof, type);
1941 return segmenter_parse_define_4__ (s, input, n, eof, type);
1943 case S_BEGIN_DATA_1:
1944 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1945 case S_BEGIN_DATA_2:
1946 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1947 case S_BEGIN_DATA_3:
1948 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1949 case S_BEGIN_DATA_4:
1950 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1956 /* Returns the style of command prompt to display to an interactive user for
1957 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1958 and at the beginning of a line (that is, if segmenter_push() consumed as
1959 much as possible of the input up to a new-line). */
1961 segmenter_get_prompt (const struct segmenter *s)
1966 return PROMPT_FIRST;
1969 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1973 return PROMPT_COMMENT;
1977 return PROMPT_DOCUMENT;
1979 return PROMPT_FIRST;
1981 case S_FILE_LABEL_1:
1982 return PROMPT_LATER;
1983 case S_FILE_LABEL_2:
1984 case S_FILE_LABEL_3:
1985 return PROMPT_FIRST;
1989 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1991 return PROMPT_DO_REPEAT;
1995 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1998 return PROMPT_DEFINE;
2000 case S_BEGIN_DATA_1:
2001 return PROMPT_FIRST;
2002 case S_BEGIN_DATA_2:
2003 return PROMPT_LATER;
2004 case S_BEGIN_DATA_3:
2005 case S_BEGIN_DATA_4: