1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/segment.h"
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
31 #include "gl/verify.h"
58 /* S_SHBANG is the start state that SEGMENTER_INIT refers to as just 0. */
59 verify (S_SHBANG == 0);
61 #define SS_START_OF_LINE (1u << 0)
62 #define SS_START_OF_COMMAND (1u << 1)
64 static int segmenter_detect_command_name__ (const char *input,
65 size_t n, bool eof, int ofs);
68 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
71 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
79 mblen = u8_mbtoucr (puc, input, n);
83 return u8_mbtouc (puc, input, n);
94 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
95 bool eof, enum segment_type *type)
103 for (int ofs = 2; ; ofs++)
110 else if (input[ofs] == '\n')
112 if (input[ofs - 1] == '\r')
118 s->state = S_GENERAL;
119 s->substate = SS_START_OF_COMMAND;
129 s->state = S_GENERAL;
130 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
131 return segmenter_push (s, input, n, eof, type);
135 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
136 const char *input, size_t n, bool eof,
137 enum segment_type *type)
139 assert (s->state == S_GENERAL);
145 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
149 skip_comment (const char *input, size_t n, bool eof, size_t ofs)
151 for (; ofs < n; ofs++)
153 if (input[ofs] == '\n')
155 else if (input[ofs] == '*')
158 return eof ? ofs + 1 : -1;
159 else if (input[ofs + 1] == '/')
163 return eof ? ofs : -1;
167 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
174 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
181 return eof ? ofs : -1;
182 else if (input[ofs + 1] != '*')
185 ofs = skip_comment (input, n, eof, ofs + 2);
189 else if (lex_uc_is_space (uc) && uc != '\n')
195 return eof ? ofs : -1;
199 is_end_of_line (const char *input, size_t n, bool eof, int ofs)
203 else if (input[ofs] == '\n')
205 else if (input[ofs] == '\r')
209 return input[ofs + 1] == '\n';
216 at_end_of_line (const char *input, size_t n, bool eof, int ofs)
218 ofs = skip_spaces_and_comments (input, n, eof, ofs);
222 return is_end_of_line (input, n, eof, ofs);
226 is_all_spaces (const char *input_, size_t n)
228 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
231 for (int ofs = 0; ofs < n; ofs += mblen)
234 mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
235 if (!lex_uc_is_space (uc))
242 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
243 enum segment_type *type)
247 if (input[0] == '\n')
257 assert (input[0] == '\r');
258 assert (input[1] == '\n');
267 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
274 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
278 if (!lex_uc_is_space (uc) || uc == '\n')
284 return eof ? ofs : -1;
288 skip_digits (const char *input, size_t n, bool eof, int ofs)
290 for (; ofs < n; ofs++)
291 if (!c_isdigit (input[ofs]))
293 return eof ? ofs : -1;
297 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
298 bool eof, enum segment_type *type)
302 assert (s->state == S_GENERAL);
304 ofs = skip_digits (input, n, eof, 0);
314 if (input[ofs] == '.')
323 ofs = skip_digits (input, n, eof, ofs + 1);
330 if (input[ofs] == 'e' || input[ofs] == 'E')
337 goto expected_exponent;
340 if (input[ofs] == '+' || input[ofs] == '-')
347 goto expected_exponent;
351 if (!c_isdigit (input[ofs]))
352 goto expected_exponent;
354 ofs = skip_digits (input, n, eof, ofs);
359 if (input[ofs - 1] == '.')
361 int eol = at_end_of_line (input, n, eof, ofs);
374 *type = SEG_EXPECTED_EXPONENT;
380 is_reserved_word (const char *s, int n)
384 s0 = c_toupper (s[0]);
388 s1 = c_toupper (s[1]);
389 return ((s0 == 'B' && s1 == 'Y')
390 || (s0 == 'E' && s1 == 'Q')
391 || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
392 || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
393 || (s0 == 'N' && s1 == 'E')
394 || (s0 == 'O' && s1 == 'R')
395 || (s0 == 'T' && s1 == 'O'));
398 s1 = c_toupper (s[1]);
399 s2 = c_toupper (s[2]);
400 return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
401 || (s1 == 'N' && s2 == 'D')))
402 || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
405 s1 = c_toupper (s[1]);
406 s2 = c_toupper (s[2]);
407 s3 = c_toupper (s[3]);
408 return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
416 segmenter_parse_comment_1__ (struct segmenter *s,
417 const char *input, size_t n, bool eof,
418 enum segment_type *type)
430 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
441 if (ofs > 1 && input[ofs - 1] == '\r')
445 /* Blank line ends comment command. */
446 s->state = S_GENERAL;
447 s->substate = SS_START_OF_COMMAND;
448 *type = SEG_SEPARATE_COMMANDS;
451 else if (endcmd >= 0)
453 /* '.' at end of line ends comment command. */
454 s->state = S_GENERAL;
456 *type = SEG_COMMENT_COMMAND;
461 /* Comment continues onto next line. */
462 *type = SEG_COMMENT_COMMAND;
463 s->state = S_COMMENT_2;
469 if (!lex_uc_is_space (uc))
480 s->state = S_GENERAL;
481 s->substate = SS_START_OF_COMMAND;
482 *type = SEG_SEPARATE_COMMANDS;
490 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
491 size_t n, bool eof, enum segment_type *type)
493 int ofs = segmenter_parse_newline__ (input, n, eof, type);
507 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
511 if (uc == '+' || uc == '-' || uc == '.')
513 else if (!lex_uc_is_space (uc))
516 case SEG_MODE_INTERACTIVE:
525 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
539 s->state = S_GENERAL;
540 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
543 s->state = S_COMMENT_1;
548 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
549 bool eof, enum segment_type *type)
561 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
572 if (ofs > 1 && input[ofs - 1] == '\r')
575 *type = SEG_DOCUMENT;
576 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
580 if (!lex_uc_is_space (uc))
589 *type = SEG_DOCUMENT;
590 s->state = S_DOCUMENT_3;
597 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
598 bool eof, enum segment_type *type)
602 ofs = segmenter_parse_newline__ (input, n, eof, type);
606 s->state = S_DOCUMENT_1;
611 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
613 *type = SEG_END_COMMAND;
614 s->state = S_GENERAL;
615 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
620 segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
623 ofs = skip_spaces_and_comments (input, n, eof, ofs);
629 return c != '\'' && c != '"' && c != '\n';
639 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
640 bool eof, int ofs, char id[], size_t id_size)
642 struct segmenter sub;
644 assert (id_size > 0);
647 sub.state = S_GENERAL;
651 enum segment_type type;
654 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
670 if (retval < id_size)
672 memcpy (id, input + ofs, retval);
679 case SEG_QUOTED_STRING:
681 case SEG_UNICODE_STRING:
682 case SEG_UNQUOTED_STRING:
683 case SEG_RESERVED_WORD:
685 case SEG_COMMENT_COMMAND:
686 case SEG_DO_REPEAT_COMMAND:
687 case SEG_INLINE_DATA:
690 case SEG_START_DOCUMENT:
692 case SEG_START_COMMAND:
693 case SEG_SEPARATE_COMMANDS:
694 case SEG_END_COMMAND:
696 case SEG_EXPECTED_QUOTE:
697 case SEG_EXPECTED_EXPONENT:
698 case SEG_UNEXPECTED_CHAR:
706 /* Called when INPUT begins with a character that can start off an ID token. */
708 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
709 bool eof, enum segment_type *type)
715 assert (s->state == S_GENERAL);
717 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
729 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
732 else if (!lex_uc_is_idn (uc))
738 if (input[ofs - 1] == '.')
740 int eol = at_end_of_line (input, n, eof, ofs);
747 *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
748 : input[0] == '!' ? SEG_MACRO_ID
751 if (s->substate & SS_START_OF_COMMAND)
753 struct substring word = ss_buffer (input, ofs);
755 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
757 s->state = S_COMMENT_1;
758 return segmenter_parse_comment_1__ (s, input, n, eof, type);
760 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
762 s->state = S_DOCUMENT_1;
763 *type = SEG_START_DOCUMENT;
766 else if (lex_id_match (ss_cstr ("TITLE"), word)
767 || lex_id_match (ss_cstr ("SUBTITLE"), word))
769 int result = segmenter_unquoted (input, n, eof, ofs);
774 s->state = S_TITLE_1;
778 else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
780 s->state = S_DEFINE_1;
783 else if (lex_id_match (ss_cstr ("FILE"), word))
787 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
789 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
791 s->state = S_FILE_LABEL;
796 else if (lex_id_match (ss_cstr ("DO"), word))
800 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
802 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
804 s->state = S_DO_REPEAT_1;
809 else if (lex_id_match (ss_cstr ("BEGIN"), word))
814 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
817 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
821 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
827 else if (input[ofs2] == '.')
829 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
834 eol = is_end_of_line (input, n, eof, ofs2);
839 if (memchr (input, '\n', ofs2))
840 s->state = S_BEGIN_DATA_1;
842 s->state = S_BEGIN_DATA_2;
855 segmenter_parse_string__ (enum segment_type string_type,
856 int ofs, struct segmenter *s,
857 const char *input, size_t n, bool eof,
858 enum segment_type *type)
860 int quote = input[ofs];
864 if (input[ofs] == quote)
869 if (input[ofs] == quote)
882 else if (input[ofs] == '\n')
893 *type = SEG_EXPECTED_QUOTE;
899 segmenter_maybe_parse_string__ (enum segment_type string_type,
901 const char *input, size_t n, bool eof,
902 enum segment_type *type)
909 else if (input[1] == '\'' || input[1] == '"')
910 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
912 return segmenter_parse_id__ (s, input, n, eof, type);
916 segmenter_parse_mid_command__ (struct segmenter *s,
917 const char *input, size_t n, bool eof,
918 enum segment_type *type)
924 assert (s->state == S_GENERAL);
925 assert (!(s->substate & SS_START_OF_LINE));
927 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
934 s->substate |= SS_START_OF_LINE;
944 else if (input[1] == '*')
946 ofs = skip_comment (input, n, eof, 2);
958 case '(': case ')': case ',': case '=': case '-':
959 case '[': case ']': case '&': case '|': case '+':
965 if (s->substate & SS_START_OF_COMMAND)
967 /* '*' at the beginning of a command begins a comment. */
968 s->state = S_COMMENT_1;
969 return segmenter_parse_comment_1__ (s, input, n, eof, type);
972 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
975 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
978 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
981 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
989 else if (c_isdigit (input[1]))
990 return segmenter_parse_number__ (s, input, n, eof, type);
992 int eol = at_end_of_line (input, n, eof, 1);
998 *type = SEG_END_COMMAND;
999 s->substate = SS_START_OF_COMMAND;
1005 case '0': case '1': case '2': case '3': case '4':
1006 case '5': case '6': case '7': case '8': case '9':
1007 return segmenter_parse_number__ (s, input, n, eof, type);
1010 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
1011 s, input, n, eof, type);
1014 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
1015 s, input, n, eof, type);
1017 case '\'': case '"':
1018 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
1019 s, input, n, eof, type);
1022 return segmenter_parse_id__ (s, input, n, eof, type);
1025 if (lex_uc_is_space (uc))
1027 ofs = skip_spaces (input, n, eof, mblen);
1031 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
1035 s->substate |= SS_START_OF_LINE;
1036 *type = SEG_NEWLINE;
1045 else if (lex_uc_is_id1 (uc))
1046 return segmenter_parse_id__ (s, input, n, eof, type);
1047 else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
1055 *type = SEG_UNEXPECTED_CHAR;
1063 compare_commands (const void *a_, const void *b_)
1065 const char *const *ap = a_;
1066 const char *const *bp = b_;
1067 const char *a = *ap;
1068 const char *b = *bp;
1070 return c_strcasecmp (a, b);
1073 static const char **
1074 segmenter_get_command_name_candidates (unsigned char first)
1076 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1077 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1078 static const char *commands[] =
1080 #include "language/command.def"
1083 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1089 static const char **cindex[UCHAR_MAX + 1];
1097 qsort (commands, n_commands, sizeof *commands, compare_commands);
1098 for (i = 0; i < n_commands; i++)
1100 unsigned char c = c_toupper (commands[i][0]);
1101 if (cindex[c] == NULL)
1102 cindex[c] = &commands[i];
1104 for (i = 0; i <= UCHAR_MAX; i++)
1105 if (cindex[i] == NULL)
1106 cindex[i] = &commands[n_commands];
1109 return cindex[c_toupper (first)];
1113 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1116 const char **commands;
1133 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1138 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1146 if (input[ofs - 1] == '.')
1149 for (commands = segmenter_get_command_name_candidates (input[0]);
1150 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1156 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1157 &exact, &missing_words)
1158 && missing_words <= 0)
1166 is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
1169 return eof ? 0 : -1;
1172 if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
1175 return eof ? 0 : -1;
1177 return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
1180 return c == '\'' || c == '"' || c == '\n';
1184 segmenter_parse_start_of_line__ (struct segmenter *s,
1185 const char *input, size_t n, bool eof,
1186 enum segment_type *type)
1192 assert (s->state == S_GENERAL);
1193 assert (s->substate & SS_START_OF_LINE);
1195 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1202 ofs = skip_spaces_and_comments (input, n, eof, 1);
1207 int is_string = is_start_of_string__ (input, n, eof, ofs);
1212 /* This is punctuation that may separate pieces of a string. */
1222 *type = SEG_START_COMMAND;
1223 s->substate = SS_START_OF_COMMAND;
1227 if (lex_uc_is_space (uc))
1229 int eol = at_end_of_line (input, n, eof, 0);
1234 s->substate = SS_START_OF_COMMAND;
1235 *type = SEG_SEPARATE_COMMANDS;
1241 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1243 else if (s->mode == SEG_MODE_AUTO)
1245 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1252 assert (s->mode == SEG_MODE_BATCH);
1254 s->substate = SS_START_OF_COMMAND;
1255 *type = SEG_START_COMMAND;
1259 s->substate = SS_START_OF_COMMAND;
1260 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1264 segmenter_parse_file_label__ (struct segmenter *s,
1265 const char *input, size_t n, bool eof,
1266 enum segment_type *type)
1268 struct segmenter sub;
1272 sub.state = S_GENERAL;
1273 ofs = segmenter_push (&sub, input, n, eof, type);
1277 else if (*type == SEG_IDENTIFIER)
1281 assert (lex_id_match (ss_cstr ("LABEL"),
1282 ss_buffer ((char *) input, ofs)));
1283 result = segmenter_unquoted (input, n, eof, ofs);
1289 s->state = S_TITLE_1;
1297 s->substate = sub.substate;
1303 segmenter_subparse (struct segmenter *s,
1304 const char *input, size_t n, bool eof,
1305 enum segment_type *type)
1307 struct segmenter sub;
1311 sub.state = S_GENERAL;
1312 sub.substate = s->substate;
1313 ofs = segmenter_push (&sub, input, n, eof, type);
1314 s->substate = sub.substate;
1318 /* We are segmenting a DO REPEAT command, currently reading the syntax that
1319 defines the stand-in variables (the head) before the lines of syntax to be
1320 repeated (the body). */
1322 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1323 const char *input, size_t n, bool eof,
1324 enum segment_type *type)
1326 int ofs = segmenter_subparse (s, input, n, eof, type);
1330 if (*type == SEG_SEPARATE_COMMANDS)
1332 /* We reached a blank line that separates the head from the body. */
1333 s->state = S_DO_REPEAT_2;
1335 else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
1337 /* We reached the body. */
1338 s->state = S_DO_REPEAT_3;
1345 /* We are segmenting a DO REPEAT command, currently reading a blank line that
1346 separates the head from the body. */
1348 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1349 const char *input, size_t n, bool eof,
1350 enum segment_type *type)
1352 int ofs = segmenter_subparse (s, input, n, eof, type);
1356 if (*type == SEG_NEWLINE)
1358 /* We reached the body. */
1359 s->state = S_DO_REPEAT_3;
1367 check_repeat_command (struct segmenter *s,
1368 const char *input, size_t n, bool eof)
1375 if (input[ofs] == '+' || input[ofs] == '-')
1378 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1381 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1383 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1388 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1392 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1393 s->substate += direction;
1398 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1399 enum segment_type *type)
1401 const char *newline = memchr (input, '\n', n);
1403 return eof ? n : -1;
1405 ptrdiff_t ofs = newline - input;
1406 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1408 *type = SEG_NEWLINE;
1412 return ofs - (input[ofs - 1] == '\r');
1415 /* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
1416 be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
1418 DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
1419 the lines we're segmenting. s->substate counts the nesting level, starting
1422 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1423 const char *input, size_t n, bool eof,
1424 enum segment_type *type)
1428 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1429 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1431 else if (!check_repeat_command (s, input, n, eof) && !eof)
1433 else if (s->substate == 0)
1435 /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
1437 s->state = S_GENERAL;
1438 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1439 return segmenter_push (s, input, n, eof, type);
1443 *type = SEG_DO_REPEAT_COMMAND;
1448 /* We are segmenting a DEFINE command, which consists of:
1450 - The DEFINE keyword.
1454 - "(" followed by a sequence of tokens possibly including balanced parentheses
1457 - A sequence of any number of lines, one string per line, ending with
1458 "!ENDDEFINE". The first line is usually blank (that is, a newline follows
1459 the "("). The last line usually just has "!ENDDEFINE." on it, but it can
1460 start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
1464 segmenter_parse_define_1__ (struct segmenter *s,
1465 const char *input, size_t n, bool eof,
1466 enum segment_type *type)
1468 int ofs = segmenter_subparse (s, input, n, eof, type);
1472 if (*type == SEG_SEPARATE_COMMANDS
1473 || *type == SEG_END_COMMAND
1474 || *type == SEG_START_COMMAND)
1476 /* The DEFINE command is malformed because we reached its end without
1477 ever hitting a "(" token. Transition back to general parsing. */
1478 s->state = S_GENERAL;
1481 else if (*type == SEG_PUNCT && input[0] == '(')
1483 s->state = S_DEFINE_2;
1492 segmenter_parse_define_2__ (struct segmenter *s,
1493 const char *input, size_t n, bool eof,
1494 enum segment_type *type)
1496 int ofs = segmenter_subparse (s, input, n, eof, type);
1500 if (*type == SEG_SEPARATE_COMMANDS
1501 || *type == SEG_END_COMMAND
1502 || *type == SEG_START_COMMAND)
1504 /* The DEFINE command is malformed because we reached its end before
1505 closing the set of parentheses. Transition back to general
1507 s->state = S_GENERAL;
1510 else if (*type == SEG_PUNCT && input[0] == '(')
1515 else if (*type == SEG_PUNCT && input[0] == ')')
1520 s->state = S_DEFINE_3;
1530 find_enddefine (struct substring input)
1532 size_t n = input.length;
1533 const struct substring enddefine = ss_cstr ("!ENDDEFINE");
1534 for (size_t i = 0; i + enddefine.length <= n; i++)
1535 if (input.string[i] == '!'
1536 && ss_equals_case (ss_substr (input, i, enddefine.length), enddefine))
1541 /* We are in the body of a macro definition, looking for additional lines of
1542 the body or !ENDDEFINE. */
1544 segmenter_parse_define_3__ (struct segmenter *s,
1545 const char *input, size_t n, bool eof,
1546 enum segment_type *type)
1548 /* Gather a whole line. */
1549 const char *newline = memchr (input, '\n', n);
1550 int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
1556 /* Does the line contain !ENDDEFINE? */
1557 size_t end = find_enddefine (ss_buffer (input, ofs));
1558 if (end == SIZE_MAX)
1560 /* No !ENDDEFINE. We have a full line of macro body.
1562 The line might be blank, whether completely empty or just spaces and
1563 comments. That's OK: we need to report blank lines because they can
1566 However, if the first line of the macro body (the same line as the
1567 closing parenthesis in the argument definition) is blank, we just
1568 report it as spaces because it's not significant. */
1569 *type = (s->substate == 0 && is_all_spaces (input, ofs)
1570 ? SEG_SPACES : SEG_MACRO_BODY);
1571 s->state = S_DEFINE_4;
1577 /* Macro ends at the !ENDDEFINE on this line. */
1578 s->state = S_GENERAL;
1582 /* Line starts with !ENDDEFINE. */
1583 return segmenter_push (s, input, n, eof, type);
1587 if (is_all_spaces (input, end))
1589 /* Line starts with spaces followed by !ENDDEFINE. */
1594 /* Line starts with some content followed by !ENDDEFINE. */
1595 *type = SEG_MACRO_BODY;
1603 segmenter_parse_define_4__ (struct segmenter *s,
1604 const char *input, size_t n, bool eof,
1605 enum segment_type *type)
1607 int ofs = segmenter_parse_newline__ (input, n, eof, type);
1611 s->state = S_DEFINE_3;
1616 segmenter_parse_begin_data_1__ (struct segmenter *s,
1617 const char *input, size_t n, bool eof,
1618 enum segment_type *type)
1620 int ofs = segmenter_subparse (s, input, n, eof, type);
1624 if (*type == SEG_NEWLINE)
1625 s->state = S_BEGIN_DATA_2;
1631 segmenter_parse_begin_data_2__ (struct segmenter *s,
1632 const char *input, size_t n, bool eof,
1633 enum segment_type *type)
1635 int ofs = segmenter_subparse (s, input, n, eof, type);
1639 if (*type == SEG_NEWLINE)
1640 s->state = S_BEGIN_DATA_3;
1646 is_end_data (const char *input, size_t n)
1648 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1654 if (n < 4 || c_strncasecmp (input, "END", 3))
1658 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1659 if (!lex_uc_is_space (uc))
1663 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1670 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1677 else if (!lex_uc_is_space (uc))
1686 segmenter_parse_begin_data_3__ (struct segmenter *s,
1687 const char *input, size_t n, bool eof,
1688 enum segment_type *type)
1692 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1695 else if (is_end_data (input, ofs))
1697 s->state = S_GENERAL;
1698 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1699 return segmenter_push (s, input, n, eof, type);
1703 *type = SEG_INLINE_DATA;
1704 s->state = S_BEGIN_DATA_4;
1705 return input[ofs - 1] == '\n' ? 0 : ofs;
1710 segmenter_parse_begin_data_4__ (struct segmenter *s,
1711 const char *input, size_t n, bool eof,
1712 enum segment_type *type)
1716 ofs = segmenter_parse_newline__ (input, n, eof, type);
1720 s->state = S_BEGIN_DATA_3;
1725 segmenter_parse_title_1__ (struct segmenter *s,
1726 const char *input, size_t n, bool eof,
1727 enum segment_type *type)
1731 ofs = skip_spaces (input, n, eof, 0);
1734 s->state = S_TITLE_2;
1740 segmenter_parse_title_2__ (struct segmenter *s,
1741 const char *input, size_t n, bool eof,
1742 enum segment_type *type)
1754 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1768 if (!lex_uc_is_space (uc))
1779 s->state = S_GENERAL;
1781 *type = SEG_UNQUOTED_STRING;
1782 return endcmd >= 0 ? endcmd : ofs;
1788 /* Returns the name of segment TYPE as a string. The caller must not modify
1789 or free the returned string.
1791 This is useful only for debugging and testing. */
1793 segment_type_to_string (enum segment_type type)
1797 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1801 return "unknown segment type";
1805 /* Initializes S as a segmenter with the given syntax MODE.
1807 A segmenter does not contain any external references, so nothing needs to be
1808 done to destroy one. For the same reason, segmenters may be copied with
1809 plain struct assignment (or memcpy). */
1811 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1813 *s = (struct segmenter) SEGMENTER_INIT (mode);
1816 /* Returns the mode passed to segmenter_init() for S. */
1818 segmenter_get_mode (const struct segmenter *s)
1823 /* Attempts to label a prefix of S's remaining input with a segment type. The
1824 caller supplies the first N bytes of the remaining input as INPUT, which
1825 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1826 are the entire (remainder) of the input; if EOF is false, then further input
1827 is potentially available.
1829 The input may contain '\n' or '\r\n' line ends in any combination.
1831 If successful, returns the number of bytes in the segment at the beginning
1832 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1833 into *TYPE. The next call to segmenter_push() should not include those
1834 bytes as part of INPUT, because they have (figuratively) been consumed by
1837 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1838 be determined. In this case segmenter_push() returns -1. If more input is
1839 available, the caller should obtain some more, then call again with a larger
1840 N. If this is not enough, the process might need to repeat again and again.
1841 If input is exhausted, then the caller may call again setting EOF to true.
1842 segmenter_push() will never return -1 when EOF is true.
1844 The caller must not, in a sequence of calls, supply contradictory input.
1845 That is, bytes provided as part of INPUT in one call, but not consumed, must
1846 not be provided with *different* values on subsequent calls. This is
1847 because segmenter_push() must often make decisions based on looking ahead
1848 beyond the bytes that it consumes. */
1850 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1851 enum segment_type *type)
1867 return segmenter_parse_shbang__ (s, input, n, eof, type);
1870 return (s->substate & SS_START_OF_LINE
1871 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1872 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1875 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1877 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1880 return segmenter_parse_document_1__ (s, input, n, eof, type);
1882 return segmenter_parse_document_2__ (s, input, n, eof, type);
1884 return segmenter_parse_document_3__ (s, type);
1887 return segmenter_parse_file_label__ (s, input, n, eof, type);
1890 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1892 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1894 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1897 return segmenter_parse_define_1__ (s, input, n, eof, type);
1899 return segmenter_parse_define_2__ (s, input, n, eof, type);
1901 return segmenter_parse_define_3__ (s, input, n, eof, type);
1903 return segmenter_parse_define_4__ (s, input, n, eof, type);
1905 case S_BEGIN_DATA_1:
1906 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1907 case S_BEGIN_DATA_2:
1908 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1909 case S_BEGIN_DATA_3:
1910 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1911 case S_BEGIN_DATA_4:
1912 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1915 return segmenter_parse_title_1__ (s, input, n, eof, type);
1917 return segmenter_parse_title_2__ (s, input, n, eof, type);
1923 /* Returns the style of command prompt to display to an interactive user for
1924 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1925 and at the beginning of a line (that is, if segmenter_push() consumed as
1926 much as possible of the input up to a new-line). */
1928 segmenter_get_prompt (const struct segmenter *s)
1933 return PROMPT_FIRST;
1936 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1940 return PROMPT_COMMENT;
1944 return PROMPT_DOCUMENT;
1946 return PROMPT_FIRST;
1949 return PROMPT_LATER;
1953 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1955 return PROMPT_DO_REPEAT;
1959 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1962 return PROMPT_DEFINE;
1964 case S_BEGIN_DATA_1:
1965 return PROMPT_FIRST;
1966 case S_BEGIN_DATA_2:
1967 return PROMPT_LATER;
1968 case S_BEGIN_DATA_3:
1969 case S_BEGIN_DATA_4:
1974 return PROMPT_FIRST;