S_DO_REPEAT_1,
S_DO_REPEAT_2,
S_DO_REPEAT_3,
+ S_DEFINE_1,
+ S_DEFINE_2,
+ S_DEFINE_3,
+ S_DEFINE_4,
S_BEGIN_DATA_1,
S_BEGIN_DATA_2,
S_BEGIN_DATA_3,
{
if (input[1] == '!')
{
- int ofs;
-
- for (ofs = 2; ofs < n; ofs++)
- if (input[ofs] == '\n')
- {
- if (input[ofs] == '\n' && input[ofs - 1] == '\r')
- ofs--;
-
- s->state = S_GENERAL;
- s->substate = SS_START_OF_COMMAND;
- *type = SEG_SHBANG;
- return ofs;
- }
+ for (int ofs = 2; ; ofs++)
+ {
+ if (ofs >= n)
+ {
+ if (!eof)
+ return -1;
+ }
+ else if (input[ofs] == '\n')
+ {
+ if (input[ofs - 1] == '\r')
+ ofs--;
+ }
+ else
+ continue;
- return eof ? ofs : -1;
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_COMMAND;
+ *type = SEG_SHBANG;
+ return ofs;
+ }
}
}
else if (!eof)
return is_end_of_line (input, n, eof, ofs);
}
+/* Returns true if every character in the N bytes of UTF-8 text at INPUT_
+   satisfies lex_uc_is_space(), false otherwise.  Returns true if N is 0. */
+static bool
+is_all_spaces (const char *input_, size_t n)
+{
+  const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
+
+  int mblen;
+  for (size_t ofs = 0; ofs < n; ofs += mblen)
+    {
+      ucs4_t uc;
+
+      /* Decode the character that starts at OFS (not the one at the very
+         beginning of the buffer) and only let the decoder look at the bytes
+         that remain after OFS. */
+      mblen = u8_mbtouc (&uc, input + ofs, n - ofs);
+      if (!lex_uc_is_space (uc))
+        return false;
+    }
+  return true;
+}
+
static int
segmenter_parse_newline__ (const char *input, size_t n, bool eof,
enum segment_type *type)
if (!eof)
return -1;
goto number;
- };
+ }
if (input[ofs] == '.')
{
+ if (ofs + 1 >= n)
+ {
+ if (!eof)
+ return -1;
+ goto number;
+ }
+
ofs = skip_digits (input, n, eof, ofs + 1);
if (ofs < 0)
return -1;
+ else if (ofs >= n)
+ goto number;
}
- if (ofs >= n)
- {
- if (!eof)
- return -1;
- goto number;
- }
if (input[ofs] == 'e' || input[ofs] == 'E')
{
ofs++;
case SEG_COMMENT_COMMAND:
case SEG_DO_REPEAT_COMMAND:
case SEG_INLINE_DATA:
+ case SEG_MACRO_ID:
+ case SEG_MACRO_BODY:
case SEG_START_DOCUMENT:
case SEG_DOCUMENT:
case SEG_START_COMMAND:
case SEG_END:
case SEG_EXPECTED_QUOTE:
case SEG_EXPECTED_EXPONENT:
- case SEG_UNEXPECTED_DOT:
case SEG_UNEXPECTED_CHAR:
id[0] = '\0';
return ofs + retval;
ofs--;
}
- if (is_reserved_word (input, ofs))
- *type = SEG_RESERVED_WORD;
- else
- *type = SEG_IDENTIFIER;
+ *type = (is_reserved_word (input, ofs) ? SEG_RESERVED_WORD
+ : input[0] == '!' ? SEG_MACRO_ID
+ : SEG_IDENTIFIER);
if (s->substate & SS_START_OF_COMMAND)
{
return ofs;
}
}
+ else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
+ {
+ s->state = S_DEFINE_1;
+ return ofs;
+ }
else if (lex_id_match (ss_cstr ("FILE"), word))
{
char id[16];
s->substate = SS_START_OF_COMMAND;
}
else
- *type = SEG_UNEXPECTED_DOT;
+ *type = SEG_PUNCT;
return 1;
case '0': case '1': case '2': case '3': case '4':
return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
s, input, n, eof, type);
+ case '!':
+ return segmenter_parse_id__ (s, input, n, eof, type);
+
default:
if (lex_uc_is_space (uc))
{
}
else if (lex_uc_is_id1 (uc))
return segmenter_parse_id__ (s, input, n, eof, type);
+ else if (uc > 32 && uc < 127 && uc != '\\' && uc != '^')
+ {
+ *type = SEG_PUNCT;
+ s->substate = 0;
+ return 1;
+ }
else
{
*type = SEG_UNEXPECTED_CHAR;
return ofs;
}
+/* We are segmenting a DO REPEAT command, currently reading the syntax that
+ defines the stand-in variables (the head) before the lines of syntax to be
+ repeated (the body). */
static int
segmenter_parse_do_repeat_1__ (struct segmenter *s,
const char *input, size_t n, bool eof,
if (ofs < 0)
return -1;
- if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
- s->state = S_DO_REPEAT_2;
- else if (*type == SEG_END_COMMAND)
+ if (*type == SEG_SEPARATE_COMMANDS)
{
+ /* We reached a blank line that separates the head from the body. */
+ s->state = S_DO_REPEAT_2;
+ }
+ else if (*type == SEG_END_COMMAND || *type == SEG_START_COMMAND)
+ {
+ /* We reached the body. */
s->state = S_DO_REPEAT_3;
s->substate = 1;
}
return ofs;
}
+/* We are segmenting a DO REPEAT command, currently reading a blank line that
+ separates the head from the body. */
static int
segmenter_parse_do_repeat_2__ (struct segmenter *s,
const char *input, size_t n, bool eof,
if (*type == SEG_NEWLINE)
{
+ /* We reached the body. */
s->state = S_DO_REPEAT_3;
s->substate = 1;
}
return ofs - (input[ofs - 1] == '\r');
}
+/* We are in the body of DO REPEAT, segmenting the lines of syntax that are to
+ be repeated. Report each line of syntax as a single SEG_DO_REPEAT_COMMAND.
+
+ DO REPEAT can be nested, so we look for DO REPEAT...END REPEAT blocks inside
+ the lines we're segmenting. s->substate counts the nesting level, starting
+ at 1. */
static int
segmenter_parse_do_repeat_3__ (struct segmenter *s,
const char *input, size_t n, bool eof,
return -1;
else if (s->substate == 0)
{
+ /* Nesting level dropped to 0, so we've finished reading the DO REPEAT
+ body. */
s->state = S_GENERAL;
s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
return segmenter_push (s, input, n, eof, type);
}
}
+/* State S_DEFINE_1: segmenting the start of a DEFINE command.
+
+   A DEFINE command consists of:
+
+     - The DEFINE keyword.
+
+     - Anything but "(".
+
+     - "(" followed by a sequence of tokens, possibly including balanced
+       parentheses, up to a final ")".
+
+     - A sequence of any number of lines, one string per line, ending with
+       "!ENDDEFINE".  The first line is usually blank (that is, a newline
+       follows the "(").  The last line usually just has "!ENDDEFINE." on it,
+       but it can start with other tokens.  The whole DEFINE...!ENDDEFINE can
+       even be on a single line.
+
+   This function covers the part before the first "(".  It obtains the next
+   segment from segmenter_subparse() and then decides whether to stay in this
+   state, move on to S_DEFINE_2, or give up and return to S_GENERAL.
+
+   Returns the value from segmenter_subparse(), or -1 if that value was
+   negative. */
+static int
+segmenter_parse_define_1__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int len = segmenter_subparse (s, input, n, eof, type);
+  if (len < 0)
+    return -1;
+
+  switch (*type)
+    {
+    case SEG_SEPARATE_COMMANDS:
+    case SEG_END_COMMAND:
+    case SEG_START_COMMAND:
+      /* The DEFINE command is malformed because we reached its end without
+         ever hitting a "(" token.  Transition back to general parsing. */
+      s->state = S_GENERAL;
+      break;
+
+    case SEG_PUNCT:
+      if (input[0] == '(')
+        {
+          /* Opening parenthesis found: start the nesting count at 1. */
+          s->state = S_DEFINE_2;
+          s->nest = 1;
+        }
+      break;
+
+    default:
+      /* Any other segment: remain in this state. */
+      break;
+    }
+
+  return len;
+}
+
+/* State S_DEFINE_2: segmenting a DEFINE command after its first "(" and
+   before the matching ")".
+
+   Each segment comes from segmenter_subparse().  s->nest is incremented for
+   every "(" punctuation segment and decremented for every ")" one; when it
+   reaches zero the state becomes S_DEFINE_3.  If the command ends first, the
+   state goes back to S_GENERAL.
+
+   Returns the value from segmenter_subparse(), or -1 if that value was
+   negative. */
+static int
+segmenter_parse_define_2__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int len = segmenter_subparse (s, input, n, eof, type);
+  if (len < 0)
+    return -1;
+
+  switch (*type)
+    {
+    case SEG_SEPARATE_COMMANDS:
+    case SEG_END_COMMAND:
+    case SEG_START_COMMAND:
+      /* The DEFINE command is malformed because we reached its end before
+         closing the set of parentheses.  Transition back to general
+         parsing. */
+      s->state = S_GENERAL;
+      break;
+
+    case SEG_PUNCT:
+      if (input[0] == '(')
+        s->nest++;
+      else if (input[0] == ')')
+        {
+          s->nest--;
+          if (s->nest == 0)
+            s->state = S_DEFINE_3;
+        }
+      break;
+
+    default:
+      /* Any other segment leaves the state and nesting count alone. */
+      break;
+    }
+
+  return len;
+}
+
+/* Searches INPUT for the first occurrence of "!ENDDEFINE", comparing
+   candidates with ss_equals_case().  Returns the byte offset of the match
+   within INPUT, or SIZE_MAX if there is none. */
+static size_t
+find_enddefine (struct substring input)
+{
+  const struct substring keyword = ss_cstr ("!ENDDEFINE");
+
+  /* INPUT is too short to hold the keyword at all. */
+  if (input.length < keyword.length)
+    return SIZE_MAX;
+
+  size_t last_start = input.length - keyword.length;
+  for (size_t pos = 0; pos <= last_start; pos++)
+    {
+      /* Cheap first-byte filter before the full comparison. */
+      if (input.string[pos] != '!')
+        continue;
+
+      if (ss_equals_case (ss_substr (input, pos, keyword.length), keyword))
+        return pos;
+    }
+  return SIZE_MAX;
+}
+
+/* State S_DEFINE_3: we are in the body of a macro definition, at the start of
+   a line, looking for additional lines of the body or !ENDDEFINE.
+
+   Waits until a whole line is available (terminated by a new-line, or by end
+   of input if EOF is true) and returns -1 if it is not.  Otherwise:
+
+     - If the line has no !ENDDEFINE, reports the line (without its line
+       terminator) as SEG_MACRO_BODY and moves to S_DEFINE_4.
+
+     - If the line begins with !ENDDEFINE, switches to S_GENERAL and hands the
+       input to segmenter_push().
+
+     - Otherwise reports the text before !ENDDEFINE, as SEG_SPACES if
+       is_all_spaces() accepts it or as SEG_MACRO_BODY if not, and switches
+       to S_GENERAL. */
+static int
+segmenter_parse_define_3__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  /* Gather a whole line, excluding the trailing "\n" or "\r\n". */
+  const char *eol = memchr (input, '\n', n);
+  int line_len;
+  if (eol != NULL)
+    {
+      bool crlf = eol > input && eol[-1] == '\r';
+      line_len = eol - input - crlf;
+    }
+  else if (eof)
+    line_len = n;
+  else
+    return -1;
+
+  if (line_len < 0)
+    return -1;
+
+  /* Does the line contain !ENDDEFINE? */
+  size_t end = find_enddefine (ss_buffer (input, line_len));
+  if (end == SIZE_MAX)
+    {
+      /* No !ENDDEFINE.  We have a full line of macro body.
+
+         The line might be blank, whether completely empty or just spaces and
+         comments.  That's OK: we need to report blank lines because they can
+         have significance. */
+      *type = SEG_MACRO_BODY;
+      s->state = S_DEFINE_4;
+      return line_len;
+    }
+
+  /* Macro ends at the !ENDDEFINE on this line. */
+  s->state = S_GENERAL;
+  s->substate = 0;
+
+  if (end == 0)
+    {
+      /* Line starts with !ENDDEFINE. */
+      return segmenter_push (s, input, n, eof, type);
+    }
+
+  /* Line starts with either spaces or some content, followed by
+     !ENDDEFINE.  Report just the part in front of !ENDDEFINE. */
+  *type = is_all_spaces (input, end) ? SEG_SPACES : SEG_MACRO_BODY;
+  return end;
+}
+
+/* State S_DEFINE_4: we have just reported a line of macro body, so the next
+   thing in INPUT is handed to segmenter_parse_newline__().  Once that
+   succeeds, goes back to S_DEFINE_3 for the next line.
+
+   Returns the value from segmenter_parse_newline__(), or -1 if that value was
+   negative. */
+static int
+segmenter_parse_define_4__ (struct segmenter *s,
+                            const char *input, size_t n, bool eof,
+                            enum segment_type *type)
+{
+  int len = segmenter_parse_newline__ (input, n, eof, type);
+  if (len >= 0)
+    {
+      s->state = S_DEFINE_3;
+      return len;
+    }
+  return -1;
+}
+
static int
segmenter_parse_begin_data_1__ (struct segmenter *s,
const char *input, size_t n, bool eof,
case S_DO_REPEAT_3:
return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
+ case S_DEFINE_1:
+ return segmenter_parse_define_1__ (s, input, n, eof, type);
+ case S_DEFINE_2:
+ return segmenter_parse_define_2__ (s, input, n, eof, type);
+ case S_DEFINE_3:
+ return segmenter_parse_define_3__ (s, input, n, eof, type);
+ case S_DEFINE_4:
+ return segmenter_parse_define_4__ (s, input, n, eof, type);
+
case S_BEGIN_DATA_1:
return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
case S_BEGIN_DATA_2:
case S_DO_REPEAT_3:
return PROMPT_DO_REPEAT;
+ case S_DEFINE_1:
+ case S_DEFINE_2:
+ return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
+ case S_DEFINE_3:
+ case S_DEFINE_4:
+ return PROMPT_DEFINE;
+
case S_BEGIN_DATA_1:
return PROMPT_FIRST;
case S_BEGIN_DATA_2: