S_DOCUMENT_1,
S_DOCUMENT_2,
S_DOCUMENT_3,
- S_FILE_LABEL,
+ S_FILE_LABEL_1,
+ S_FILE_LABEL_2,
+ S_FILE_LABEL_3,
S_DO_REPEAT_1,
S_DO_REPEAT_2,
S_DO_REPEAT_3,
+ S_DEFINE_1,
+ S_DEFINE_2,
+ S_DEFINE_3,
+ S_DEFINE_4,
S_BEGIN_DATA_1,
S_BEGIN_DATA_2,
S_BEGIN_DATA_3,
S_BEGIN_DATA_4,
- S_TITLE_1,
- S_TITLE_2
};
#define SS_START_OF_LINE (1u << 0)
return is_end_of_line (input, n, eof, ofs);
}
+static bool
+is_all_spaces (const char *input_, size_t n)
+{
+ const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
+ /* Returns true only if every character decoded from the N bytes at INPUT_
+    satisfies lex_uc_is_space, and false as soon as one does not.  When N is
+    zero the loop below never runs, so the result is true. */
+ int mblen;
+ for (int ofs = 0; ofs < n; ofs += mblen)
+ {
+ ucs4_t uc;
+ mblen = u8_mbtouc (&uc, input + ofs, n - ofs); /* Decode one character; MBLEN is how far OFS advances. */
+ if (!lex_uc_is_space (uc))
+ return false;
+ }
+ return true;
+}
+
static int
segmenter_parse_newline__ (const char *input, size_t n, bool eof,
enum segment_type *type)
}
/* fall through */
- case SEG_MACRO_ID:
case SEG_NUMBER:
case SEG_QUOTED_STRING:
case SEG_HEX_STRING:
case SEG_COMMENT_COMMAND:
case SEG_DO_REPEAT_COMMAND:
case SEG_INLINE_DATA:
+ case SEG_MACRO_ID:
+ case SEG_MACRO_BODY:
case SEG_START_DOCUMENT:
case SEG_DOCUMENT:
case SEG_START_COMMAND:
case SEG_END:
case SEG_EXPECTED_QUOTE:
case SEG_EXPECTED_EXPONENT:
- case SEG_UNEXPECTED_DOT:
case SEG_UNEXPECTED_CHAR:
id[0] = '\0';
return ofs + retval;
*type = SEG_START_DOCUMENT;
return 0;
}
- else if (lex_id_match (ss_cstr ("TITLE"), word)
- || lex_id_match (ss_cstr ("SUBTITLE"), word))
+ else if (lex_id_match_n (ss_cstr ("DEFINE"), word, 6))
{
- int result = segmenter_unquoted (input, n, eof, ofs);
- if (result < 0)
- return -1;
- else if (result)
- {
- s->state = S_TITLE_1;
- return ofs;
- }
+ s->state = S_DEFINE_1;
+ return ofs;
}
else if (lex_id_match (ss_cstr ("FILE"), word))
{
return -1;
else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
{
- s->state = S_FILE_LABEL;
+ s->state = S_FILE_LABEL_1;
s->substate = 0;
return ofs;
}
s->substate = SS_START_OF_COMMAND;
}
else
- *type = SEG_UNEXPECTED_DOT;
+ *type = SEG_PUNCT;
return 1;
case '0': case '1': case '2': case '3': case '4':
}
static int
-segmenter_parse_file_label__ (struct segmenter *s,
- const char *input, size_t n, bool eof,
- enum segment_type *type)
+segmenter_parse_file_label_1__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
{
struct segmenter sub;
int ofs;
else
{
if (result)
- s->state = S_TITLE_1;
+ s->state = S_FILE_LABEL_2;
else
*s = sub;
return ofs;
}
}
+static int
+segmenter_parse_file_label_2__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
+{
+ int ofs;
+ /* Handler for state S_FILE_LABEL_2.  Reports the length that skip_spaces
+    measures from offset 0 as a SEG_SPACES segment and advances the state to
+    S_FILE_LABEL_3.  A negative result from skip_spaces is passed back as -1
+    and leaves the state untouched. */
+ ofs = skip_spaces (input, n, eof, 0);
+ if (ofs < 0)
+ return -1;
+ s->state = S_FILE_LABEL_3;
+ *type = SEG_SPACES;
+ return ofs;
+}
+/* Handler for state S_FILE_LABEL_3.  Scans INPUT up to, but not including,
+   the first newline, or through all N bytes when EOF is set, and reports
+   that text as one SEG_UNQUOTED_STRING segment.  Afterward the segmenter is
+   back in S_GENERAL with substate 0.
+
+   ENDCMD holds the offset of the most recent period that has only spaces
+   after it, or -1 if there is none.  When it is set, the segment stops just
+   before that period, so the period is left in the input for the next call.
+
+   Returns the segment length, or -1 if segmenter_u8_to_uc__ returns a
+   negative value or if no newline was found and EOF is false. */
+static int
+segmenter_parse_file_label_3__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
+{
+ int endcmd;
+ int ofs;
+
+ endcmd = -1;
+ ofs = 0;
+ while (ofs < n)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
+ if (mblen < 0)
+ return -1;
+
+ switch (uc)
+ {
+ case '\n':
+ goto end_of_line;
+
+ case '.':
+ endcmd = ofs; /* Candidate end-of-command period (going by the name ENDCMD). */
+ break;
+
+ default:
+ if (!lex_uc_is_space (uc))
+ endcmd = -1; /* Non-space text follows, so forget the earlier period. */
+ break;
+ }
+
+ ofs += mblen;
+ }
+
+ if (eof)
+ {
+ end_of_line:
+ s->state = S_GENERAL;
+ s->substate = 0;
+ *type = SEG_UNQUOTED_STRING;
+ return endcmd >= 0 ? endcmd : ofs;
+ }
+
+ return -1;
+}
+
static int
segmenter_subparse (struct segmenter *s,
const char *input, size_t n, bool eof,
}
}
+/* We are segmenting a DEFINE command, which consists of:
+
+ - The DEFINE keyword.
+
+ - Anything but "(".
+
+ - "(" followed by a sequence of tokens possibly including balanced parentheses
+ up to a final ")".
+
+ - A sequence of any number of lines, one string per line, ending with
+ "!ENDDEFINE". The first line is usually blank (that is, a newline follows
+ the "("). The last line usually just has "!ENDDEFINE." on it, but it can
+ start with other tokens. The whole DEFINE...!ENDDEFINE can be on a single
+ line, even.
+
+ This function is the handler for state S_DEFINE_1.  It passes through each
+ segment that segmenter_subparse produces.  At the first left parenthesis it
+ sets S->nest to 1 and moves to S_DEFINE_2.  At a command boundary segment
+ (SEG_SEPARATE_COMMANDS, SEG_END_COMMAND, or SEG_START_COMMAND) it moves back
+ to S_GENERAL.  Any other segment leaves the state unchanged.
+
+ Returns the segment length from segmenter_subparse, or -1 if that value was
+ negative.
+ */
+static int
+segmenter_parse_define_1__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
+{
+ int ofs = segmenter_subparse (s, input, n, eof, type);
+ if (ofs < 0)
+ return -1;
+
+ if (*type == SEG_SEPARATE_COMMANDS
+ || *type == SEG_END_COMMAND
+ || *type == SEG_START_COMMAND)
+ {
+ /* The DEFINE command is malformed because we reached its end without
+ ever hitting a "(" token. Transition back to general parsing. */
+ s->state = S_GENERAL;
+ return ofs;
+ }
+ else if (*type == SEG_PUNCT && input[0] == '(')
+ {
+ s->state = S_DEFINE_2;
+ s->nest = 1;
+ return ofs;
+ }
+
+ return ofs;
+}
+/* Handler for state S_DEFINE_2, which is entered with S->nest equal to 1
+   after the first left parenthesis of a DEFINE command.  It passes through
+   each segment that segmenter_subparse produces while tracking parenthesis
+   depth in S->nest: a left parenthesis increments it and a right parenthesis
+   decrements it.  When the count reaches zero the state becomes S_DEFINE_3
+   with substate 0.  A command boundary segment (SEG_SEPARATE_COMMANDS,
+   SEG_END_COMMAND, or SEG_START_COMMAND) sends the state back to S_GENERAL.
+
+   Returns the segment length from segmenter_subparse, or -1 if that value
+   was negative. */
+static int
+segmenter_parse_define_2__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
+{
+ int ofs = segmenter_subparse (s, input, n, eof, type);
+ if (ofs < 0)
+ return -1;
+
+ if (*type == SEG_SEPARATE_COMMANDS
+ || *type == SEG_END_COMMAND
+ || *type == SEG_START_COMMAND)
+ {
+ /* The DEFINE command is malformed because we reached its end before
+ closing the set of parentheses. Transition back to general
+ parsing. */
+ s->state = S_GENERAL;
+ return ofs;
+ }
+ else if (*type == SEG_PUNCT && input[0] == '(')
+ {
+ s->nest++;
+ return ofs;
+ }
+ else if (*type == SEG_PUNCT && input[0] == ')')
+ {
+ s->nest--;
+ if (!s->nest)
+ {
+ s->state = S_DEFINE_3;
+ s->substate = 0;
+ }
+ return ofs;
+ }
+
+ return ofs;
+}
+/* Searches INPUT for !ENDDEFINE and returns the byte offset where it starts,
+   or SIZE_MAX if it is not found.  The comparison goes through
+   ss_equals_case (NOTE(review): presumably case-insensitive; confirm against
+   its definition).
+
+   Two kinds of text are stepped over without being searched: whatever
+   skip_spaces_and_comments skips, and everything from a single or double
+   quote character through the next occurrence of that same character.  If
+   such a quoted run is not closed within INPUT, the result is SIZE_MAX. */
+static size_t
+find_enddefine (struct substring input)
+{
+ size_t n = input.length;
+ const struct substring enddefine = ss_cstr ("!ENDDEFINE");
+ for (int ofs = 0;;)
+ {
+ /* Skip !ENDDEFINE in comment. */
+ ofs = skip_spaces_and_comments (input.string, n, true, ofs);
+ if (ofs + enddefine.length > n) /* Too few bytes remain to hold the keyword. */
+ return SIZE_MAX;
+
+ char c = input.string[ofs];
+ if (c == '!'
+ && ss_equals_case (ss_substr (input, ofs, enddefine.length),
+ enddefine))
+ return ofs;
+ else if (c == '\'' || c == '"')
+ {
+ /* Skip quoted !ENDDEFINE. */
+ ofs++;
+ for (;;)
+ {
+ if (ofs >= n)
+ return SIZE_MAX;
+ else if (input.string[ofs++] == c)
+ break;
+ }
+ }
+ else
+ ofs++;
+ }
+}
+
+/* We are in the body of a macro definition, looking for additional lines of
+ the body or !ENDDEFINE.
+
+ This handler works on one line at a time and returns -1 until a complete
+ line is available, that is, until INPUT contains a newline or EOF is set.
+ S->substate is 0 while the first line of the body is pending and is set to
+ 1 once a line without !ENDDEFINE has been reported.
+
+ If the line has no !ENDDEFINE, the whole line (without its terminator) is
+ returned and the state becomes S_DEFINE_4.  Otherwise the state becomes
+ S_GENERAL with substate 0, and either the text before !ENDDEFINE is
+ returned or, when the line starts with !ENDDEFINE, the result of
+ segmenter_push is returned. */
+static int
+segmenter_parse_define_3__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
+{
+ /* Gather a whole line, leaving out the newline and a carriage return
+ directly before it. */
+ const char *newline = memchr (input, '\n', n);
+ int ofs = (newline ? newline - input - (newline > input && newline[-1] == '\r')
+ : eof ? n
+ : -1);
+ if (ofs < 0)
+ return -1;
+
+ /* Does the line contain !ENDDEFINE? */
+ size_t end = find_enddefine (ss_buffer (input, ofs));
+ if (end == SIZE_MAX)
+ {
+ /* No !ENDDEFINE. We have a full line of macro body.
+
+ The line might be blank, whether completely empty or just spaces and
+ comments. That's OK: we need to report blank lines because they can
+ have significance.
+
+ However, if the first line of the macro body (the same line as the
+ closing parenthesis in the argument definition) is blank, we just
+ report it as spaces because it's not significant. */
+ *type = (s->substate == 0 && is_all_spaces (input, ofs)
+ ? SEG_SPACES : SEG_MACRO_BODY);
+ s->state = S_DEFINE_4;
+ s->substate = 1;
+ return ofs;
+ }
+ else
+ {
+ /* Macro ends at the !ENDDEFINE on this line. */
+ s->state = S_GENERAL;
+ s->substate = 0;
+ if (!end)
+ {
+ /* Line starts with !ENDDEFINE. */
+ return segmenter_push (s, input, n, eof, type);
+ }
+ else
+ {
+ if (is_all_spaces (input, end))
+ {
+ /* Line starts with spaces followed by !ENDDEFINE. */
+ *type = SEG_SPACES;
+ }
+ else
+ {
+ /* Line starts with some content followed by !ENDDEFINE. */
+ *type = SEG_MACRO_BODY;
+ }
+ return end;
+ }
+ }
+}
+/* Handler for state S_DEFINE_4, which segmenter_parse_define_3__ enters
+   after reporting a macro body line that has no !ENDDEFINE.  Delegates to
+   segmenter_parse_newline__ and, if that succeeds, switches back to
+   S_DEFINE_3 for the next line.  Returns the length from
+   segmenter_parse_newline__, or -1 if that value was negative, in which case
+   the state is left unchanged. */
+static int
+segmenter_parse_define_4__ (struct segmenter *s,
+ const char *input, size_t n, bool eof,
+ enum segment_type *type)
+{
+ int ofs = segmenter_parse_newline__ (input, n, eof, type);
+ if (ofs < 0)
+ return -1;
+
+ s->state = S_DEFINE_3;
+ return ofs;
+}
+
static int
segmenter_parse_begin_data_1__ (struct segmenter *s,
const char *input, size_t n, bool eof,
return ofs;
}
-static int
-segmenter_parse_title_1__ (struct segmenter *s,
- const char *input, size_t n, bool eof,
- enum segment_type *type)
-{
- int ofs;
-
- ofs = skip_spaces (input, n, eof, 0);
- if (ofs < 0)
- return -1;
- s->state = S_TITLE_2;
- *type = SEG_SPACES;
- return ofs;
-}
-
-static int
-segmenter_parse_title_2__ (struct segmenter *s,
- const char *input, size_t n, bool eof,
- enum segment_type *type)
-{
- int endcmd;
- int ofs;
-
- endcmd = -1;
- ofs = 0;
- while (ofs < n)
- {
- ucs4_t uc;
- int mblen;
-
- mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
- if (mblen < 0)
- return -1;
-
- switch (uc)
- {
- case '\n':
- goto end_of_line;
-
- case '.':
- endcmd = ofs;
- break;
-
- default:
- if (!lex_uc_is_space (uc))
- endcmd = -1;
- break;
- }
-
- ofs += mblen;
- }
-
- if (eof)
- {
- end_of_line:
- s->state = S_GENERAL;
- s->substate = 0;
- *type = SEG_UNQUOTED_STRING;
- return endcmd >= 0 ? endcmd : ofs;
- }
-
- return -1;
-}
-
/* Returns the name of segment TYPE as a string. The caller must not modify
or free the returned string.
}
}
-/* Initializes S as a segmenter with the given syntax MODE.
+/* Returns a segmenter with the given syntax MODE.
+
+ If IS_SNIPPET is false, then the segmenter will parse as if it's being given
+ a whole file. This means, for example, that it will interpret - or + at the
+ beginning of the syntax as a separator between commands (since - or + at the
+ beginning of a line has this meaning).
+
+ If IS_SNIPPET is true, then the segmenter will parse as if it's being given
+ an isolated piece of syntax. This means, for example, that it will
+ interpret - or + at the beginning of the syntax as an operator token or (if
+ followed by a digit) as part of a number.
A segmenter does not contain any external references, so nothing needs to be
done to destroy one. For the same reason, segmenters may be copied with
plain struct assignment (or memcpy). */
-void
-segmenter_init (struct segmenter *s, enum segmenter_mode mode)
+struct segmenter
+segmenter_init (enum segmenter_mode mode, bool is_snippet)
{
- s->state = S_SHBANG;
- s->substate = 0;
- s->mode = mode;
+ return (struct segmenter) {
+ .state = is_snippet ? S_GENERAL : S_SHBANG,
+ .mode = mode, /* Members not named here, such as substate, start out as zero. */
+ };
}
/* Returns the mode passed to segmenter_init() for S. */
case S_DOCUMENT_3:
return segmenter_parse_document_3__ (s, type);
- case S_FILE_LABEL:
- return segmenter_parse_file_label__ (s, input, n, eof, type);
+ case S_FILE_LABEL_1:
+ return segmenter_parse_file_label_1__ (s, input, n, eof, type);
+ case S_FILE_LABEL_2:
+ return segmenter_parse_file_label_2__ (s, input, n, eof, type);
+ case S_FILE_LABEL_3:
+ return segmenter_parse_file_label_3__ (s, input, n, eof, type);
case S_DO_REPEAT_1:
return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
case S_DO_REPEAT_3:
return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
+ case S_DEFINE_1:
+ return segmenter_parse_define_1__ (s, input, n, eof, type);
+ case S_DEFINE_2:
+ return segmenter_parse_define_2__ (s, input, n, eof, type);
+ case S_DEFINE_3:
+ return segmenter_parse_define_3__ (s, input, n, eof, type);
+ case S_DEFINE_4:
+ return segmenter_parse_define_4__ (s, input, n, eof, type);
+
case S_BEGIN_DATA_1:
return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
case S_BEGIN_DATA_2:
return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
case S_BEGIN_DATA_4:
return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
-
- case S_TITLE_1:
- return segmenter_parse_title_1__ (s, input, n, eof, type);
- case S_TITLE_2:
- return segmenter_parse_title_2__ (s, input, n, eof, type);
}
NOT_REACHED ();
case S_DOCUMENT_3:
return PROMPT_FIRST;
- case S_FILE_LABEL:
+ case S_FILE_LABEL_1:
return PROMPT_LATER;
+ case S_FILE_LABEL_2:
+ case S_FILE_LABEL_3:
+ return PROMPT_FIRST;
case S_DO_REPEAT_1:
case S_DO_REPEAT_2:
case S_DO_REPEAT_3:
return PROMPT_DO_REPEAT;
+ case S_DEFINE_1:
+ case S_DEFINE_2:
+ return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
+ case S_DEFINE_3:
+ case S_DEFINE_4:
+ return PROMPT_DEFINE;
+
case S_BEGIN_DATA_1:
return PROMPT_FIRST;
case S_BEGIN_DATA_2:
case S_BEGIN_DATA_4:
return PROMPT_DATA;
- case S_TITLE_1:
- case S_TITLE_2:
- return PROMPT_FIRST;
}
NOT_REACHED ();