/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <unistr.h>
#include <uniwidth.h>
-#include "data/file-name.h"
#include "language/command.h"
#include "language/lexer/scan.h"
#include "language/lexer/segment.h"
{
reader->class = class;
reader->syntax = LEX_SYNTAX_AUTO;
- reader->error = LEX_ERROR_INTERACTIVE;
+ reader->error = LEX_ERROR_CONTINUE;
reader->file_name = NULL;
+ reader->encoding = NULL;
reader->line_number = 0;
}
ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
}
\f
-/* Advacning. */
+/* Advancing. */
static struct lex_token *
lex_push_token__ (struct lex_source *src)
va_end (args);
}
-/* Reports an error to the effect that subcommand SBC may only be
- specified once. */
+/* Prints a syntax error message saying that OPTION0 or one of the other
+   strings following it, up to the first NULL, is expected.
+
+   The argument list must be terminated by a null pointer.  At most
+   MAX_OPTIONS (8) alternatives are reported; any alternatives beyond that
+   are ignored.  If OPTION0 itself is NULL, lex_error() is called with a
+   null format. */
+void
+lex_error_expecting (struct lexer *lexer, const char *option0, ...)
+{
+  enum { MAX_OPTIONS = 8 };
+  const char *options[MAX_OPTIONS + 1];
+  va_list args;
+  int n;
+
+  /* Collect the alternatives.  The array has room for MAX_OPTIONS strings
+     plus the terminator, so the loop may run until N reaches MAX_OPTIONS;
+     stopping one earlier would silently drop the eighth alternative.  The
+     loop never reads past the caller's NULL terminator because it stops as
+     soon as the most recently stored entry is NULL. */
+  va_start (args, option0);
+  options[0] = option0;
+  n = 0;
+  while (n < MAX_OPTIONS && options[n] != NULL)
+    options[++n] = va_arg (args, const char *);
+  va_end (args);
+
+  /* Each count gets its own format string so that translators see complete
+     sentences. */
+  switch (n)
+    {
+    case 0:
+      lex_error (lexer, NULL);
+      break;
+
+    case 1:
+      lex_error (lexer, _("expecting %s"), options[0]);
+      break;
+
+    case 2:
+      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
+      break;
+
+    case 3:
+      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
+                 options[2]);
+      break;
+
+    case 4:
+      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
+                 options[0], options[1], options[2], options[3]);
+      break;
+
+    case 5:
+      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
+                 options[0], options[1], options[2], options[3], options[4]);
+      break;
+
+    case 6:
+      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
+                 options[0], options[1], options[2], options[3], options[4],
+                 options[5]);
+      break;
+
+    case 7:
+      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
+                 options[0], options[1], options[2], options[3], options[4],
+                 options[5], options[6]);
+      break;
+
+    case 8:
+      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
+                 options[0], options[1], options[2], options[3], options[4],
+                 options[5], options[6], options[7]);
+      break;
+
+    default:
+      NOT_REACHED ();
+    }
+}
+
+/* Reports an error to the effect that subcommand SBC may only be specified
+ once. SBC is substituted into the message as given, and the message is
+ issued through msg() with class SE.
+
+ This function does not take a lexer as an argument or use lex_error(),
+ because the result would ordinarily just be redundant: "Syntax error at
+ SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
+ not help the user find the error. */
void
lex_sbc_only_once (const char *sbc)
{
msg (SE, _("Subcommand %s may only be specified once."), sbc);
}
-/* Reports an error to the effect that subcommand SBC is
- missing. */
+/* Reports an error to the effect that subcommand SBC is missing. SBC is
+ substituted into the message as given, and the message is issued through
+ msg() with class SE.
+
+ This function does not take a lexer as an argument or use lex_error(),
+ because a missing subcommand can normally be detected only after the whole
+ command has been parsed, and so lex_error() would always report "Syntax
+ error at end of command", which does not help the user find the error. */
+void
+lex_sbc_missing (const char *sbc)
+{
+ msg (SE, _("Required subcommand %s was not specified."), sbc);
+}
+
+/* Reports an error to the effect that specification SPEC may only be specified
+ once within subcommand SBC. The message is issued through lex_error() on
+ LEXER; SPEC fills the first placeholder and SBC the second. */
+void
+lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
+{
+ lex_error (lexer, _("%s may only be specified once within subcommand %s"),
+ spec, sbc);
+}
+
+/* Reports an error to the effect that specification SPEC is missing within
+   subcommand SBC.  The message is issued through lex_error() on LEXER. */
void
-lex_sbc_missing (struct lexer *lexer, const char *sbc)
+lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
{
- lex_error (lexer, _("missing required subcommand %s"), sbc);
+  /* The first %s names the specification and the second names the
+     subcommand, so SPEC must be passed before SBC. */
+  lex_error (lexer, _("Required %s specification missing from %s subcommand"),
+             spec, sbc);
}
/* Prints a syntax error message containing the current token and
return true;
else
{
- lex_error (lexer, _("expecting `%s'"), identifier);
+ lex_error_expecting (lexer, identifier, NULL_SENTINEL);
return false;
}
}
}
else
{
- lex_error (lexer, _("expecting `%s'"), token_type_to_string (type));
+ char *s = xasprintf ("`%s'", token_type_to_string (type));
+ lex_error_expecting (lexer, s, NULL_SENTINEL);
+ free (s);
return false;
}
}
}
}
+/* If the current token is a string or an identifier, does nothing and returns
+   true.  Otherwise, reports an error and returns false.
+
+   This is meant for use in syntactic situations where we want to encourage the
+   user to supply a quoted string, but for compatibility we also accept
+   identifiers.  (One example of such a situation is file names.)  Therefore,
+   the error message issued when the current token is wrong only says that a
+   string is expected and doesn't mention that an identifier would also be
+   accepted. */
+bool
+lex_force_string_or_id (struct lexer *lexer)
+{
+  /* Accept an identifier silently; anything else is left to
+     lex_force_string(), which reports the error for non-strings.  Lookahead
+     index 0 is the token the lexer is currently positioned at. */
+  return lex_next (lexer, 0)->type == T_ID || lex_force_string (lexer);
+}
+
/* If the current token is an integer, does nothing and returns true.
Otherwise, reports an error and returns false. */
bool
return lex_next (lexer, n)->string;
}
-/* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
- true. Otherwise, returns false.
-
- S may consist of an arbitrary number of identifiers, integers, and
- punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
- Identifiers may be abbreviated to their first three letters. Currently only
- hyphens, slashes, and equals signs are supported as punctuation (but it
- would be easy to add more).
-
- S must be an ASCII string. */
-bool
-lex_match_phrase (struct lexer *lexer, const char *s)
+static bool
+lex_tokens_match (const struct token *actual, const struct token *expected)
{
- int tok_idx;
+ if (actual->type != expected->type)
+ return false;
- for (tok_idx = 0; ; tok_idx++)
+ switch (actual->type)
{
- enum token_type token;
- unsigned char c;
+ case T_POS_NUM:
+ case T_NEG_NUM:
+ return actual->number == expected->number;
- while (c_isspace (*s))
- s++;
+ case T_ID:
+ return lex_id_match (expected->string, actual->string);
- c = *s;
- if (c == '\0')
- {
- int i;
+ case T_STRING:
+ return (actual->string.length == expected->string.length
+ && !memcmp (actual->string.string, expected->string.string,
+ actual->string.length));
- for (i = 0; i < tok_idx; i++)
- lex_get (lexer);
- return true;
- }
-
- token = lex_next_token (lexer, tok_idx);
- switch (c)
- {
- case '-':
- if (token != T_DASH)
- return false;
- s++;
- break;
-
- case '/':
- if (token != T_SLASH)
- return false;
- s++;
- break;
-
- case '=':
- if (token != T_EQUALS)
- return false;
- s++;
- break;
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- {
- unsigned int value;
-
- if (token != T_POS_NUM)
- return false;
-
- value = 0;
- do
- {
- value = value * 10 + (*s++ - '0');
- }
- while (c_isdigit (*s));
-
- if (lex_next_tokval (lexer, tok_idx) != value)
- return false;
- }
- break;
+ default:
+ return true;
+ }
+}
- default:
- if (lex_is_id1 (c))
- {
- int len;
+/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
+ skips it and returns true. Otherwise, returns false.
- if (token != T_ID)
- return false;
+ S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
+ "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
+ first three letters. */
+bool
+lex_match_phrase (struct lexer *lexer, const char *s)
+{
+ struct string_lexer slex;
+ struct token token;
+ int i;
- len = lex_id_get_length (ss_cstr (s));
- if (!lex_id_match (ss_buffer (s, len),
- lex_next_tokss (lexer, tok_idx)))
- return false;
+ i = 0;
+ string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
+ while (string_lexer_next (&slex, &token))
+ if (token.type != SCAN_SKIP)
+ {
+ bool match = lex_tokens_match (lex_next (lexer, i++), &token);
+ token_destroy (&token);
+ if (!match)
+ return false;
+ }
- s += len;
- }
- else
- NOT_REACHED ();
- }
- }
+ while (i-- > 0)
+ lex_get (lexer);
+ return true;
}
static int
return src == NULL ? NULL : src->reader->file_name;
}
+/* Returns the encoding stored in the reader of the source that
+   lex_source__() yields for LEXER, or NULL if there is no such source. */
+const char *
+lex_get_encoding (const struct lexer *lexer)
+{
+  struct lex_source *source = lex_source__ (lexer);
+
+  if (source == NULL)
+    return NULL;
+  return source->reader->encoding;
+}
+
+
/* Returns the syntax mode for the syntax file from which the current command is
drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
source does not have line numbers.
}
/* Returns the error mode for the syntax file from which the current command is
- drawn. Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
+ drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
source does not have line numbers.
There is no version of this function that takes an N argument because
lex_get_error_mode (const struct lexer *lexer)
{
struct lex_source *src = lex_source__ (lexer);
- return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
+ return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
}
/* If the source that LEXER is currently reading has error mode
- LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
- next token to be read comes directly from whatever is next read from the
- stream.
+ LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
+ token to be read comes directly from whatever is next read from the stream.
It makes sense to call this function after encountering an error in a
command entered on the console, because usually the user would prefer not to
lex_interactive_reset (struct lexer *lexer)
{
struct lex_source *src = lex_source__ (lexer);
- if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
+ if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
{
src->head = src->tail = 0;
src->journal_pos = src->seg_pos = src->line_pos = 0;
}
/* Discards all lookahead tokens in LEXER, then discards all input sources
- until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
+ until it encounters one with error mode LEX_ERROR_TERMINAL or until it
runs out of input sources. */
void
lex_discard_noninteractive (struct lexer *lexer)
while (!deque_is_empty (&src->deque))
lex_source_pop__ (src);
- for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
+ for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
src = lex_source__ (lexer))
lex_source_destroy (src);
}
do
{
size_t head_ofs;
+ size_t space;
size_t n;
lex_source_expand__ (src);
head_ofs = src->head - src->tail;
+ space = src->allocated - head_ofs;
n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
- src->allocated - head_ofs,
+ space,
segmenter_get_prompt (&src->segmenter));
+ assert (n <= space);
+
if (n == 0)
{
/* End of input.
va_end (args);
}
+/* Attempts to append an additional token into SRC's deque, reading more from
+ the underlying lex_reader if necessary. Returns true if successful, false
+ if the deque already represents (a suffix of) the whole lex_reader's
+ contents. */
static bool
lex_source_get__ (const struct lex_source *src_)
{
struct lex_source *src = CONST_CAST (struct lex_source *, src_);
+ if (src->eof)
+ return false;
+ /* State maintained while scanning tokens. Usually we only need a single
+ state, but scanner_push() can return SCAN_SAVE to indicate that the state
+ needs to be saved and possibly restored later with SCAN_BACK. */
struct state
{
struct segmenter segmenter;
enum segment_type last_segment;
- int newlines;
+ int newlines; /* Number of newlines encountered so far. */
+ /* Maintained here so we can update lex_source's similar members when we
+ finish. */
size_t line_pos;
size_t seg_pos;
};
- struct state state, saved;
- enum scan_result result;
- struct scanner scanner;
- struct lex_token *token;
- int n_lines;
- int i;
-
- if (src->eof)
- return false;
-
- state.segmenter = src->segmenter;
- state.newlines = 0;
- state.seg_pos = src->seg_pos;
- state.line_pos = src->line_pos;
- saved = state;
+ /* Initialize state. */
+ struct state state =
+ {
+ .segmenter = src->segmenter,
+ .newlines = 0,
+ .seg_pos = src->seg_pos,
+ .line_pos = src->line_pos,
+ };
+ struct state saved = state;
- token = lex_push_token__ (src);
+ /* Append a new token to SRC and initialize it. */
+ struct lex_token *token = lex_push_token__ (src);
+ struct scanner scanner;
scanner_init (&scanner, &token->token);
token->line_pos = src->line_pos;
token->token_pos = src->seg_pos;
else
token->first_line = 0;
+ /* Extract segments and pass them through the scanner until we obtain a
+ token. */
for (;;)
{
+ /* Extract a segment. */
+ const char *segment = &src->buffer[state.seg_pos - src->tail];
+ size_t seg_maxlen = src->head - state.seg_pos;
enum segment_type type;
- const char *segment;
- size_t seg_maxlen;
- int seg_len;
-
- segment = &src->buffer[state.seg_pos - src->tail];
- seg_maxlen = src->head - state.seg_pos;
- seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
+ int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
+ &type);
if (seg_len < 0)
{
+ /* The segmenter needs more input to produce a segment. */
lex_source_read__ (src);
continue;
}
+ /* Update state based on the segment. */
state.last_segment = type;
state.seg_pos += seg_len;
if (type == SEG_NEWLINE)
state.line_pos = state.seg_pos;
}
- result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
- &token->token);
+ /* Pass the segment into the scanner and try to get a token out. */
+ enum scan_result result = scanner_push (&scanner, type,
+ ss_buffer (segment, seg_len),
+ &token->token);
if (result == SCAN_SAVE)
saved = state;
else if (result == SCAN_BACK)
break;
}
- n_lines = state.newlines;
+ /* If we've reached the end of a line, or the end of a command, then pass
+ the line to the output engine as a syntax text item. */
+ int n_lines = state.newlines;
if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
{
n_lines++;
n_lines--;
src->suppress_next_newline = false;
}
- for (i = 0; i < n_lines; i++)
+ for (int i = 0; i < n_lines; i++)
{
- const char *newline;
- const char *line;
- size_t line_len;
-
- line = &src->buffer[src->journal_pos - src->tail];
- newline = rawmemchr (line, '\n');
- line_len = newline - line;
+ /* Locate the next unjournaled line in the buffer; NEWLINE points at its
+ terminating '\n'. */
+ const char *line = &src->buffer[src->journal_pos - src->tail];
+ const char *newline = rawmemchr (line, '\n');
+ size_t line_len = newline - line;
if (line_len > 0 && line[line_len - 1] == '\r')
line_len--;
- text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
- xmemdup0 (line, line_len)));
+ /* Copy the line (without any '\r') and terminate it with "\n". Use
+ xmalloc() like the rest of this file, rather than plain malloc(), so
+ that an allocation failure cannot lead to writing through a null
+ pointer in memcpy() below. */
+ char *syntax = xmalloc (line_len + 2);
+ memcpy (syntax, line, line_len);
+ syntax[line_len] = '\n';
+ syntax[line_len + 1] = '\0';
+
+ text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
src->journal_pos += newline - line + 1;
}
lex_source_destroy (struct lex_source *src)
{
char *file_name = src->reader->file_name;
- if (src->reader->class->close != NULL)
- src->reader->class->close (src->reader);
+ char *encoding = src->reader->encoding;
+ if (src->reader->class->destroy != NULL)
+ src->reader->class->destroy (src->reader);
free (file_name);
+ free (encoding);
free (src->buffer);
while (!deque_is_empty (&src->deque))
lex_source_pop__ (src);
{
struct lex_reader reader;
struct u8_istream *istream;
- char *file_name;
};
static struct lex_reader_class lex_file_reader_class;
r->reader.syntax = syntax;
r->reader.error = error;
r->reader.file_name = xstrdup (file_name);
+ r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
r->reader.line_number = 1;
r->istream = istream;
- r->file_name = xstrdup (file_name);
return &r->reader;
}
ssize_t n_read = u8_istream_read (r->istream, buf, n);
if (n_read < 0)
{
- msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
+ msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
return 0;
}
return n_read;
if (u8_istream_fileno (r->istream) != STDIN_FILENO)
{
if (u8_istream_close (r->istream) != 0)
- msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
+ msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
}
else
u8_istream_free (r->istream);
- free (r->file_name);
free (r);
}
static struct lex_reader_class lex_string_reader_class;
/* Creates and returns a new lex_reader for the contents of S, which must be
- encoded in UTF-8. The new reader takes ownership of S and will free it
+ encoded in the given ENCODING. The new reader takes ownership of S and will free it
with ss_dealloc() when it is closed. */
struct lex_reader *
-lex_reader_for_substring_nocopy (struct substring s)
+lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
{
struct lex_string_reader *r;
r = xmalloc (sizeof *r);
lex_reader_init (&r->reader, &lex_string_reader_class);
- r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
+ r->reader.syntax = LEX_SYNTAX_AUTO;
+ r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
r->s = s;
r->offset = 0;
}
/* Creates and returns a new lex_reader for a copy of null-terminated string S,
- which must be encoded in UTF-8. The caller retains ownership of S. */
+ which must be encoded in ENCODING. The caller retains ownership of S. */
struct lex_reader *
-lex_reader_for_string (const char *s)
+lex_reader_for_string (const char *s, const char *encoding)
{
struct substring ss;
ss_alloc_substring (&ss, ss_cstr (s));
- return lex_reader_for_substring_nocopy (ss);
+ return lex_reader_for_substring_nocopy (ss, encoding);
}
/* Formats FORMAT as a printf()-like format string and creates and returns a
new lex_reader for the formatted result. */
struct lex_reader *
-lex_reader_for_format (const char *format, ...)
+lex_reader_for_format (const char *format, const char *encoding, ...)
{
struct lex_reader *r;
va_list args;
- va_start (args, format);
- r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)));
+ va_start (args, encoding);
+ r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
va_end (args);
return r;