/* PSPP - a program for statistical analysis.
- Copyright (C) 2007 Free Software Foundation, Inc.
+ Copyright (C) 2007, 2009, 2010 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <language/data-io/data-reader.h>
#include <libpspp/message.h>
#include <libpspp/str.h>
-#include <output/table.h>
+#include <output/tab.h>
#include "xalloc.h"
/* Data parser for textual data like that read by DATA LIST. */
struct data_parser
{
+ const struct dictionary *dict; /* Dictionary of destination. */
enum data_parser_type type; /* Type of data to parse. */
int skip_records; /* Records to skip before first real data. */
casenumber max_cases; /* Max number of cases to read. */
bool span; /* May cases span multiple records? */
bool empty_line_has_field; /* Does an empty line have an (empty) field? */
struct substring quotes; /* Characters that can quote separators. */
+ bool quote_escape; /* Doubled quote acts as escape? */
struct substring soft_seps; /* Two soft separators act like just one. */
struct substring hard_seps; /* Two hard separators yield empty fields. */
struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
/* Creates and returns a new data parser. */
struct data_parser *
-data_parser_create (void)
+data_parser_create (const struct dictionary *dict)
{
struct data_parser *parser = xmalloc (sizeof *parser);
parser->fields = NULL;
parser->field_cnt = 0;
parser->field_allocated = 0;
+ parser->dict = dict;
parser->span = true;
parser->empty_line_has_field = false;
ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
+ parser->quote_escape = false;
ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
ds_init_empty (&parser->any_sep);
ss_alloc_substring (&parser->quotes, quotes);
}
+/* Sets whether PARSER treats a doubled quote character inside a
+ quoted field as one literal quote character.
+
+ If ESCAPE is false (the default setting), a character used for
+ quoting cannot itself be embedded within a quoted field. If
+ ESCAPE is true, then a quote character can be embedded within
+ a quoted field by doubling it.
+
+ This setting affects parsing of DP_DELIMITED files only, and
+ only when at least one quote character has been set (with
+ data_parser_set_quotes). */
+void
+data_parser_set_quote_escape (struct data_parser *parser, bool escape)
+{
+ parser->quote_escape = escape;
+}
+
/* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
separate fields, but consecutive soft delimiters do not yield
empty fields. (Ordinarily, only white space characters are
static bool parse_fixed (const struct data_parser *,
struct dfm_reader *, struct ccase *);
-/* Reads a case from DFM into C, parsing it with PARSER.
- Returns true if successful, false at end of file or on I/O error. */
+/* Reads a case from DFM into C, parsing it with PARSER. Returns
+ true if successful, false at end of file or on I/O error.
+
+ Case C must not be shared. */
bool
data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
struct ccase *c)
{
bool retval;
+ assert (!case_is_shared (c));
assert (data_parser_any_fields (parser));
/* Skip the requested number of records before reading the
&& dfm_get_percent_read (reader) >= parser->percent_cases)
return false;
- dfm_push (reader);
if (parser->type == DP_DELIMITED)
{
if (parser->span)
}
else
retval = parse_fixed (parser, reader, c);
- dfm_pop (reader);
return retval;
}
beginning of the field on success. */
static bool
cut_field (const struct data_parser *parser, struct dfm_reader *reader,
+ int *first_column, int *last_column, struct string *tmp,
struct substring *field)
{
struct substring line, p;
else
{
*field = p;
+ *first_column = dfm_column_start (reader);
+ *last_column = *first_column + 1;
dfm_forward_columns (reader, 1);
return true;
}
}
+ *first_column = dfm_column_start (reader);
if (ss_find_char (parser->quotes, ss_first (p)) != SIZE_MAX)
{
/* Quoted field. */
- if (!ss_get_until (&p, ss_get_char (&p), field))
+ int quote = ss_get_char (&p);
+ if (!ss_get_until (&p, quote, field))
msg (SW, _("Quoted string extends beyond end of line."));
+ if (parser->quote_escape && ss_first (p) == quote)
+ {
+ ds_assign_substring (tmp, *field);
+ while (ss_match_char (&p, quote))
+ {
+ struct substring ss;
+ ds_put_char (tmp, quote);
+ if (!ss_get_until (&p, quote, &ss))
+ msg (SW, _("Quoted string extends beyond end of line."));
+ ds_put_substring (tmp, ss);
+ }
+ *field = ds_ss (tmp);
+ }
+ *last_column = *first_column + (ss_length (line) - ss_length (p));
/* Skip trailing soft separator and a single hard separator
if present. */
{
/* Regular field. */
ss_get_chars (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
- if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p))
+ *last_column = *first_column + ss_length (*field);
+
+ if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p)
+ || ss_find_char (parser->hard_seps, p.string[0]) != SIZE_MAX)
{
/* Advance past a trailing hard separator,
regardless of whether one actually existed. If
return true;
}
+static void
+parse_error (const struct dfm_reader *reader, const struct field *field,
+ int first_column, int last_column, char *error)
+{ /* Reports that the data for FIELD could not be parsed. Emits a
+ message of category MSG_C_DATA and severity MSG_S_WARNING whose
+ location is READER's file name and line number plus columns
+ FIRST_COLUMN through LAST_COLUMN, and whose text names FIELD, its
+ format type, and the explanation in ERROR.
+
+ This function frees ERROR, so ERROR must be a string that may be
+ passed to free() and the caller must not use it afterward. */
+ struct msg m;
+
+ m.category = MSG_C_DATA;
+ m.severity = MSG_S_WARNING;
+ m.where.file_name = CONST_CAST (char *, dfm_get_file_name (reader));
+ m.where.line_number = dfm_get_line_number (reader);
+ m.where.first_column = first_column;
+ m.where.last_column = last_column;
+ m.text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
+ field->name, fmt_name (field->format.type), error);
+ msg_emit (&m); /* m.text is not freed in this function; presumably
+ msg_emit takes ownership of it -- TODO confirm. */
+
+ free (error);
+}
+
/* Reads a case from READER into C, parsing it according to
fixed-format syntax rules in PARSER.
Returns true if successful, false at end of file or on I/O error. */
parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
struct ccase *c)
{
- enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader);
+ const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+ const char *output_encoding = dict_get_encoding (parser->dict);
struct field *f;
int row;
line = dfm_get_record (reader);
for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
- data_in (ss_substr (line, f->first_column - 1,
- f->format.w),
- encoding, f->format.type, f->format.d,
- f->first_column, case_data_rw_idx (c, f->case_idx),
- fmt_var_width (&f->format));
+ {
+ struct substring s = ss_substr (line, f->first_column - 1,
+ f->format.w);
+ union value *value = case_data_rw_idx (c, f->case_idx);
+ char *error = data_in (s, input_encoding, f->format.type,
+ value, fmt_var_width (&f->format),
+ output_encoding);
+
+ if (error == NULL)
+ data_in_imply_decimals (s, input_encoding, f->format.type,
+ f->format.d, value);
+ else
+ parse_error (reader, f, f->first_column,
+ f->first_column + f->format.w, error);
+ }
dfm_forward_record (reader);
}
parse_delimited_span (const struct data_parser *parser,
struct dfm_reader *reader, struct ccase *c)
{
- enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader);
+ const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+ const char *output_encoding = dict_get_encoding (parser->dict);
+ struct string tmp = DS_EMPTY_INITIALIZER;
struct field *f;
for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
{
struct substring s;
+ int first_column, last_column;
+ char *error;
/* Cut out a field and read in a new record if necessary. */
- while (!cut_field (parser, reader, &s))
+ while (!cut_field (parser, reader,
+ &first_column, &last_column, &tmp, &s))
{
if (!dfm_eof (reader))
dfm_forward_record (reader);
if (f > parser->fields)
msg (SW, _("Partial case discarded. The first variable "
"missing was %s."), f->name);
+ ds_destroy (&tmp);
return false;
}
}
- data_in (s, encoding, f->format.type, 0,
- dfm_get_column (reader, ss_data (s)),
- case_data_rw_idx (c, f->case_idx),
- fmt_var_width (&f->format));
+ error = data_in (s, input_encoding, f->format.type,
+ case_data_rw_idx (c, f->case_idx),
+ fmt_var_width (&f->format), output_encoding);
+ if (error != NULL)
+ parse_error (reader, f, first_column, last_column, error);
}
+ ds_destroy (&tmp);
return true;
}
parse_delimited_no_span (const struct data_parser *parser,
struct dfm_reader *reader, struct ccase *c)
{
- enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader);
+ const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+ const char *output_encoding = dict_get_encoding (parser->dict);
+ struct string tmp = DS_EMPTY_INITIALIZER;
struct substring s;
- struct field *f;
+ struct field *f, *end;
if (dfm_eof (reader))
return false;
- for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
+ end = &parser->fields[parser->field_cnt];
+ for (f = parser->fields; f < end; f++)
{
- if (!cut_field (parser, reader, &s))
+ int first_column, last_column;
+ char *error;
+
+ if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
{
- if (get_undefined ())
+ if (f < end - 1 && settings_get_undefined ())
msg (SW, _("Missing value(s) for all variables from %s onward. "
"These will be filled with the system-missing value "
"or blanks, as appropriate."),
f->name);
- for (; f < &parser->fields[parser->field_cnt]; f++)
- {
- int width = fmt_var_width (&f->format);
- if (width == 0)
- case_data_rw_idx (c, f->case_idx)->f = SYSMIS;
- else
- memset (case_data_rw_idx (c, f->case_idx)->s, ' ', width);
- }
+ for (; f < end; f++)
+ value_set_missing (case_data_rw_idx (c, f->case_idx),
+ fmt_var_width (&f->format));
goto exit;
}
- data_in (s, encoding, f->format.type, 0,
- dfm_get_column (reader, ss_data (s)),
- case_data_rw_idx (c, f->case_idx),
- fmt_var_width (&f->format));
+ error = data_in (s, input_encoding, f->format.type,
+ case_data_rw_idx (c, f->case_idx),
+ fmt_var_width (&f->format), output_encoding);
+ if (error != NULL)
+ parse_error (reader, f, first_column, last_column, error);
}
s = dfm_get_record (reader);
exit:
dfm_forward_record (reader);
+ ds_destroy (&tmp);
return true;
}
\f
struct tab_table *t;
size_t i;
- t = tab_create (4, parser->field_cnt + 1, 0);
- tab_columns (t, TAB_COL_DOWN, 1);
+ t = tab_create (4, parser->field_cnt + 1);
tab_headers (t, 0, 0, 1, 0);
tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record"));
tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format"));
tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt);
tab_hline (t, TAL_2, 0, 3, 1);
- tab_dim (t, tab_natural_dimensions);
for (i = 0; i < parser->field_cnt; i++)
{
int row = i + 1;
tab_text (t, 0, row, TAB_LEFT, f->name);
- tab_text (t, 1, row, TAT_PRINTF, "%d", f->record);
- tab_text (t, 2, row, TAT_PRINTF, "%3d-%3d",
- f->first_column, f->first_column + f->format.w - 1);
+ tab_text_format (t, 1, row, 0, "%d", f->record);
+ tab_text_format (t, 2, row, 0, "%3d-%3d",
+ f->first_column, f->first_column + f->format.w - 1);
tab_text (t, 3, row, TAB_LEFT | TAB_FIX,
fmt_to_string (&f->format, fmt_string));
}
struct tab_table *t;
size_t i;
- t = tab_create (2, parser->field_cnt + 1, 0);
- tab_columns (t, TAB_COL_DOWN, 1);
+ t = tab_create (2, parser->field_cnt + 1);
tab_headers (t, 0, 0, 1, 0);
tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format"));
tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt);
tab_hline (t, TAL_2, 0, 1, 1);
- tab_dim (t, tab_natural_dimensions);
for (i = 0; i < parser->field_cnt; i++)
{
{
struct data_parser *parser; /* Parser. */
struct dfm_reader *reader; /* Data file reader. */
- size_t value_cnt; /* Number of `union value's in case. */
+ struct caseproto *proto; /* Format of cases. */
};
static const struct casereader_class data_parser_casereader_class;
r = xmalloc (sizeof *r);
r->parser = parser;
r->reader = reader;
- r->value_cnt = dict_get_next_value_idx (dict);
- casereader = casereader_create_sequential (NULL, r->value_cnt,
- -1, &data_parser_casereader_class,
- r);
+ r->proto = caseproto_ref (dict_get_proto (dict));
+ casereader = casereader_create_sequential (NULL, r->proto,
+ CASENUMBER_MAX,
+ &data_parser_casereader_class, r);
proc_set_active_file (ds, casereader, dict);
}
-static bool
-data_parser_casereader_read (struct casereader *reader UNUSED, void *r_,
- struct ccase *c)
+static struct ccase *
+data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
{
struct data_parser_casereader *r = r_;
- bool ok;
-
- case_create (c, r->value_cnt);
- ok = data_parser_parse (r->parser, r->reader, c);
- if (!ok)
- case_destroy (c);
- return ok;
+ struct ccase *c = case_create (r->proto); /* Reads one case: R_ is the
+ struct data_parser_casereader; a case is created from r->proto,
+ data_parser_parse fills it from r->reader using r->parser, and the
+ case is returned. Returns NULL if data_parser_parse returns false.
+ READER itself is not used. */
+ if (data_parser_parse (r->parser, r->reader, c))
+ return c; /* Parsed a case: hand it to the caller. */
+ else
+ {
+ case_unref (c); /* Nothing was read: release the case created above. */
+ return NULL;
+ }
}
static void
casereader_force_error (reader);
data_parser_destroy (r->parser);
dfm_close_reader (r->reader);
+ caseproto_unref (r->proto);
free (r);
}