X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=src%2Flanguage%2Fdata-io%2Fdata-parser.c;h=3f19fb6262e97975ec64e4b76da859cb6d90a6b4;hb=bc5c6c1953ada1737620e27e6a968392a38d8c8f;hp=d223b05304ae06e18bad367b00348ac53401b115;hpb=5cab4cf3322f29c0ed7134d23740e07382914f20;p=pspp diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index d223b05304..3f19fb6262 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -29,8 +29,10 @@ #include "data/file-handle-def.h" #include "data/settings.h" #include "language/data-io/data-reader.h" +#include "libpspp/intern.h" #include "libpspp/message.h" #include "libpspp/str.h" +#include "libpspp/string-array.h" #include "output/pivot-table.h" #include "gl/xalloc.h" @@ -42,12 +44,11 @@ /* Data parser for textual data like that read by DATA LIST. */ struct data_parser { - const struct dictionary *dict; /*Dictionary of destination */ enum data_parser_type type; /* Type of data to parse. */ int skip_records; /* Records to skip before first real data. */ struct field *fields; /* Fields to parse. */ - size_t field_cnt; /* Number of fields. */ + size_t n_fields; /* Number of fields. */ size_t field_allocated; /* Number of fields spaced allocated for. */ /* DP_DELIMITED parsers only. */ @@ -80,7 +81,7 @@ static void set_any_sep (struct data_parser *parser); /* Creates and returns a new data parser. */ struct data_parser * -data_parser_create (const struct dictionary *dict) +data_parser_create (void) { struct data_parser *parser = xmalloc (sizeof *parser); @@ -88,9 +89,8 @@ data_parser_create (const struct dictionary *dict) parser->skip_records = 0; parser->fields = NULL; - parser->field_cnt = 0; + parser->n_fields = 0; parser->field_allocated = 0; - parser->dict = dict; parser->span = true; parser->empty_line_has_field = false; @@ -115,7 +115,7 @@ data_parser_destroy (struct data_parser *parser) { size_t i; - for (i = 0; i < parser->field_cnt; i++) + for (i = 0; i < parser->n_fields; i++) free (parser->fields[i].name); free (parser->fields); ss_dealloc (&parser->quotes); @@ -138,7 +138,7 @@ data_parser_get_type (const struct data_parser *parser) void data_parser_set_type (struct data_parser *parser, enum data_parser_type type) { - assert (parser->field_cnt == 0); + assert (parser->n_fields == 0); assert (type == DP_FIXED || type == DP_DELIMITED); parser->type = type; } @@ -289,9 +289,9 @@ add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx, { struct field *field; - if (p->field_cnt == p->field_allocated) + if (p->n_fields == p->field_allocated) p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields); - field = &p->fields[p->field_cnt++]; + field = &p->fields[p->n_fields++]; field->format = *format; field->case_idx = case_idx; field->name = xstrdup (name); @@ -333,8 +333,8 @@ data_parser_add_fixed_field (struct data_parser *parser, int record, int first_column) { assert (parser->type == DP_FIXED); - assert (parser->field_cnt == 0 - || record >= parser->fields[parser->field_cnt - 1].record); + assert (parser->n_fields == 0 + || record >= parser->fields[parser->n_fields - 1].record); if (record > parser->records_per_case) parser->records_per_case = record; add_field (parser, format, case_idx, name, record, first_column); @@ -345,7 +345,7 @@ data_parser_add_fixed_field (struct data_parser *parser, bool data_parser_any_fields (const struct data_parser *parser) { - return parser->field_cnt > 0; + return parser->n_fields > 0; } static void @@ -356,19 +356,21 @@ set_any_sep (struct data_parser *parser) } static bool parse_delimited_span (const struct data_parser *, - struct dfm_reader *, struct ccase *); + struct dfm_reader *, + struct dictionary *, struct ccase *); static bool parse_delimited_no_span (const struct data_parser *, - struct dfm_reader *, struct ccase *); -static bool parse_fixed (const struct data_parser *, - struct dfm_reader *, struct ccase *); + struct dfm_reader *, + struct dictionary *, struct ccase *); +static bool parse_fixed (const struct data_parser *, struct dfm_reader *, + struct dictionary *, struct ccase *); -/* Reads a case from DFM into C, parsing it with PARSER. Returns - true if successful, false at end of file or on I/O error. +/* Reads a case from DFM into C, which matches dictionary DICT, parsing it with + PARSER. Returns true if successful, false at end of file or on I/O error. Case C must not be shared. */ bool data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, - struct ccase *c) + struct dictionary *dict, struct ccase *c) { bool retval; @@ -388,16 +390,64 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, if (parser->type == DP_DELIMITED) { if (parser->span) - retval = parse_delimited_span (parser, reader, c); + retval = parse_delimited_span (parser, reader, dict, c); else - retval = parse_delimited_no_span (parser, reader, c); + retval = parse_delimited_no_span (parser, reader, dict, c); } else - retval = parse_fixed (parser, reader, c); + retval = parse_fixed (parser, reader, dict, c); return retval; } +static void +cut_field__ (const struct data_parser *parser, const struct substring *line, + struct substring *p, size_t *n_columns, + struct string *tmp, struct substring *field) +{ + bool quoted = ss_find_byte (parser->quotes, ss_first (*p)) != SIZE_MAX; + if (quoted) + { + /* Quoted field. */ + int quote = ss_get_byte (p); + if (!ss_get_until (p, quote, field)) + msg (DW, _("Quoted string extends beyond end of line.")); + if (parser->quote_escape && ss_first (*p) == quote) + { + ds_assign_substring (tmp, *field); + while (ss_match_byte (p, quote)) + { + struct substring ss; + ds_put_byte (tmp, quote); + if (!ss_get_until (p, quote, &ss)) + msg (DW, _("Quoted string extends beyond end of line.")); + ds_put_substring (tmp, ss); + } + *field = ds_ss (tmp); + } + *n_columns = ss_length (*line) - ss_length (*p); + } + else + { + /* Regular field. */ + ss_get_bytes (p, ss_cspan (*p, ds_ss (&parser->any_sep)), field); + *n_columns = ss_length (*field); + } + + /* Skip trailing soft separator and a single hard separator if present. */ + size_t length_before_separators = ss_length (*p); + ss_ltrim (p, parser->soft_seps); + if (!ss_is_empty (*p) + && ss_find_byte (parser->hard_seps, ss_first (*p)) != SIZE_MAX) + { + ss_advance (p, 1); + ss_ltrim (p, parser->soft_seps); + } + + if (!ss_is_empty (*p) && quoted && length_before_separators == ss_length (*p)) + msg (DW, _("Missing delimiter following quoted string.")); +} + /* Extracts a delimited field from the current position in the current record according to PARSER, reading data from READER. @@ -414,9 +464,7 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, int *first_column, int *last_column, struct string *tmp, struct substring *field) { - size_t length_before_separators; struct substring line, p; - bool quoted; if (dfm_eof (reader)) return false; @@ -442,49 +490,13 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, } } + size_t n_columns; + cut_field__ (parser, &line, &p, &n_columns, tmp, field); *first_column = dfm_column_start (reader); - quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX; - if (quoted) - { - /* Quoted field. */ - int quote = ss_get_byte (&p); - if (!ss_get_until (&p, quote, field)) - msg (DW, _("Quoted string extends beyond end of line.")); - if (parser->quote_escape && ss_first (p) == quote) - { - ds_assign_substring (tmp, *field); - while (ss_match_byte (&p, quote)) - { - struct substring ss; - ds_put_byte (tmp, quote); - if (!ss_get_until (&p, quote, &ss)) - msg (DW, _("Quoted string extends beyond end of line.")); - ds_put_substring (tmp, ss); - } - *field = ds_ss (tmp); - } - *last_column = *first_column + (ss_length (line) - ss_length (p)); - } - else - { - /* Regular field. */ - ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); - *last_column = *first_column + ss_length (*field); - } + *last_column = *first_column + n_columns; - /* Skip trailing soft separator and a single hard separator if present. */ - length_before_separators = ss_length (p); - ss_ltrim (&p, parser->soft_seps); - if (!ss_is_empty (p) - && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) - { - ss_advance (&p, 1); - ss_ltrim (&p, parser->soft_seps); - } if (ss_is_empty (p)) dfm_forward_columns (reader, 1); - else if (quoted && length_before_separators == ss_length (p)) - msg (DW, _("Missing delimiter following quoted string.")); dfm_forward_columns (reader, ss_length (line) - ss_length (p)); return true; @@ -494,31 +506,35 @@ static void parse_error (const struct dfm_reader *reader, const struct field *field, int first_column, int last_column, char *error) { - struct msg m = { + int line_number = dfm_get_line_number (reader); + struct msg_location *location = xmalloc (sizeof *location); + *location = (struct msg_location) { + .file_name = intern_new (dfm_get_file_name (reader)), + .start = { .line = line_number, .column = first_column }, + .end = { .line = line_number, .column = last_column - 1 }, + }; + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { .category = MSG_C_DATA, .severity = MSG_S_WARNING, - .file_name = CONST_CAST (char *, dfm_get_file_name (reader)), - .first_line = dfm_get_line_number (reader), - .last_line = m.first_line + 1, - .first_column = first_column, - .last_column = last_column, + .location = location, .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"), field->name, fmt_name (field->format.type), error), }; - msg_emit (&m); + msg_emit (m); free (error); } -/* Reads a case from READER into C, parsing it according to - fixed-format syntax rules in PARSER. - Returns true if successful, false at end of file or on I/O error. */ +/* Reads a case from READER into C, which matches DICT, parsing it according to + fixed-format syntax rules in PARSER. Returns true if successful, false at + end of file or on I/O error. */ static bool parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, - struct ccase *c) + struct dictionary *dict, struct ccase *c) { const char *input_encoding = dfm_reader_get_encoding (reader); - const char *output_encoding = dict_get_encoding (parser->dict); + const char *output_encoding = dict_get_encoding (dict); struct field *f; int row; @@ -539,18 +555,20 @@ parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, dfm_expand_tabs (reader); line = dfm_get_record (reader); - for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++) + for (; f < &parser->fields[parser->n_fields] && f->record == row; f++) { struct substring s = ss_substr (line, f->first_column - 1, f->format.w); union value *value = case_data_rw_idx (c, f->case_idx); char *error = data_in (s, input_encoding, f->format.type, + settings_get_fmt_settings (), value, fmt_var_width (&f->format), output_encoding); if (error == NULL) data_in_imply_decimals (s, input_encoding, f->format.type, - f->format.d, value); + f->format.d, settings_get_fmt_settings (), + value); else parse_error (reader, f, f->first_column, f->first_column + f->format.w, error); @@ -562,18 +580,53 @@ parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, return true; } -/* Reads a case from READER into C, parsing it according to - free-format syntax rules in PARSER. - Returns true if successful, false at end of file or on I/O error. */ +/* Splits the data line in LINE into individual text fields and returns the + number of fields. If SA is nonnull, appends each field to SA; the caller + retains ownership of SA and its contents. */ +size_t +data_parser_split (const struct data_parser *parser, + struct substring line, struct string_array *sa) +{ + size_t n = 0; + + struct string tmp = DS_EMPTY_INITIALIZER; + for (;;) + { + struct substring p = line; + ss_ltrim (&p, parser->soft_seps); + if (ss_is_empty (p)) + { + ds_destroy (&tmp); + return n; + } + + size_t n_columns; + struct substring field; + + msg_disable (); + cut_field__ (parser, &line, &p, &n_columns, &tmp, &field); + msg_enable (); + + if (sa) + string_array_append_nocopy (sa, ss_xstrdup (field)); + n++; + line = p; + } +} + +/* Reads a case from READER into C, which matches dictionary DICT, parsing it + according to free-format syntax rules in PARSER. Returns true if + successful, false at end of file or on I/O error. */ static bool parse_delimited_span (const struct data_parser *parser, - struct dfm_reader *reader, struct ccase *c) + struct dfm_reader *reader, + struct dictionary *dict, struct ccase *c) { - const char *output_encoding = dict_get_encoding (parser->dict); + const char *output_encoding = dict_get_encoding (dict); struct string tmp = DS_EMPTY_INITIALIZER; struct field *f; - for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++) + for (f = parser->fields; f < &parser->fields[parser->n_fields]; f++) { struct substring s; int first_column, last_column; @@ -597,6 +650,7 @@ parse_delimited_span (const struct data_parser *parser, const char *input_encoding = dfm_reader_get_encoding (reader); error = data_in (s, input_encoding, f->format.type, + settings_get_fmt_settings (), case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format), output_encoding); if (error != NULL) @@ -606,14 +660,15 @@ parse_delimited_span (const struct data_parser *parser, return true; } -/* Reads a case from READER into C, parsing it according to - delimited syntax rules with one case per record in PARSER. +/* Reads a case from READER into C, which matches dictionary DICT, parsing it + according to delimited syntax rules with one case per record in PARSER. Returns true if successful, false at end of file or on I/O error. */ static bool parse_delimited_no_span (const struct data_parser *parser, - struct dfm_reader *reader, struct ccase *c) + struct dfm_reader *reader, + struct dictionary *dict, struct ccase *c) { - const char *output_encoding = dict_get_encoding (parser->dict); + const char *output_encoding = dict_get_encoding (dict); struct string tmp = DS_EMPTY_INITIALIZER; struct substring s; struct field *f, *end; @@ -621,7 +676,7 @@ parse_delimited_no_span (const struct data_parser *parser, if (dfm_eof (reader)) return false; - end = &parser->fields[parser->field_cnt]; + end = &parser->fields[parser->n_fields]; for (f = parser->fields; f < end; f++) { int first_column, last_column; @@ -642,6 +697,7 @@ parse_delimited_no_span (const struct data_parser *parser, const char *input_encoding = dfm_reader_get_encoding (reader); error = data_in (s, input_encoding, f->format.type, + settings_get_fmt_settings (), case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format), output_encoding); if (error != NULL) @@ -671,7 +727,7 @@ dump_fixed_table (const struct data_parser *parser, parser->records_per_case), parser->records_per_case, fh_get_name (fh)); struct pivot_table *table = pivot_table_create__ ( - pivot_value_new_user_text (title, -1)); + pivot_value_new_user_text (title, -1), "Fixed Data Records"); free (title); pivot_dimension_create ( @@ -681,7 +737,7 @@ dump_fixed_table (const struct data_parser *parser, struct pivot_dimension *variables = pivot_dimension_create ( table, PIVOT_AXIS_ROW, N_("Variable")); variables->root->show_label = true; - for (size_t i = 0; i < parser->field_cnt; i++) + for (size_t i = 0; i < parser->n_fields; i++) { struct field *f = &parser->fields[i]; @@ -694,7 +750,7 @@ dump_fixed_table (const struct data_parser *parser, int first_column = f->first_column; int last_column = f->first_column + f->format.w - 1; - char *columns = xasprintf ("%3d-%3d", first_column, last_column); + char *columns = xasprintf ("%d-%d", first_column, last_column); pivot_table_put2 (table, 1, variable_idx, pivot_value_new_user_text (columns, -1)); free (columns); @@ -717,7 +773,8 @@ dump_delimited_table (const struct data_parser *parser, { struct pivot_table *table = pivot_table_create__ ( pivot_value_new_text_format (N_("Reading free-form data from %s."), - fh_get_name (fh))); + fh_get_name (fh)), + "Free-Form Data Records"); pivot_dimension_create ( table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format")); @@ -725,7 +782,7 @@ dump_delimited_table (const struct data_parser *parser, struct pivot_dimension *variables = pivot_dimension_create ( table, PIVOT_AXIS_ROW, N_("Variable")); variables->root->show_label = true; - for (size_t i = 0; i < parser->field_cnt; i++) + for (size_t i = 0; i < parser->n_fields; i++) { struct field *f = &parser->fields[i]; @@ -758,6 +815,7 @@ data_parser_output_description (struct data_parser *parser, struct data_parser_casereader { struct data_parser *parser; /* Parser. */ + struct dictionary *dict; /* Dictionary. */ struct dfm_reader *reader; /* Data file reader. */ struct caseproto *proto; /* Format of cases. */ }; @@ -772,7 +830,7 @@ static const struct casereader_class data_parser_casereader_class; void data_parser_make_active_file (struct data_parser *parser, struct dataset *ds, struct dfm_reader *reader, - struct dictionary *dict, + struct dictionary *dict, struct casereader* (*func)(struct casereader *, const struct dictionary *, void *), @@ -784,6 +842,7 @@ data_parser_make_active_file (struct data_parser *parser, struct dataset *ds, r = xmalloc (sizeof *r); r->parser = parser; + r->dict = dict_ref (dict); r->reader = reader; r->proto = caseproto_ref (dict_get_proto (dict)); casereader0 = casereader_create_sequential (NULL, r->proto, @@ -805,7 +864,7 @@ data_parser_casereader_read (struct casereader *reader UNUSED, void *r_) { struct data_parser_casereader *r = r_; struct ccase *c = case_create (r->proto); - if (data_parser_parse (r->parser, r->reader, c)) + if (data_parser_parse (r->parser, r->reader, r->dict, c)) return c; else { @@ -815,14 +874,15 @@ data_parser_casereader_read (struct casereader *reader UNUSED, void *r_) } static void -data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_) +data_parser_casereader_destroy (struct casereader *reader, void *r_) { struct data_parser_casereader *r = r_; if (dfm_reader_error (r->reader)) casereader_force_error (reader); - data_parser_destroy (r->parser); dfm_close_reader (r->reader); caseproto_unref (r->proto); + dict_unref (r->dict); + data_parser_destroy (r->parser); free (r); }