pintos-os.org Git - pspp/blob - src/language/data-io/data-parser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/data-io/data-parser.h"
  20
  21 #include <stdint.h>
  22 #include <stdlib.h>
  23
  24 #include "data/casereader-provider.h"
  25 #include "data/data-in.h"
  26 #include "data/dataset.h"
  27 #include "data/dictionary.h"
  28 #include "data/format.h"
  29 #include "data/file-handle-def.h"
  30 #include "data/settings.h"
  31 #include "language/data-io/data-reader.h"
  32 #include "libpspp/intern.h"
  33 #include "libpspp/message.h"
  34 #include "libpspp/str.h"
  35 #include "libpspp/string-array.h"
  36 #include "output/pivot-table.h"
  37
  38 #include "gl/xalloc.h"
  39
  40 #include "gettext.h"
  41 #define N_(msgid) msgid
  42 #define _(msgid) gettext (msgid)
  43
  44 /* Data parser for textual data like that read by DATA LIST. */
  45 struct data_parser
  46   {
  47     enum data_parser_type type; /* Type of data to parse. */
  48     int skip_records;           /* Records to skip before first real data. */
  49
  50     struct field *fields;       /* Fields to parse. */
  51     size_t n_fields;            /* Number of fields. */
  52     size_t field_allocated;     /* Number of fields spaced allocated for. */
  53
  54     /* DP_DELIMITED parsers only. */
  55     bool span;                  /* May cases span multiple records? */
  56     bool empty_line_has_field;  /* Does an empty line have an (empty) field? */
  57     bool warn_missing_fields;   /* Should missing fields be considered errors? */
  58     struct substring quotes;    /* Characters that can quote separators. */
  59     bool quote_escape;          /* Doubled quote acts as escape? */
  60     struct substring soft_seps; /* Two soft separators act like just one. */
  61     struct substring hard_seps; /* Two hard separators yield empty fields. */
  62     struct string any_sep;      /* Concatenation of soft_seps and hard_seps. */
  63
  64     /* DP_FIXED parsers only. */
  65     int records_per_case;       /* Number of records in each case. */
  66   };
  67
  68 /* How to parse one variable. */
  69 struct field
  70   {
  71     struct fmt_spec format;     /* Input format of this field. */
  72     int case_idx;               /* First value in case. */
  73     char *name;                 /* Var name for error messages and tables. */
  74
  75     /* DP_FIXED only. */
  76     int record;                 /* Record number (1-based). */
  77     int first_column;           /* First column in record (1-based). */
  78   };
  79
  80 static void set_any_sep (struct data_parser *parser);
  81
  82 /* Creates and returns a new data parser. */
  83 struct data_parser *
  84 data_parser_create (void)
  85 {
  86   struct data_parser *parser = xmalloc (sizeof *parser);
  87
  88   parser->type = DP_FIXED;
  89   parser->skip_records = 0;
  90
  91   parser->fields = NULL;
  92   parser->n_fields = 0;
  93   parser->field_allocated = 0;
  94
  95   parser->span = true;
  96   parser->empty_line_has_field = false;
  97   parser->warn_missing_fields = true;
  98   ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
  99   parser->quote_escape = false;
 100   ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
 101   ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
 102   ds_init_empty (&parser->any_sep);
 103   set_any_sep (parser);
 104
 105   parser->records_per_case = 0;
 106
 107   return parser;
 108 }
 109
 110 /* Destroys PARSER. */
 111 void
 112 data_parser_destroy (struct data_parser *parser)
 113 {
 114   if (parser != NULL)
 115     {
 116       size_t i;
 117
 118       for (i = 0; i < parser->n_fields; i++)
 119         free (parser->fields[i].name);
 120       free (parser->fields);
 121       ss_dealloc (&parser->quotes);
 122       ss_dealloc (&parser->soft_seps);
 123       ss_dealloc (&parser->hard_seps);
 124       ds_destroy (&parser->any_sep);
 125       free (parser);
 126     }
 127 }
 128
 129 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED). */
 130 enum data_parser_type
 131 data_parser_get_type (const struct data_parser *parser)
 132 {
 133   return parser->type;
 134 }
 135
 136 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or
 137    DP_FIXED). */
 138 void
 139 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
 140 {
 141   assert (parser->n_fields == 0);
 142   assert (type == DP_FIXED || type == DP_DELIMITED);
 143   parser->type = type;
 144 }
 145
 146 /* Configures PARSER to skip the specified number of
 147    INITIAL_RECORDS_TO_SKIP before parsing any data.  By default,
 148    no records are skipped. */
 149 void
 150 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
 151 {
 152   assert (initial_records_to_skip >= 0);
 153   parser->skip_records = initial_records_to_skip;
 154 }
 155
 156 /* Returns true if PARSER is configured to allow cases to span
 157    multiple records. */
 158 bool
 159 data_parser_get_span (const struct data_parser *parser)
 160 {
 161   return parser->span;
 162 }
 163
 164 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
 165    a single case to span multiple records and multiple cases to
 166    occupy a single record.  If MAY_CASES_SPAN_RECORDS is false,
 167    configures PARSER to require each record to contain exactly
 168    one case.
 169
 170    This setting affects parsing of DP_DELIMITED files only. */
 171 void
 172 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
 173 {
 174   parser->span = may_cases_span_records;
 175 }
 176
 177 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
 178    empty line as an empty field and to treat a hard delimiter
 179    followed by end-of-line as an empty field.  If
 180    EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
 181    and hard delimiters at the end of lines without emitting empty
 182    fields.
 183
 184    This setting affects parsing of DP_DELIMITED files only. */
 185 void
 186 data_parser_set_empty_line_has_field (struct data_parser *parser,
 187                                       bool empty_line_has_field)
 188 {
 189   parser->empty_line_has_field = empty_line_has_field;
 190 }
 191
 192
 193 /* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning
 194    and cause an error condition when a missing field is encountered.
 195    If  WARN_MISSING_FIELDS is false, PARSER will silently fill such
 196    fields with the system missing value.
 197
 198    This setting affects parsing of DP_DELIMITED files only. */
 199 void
 200 data_parser_set_warn_missing_fields (struct data_parser *parser,
 201                                      bool warn_missing_fields)
 202 {
 203   parser->warn_missing_fields = warn_missing_fields;
 204 }
 205
 206
 207 /* Sets the characters that may be used for quoting field
 208    contents to QUOTES.  If QUOTES is empty, quoting will be
 209    disabled.
 210
 211    The caller retains ownership of QUOTES.
 212
 213    This setting affects parsing of DP_DELIMITED files only. */
 214 void
 215 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
 216 {
 217   ss_dealloc (&parser->quotes);
 218   ss_alloc_substring (&parser->quotes, quotes);
 219 }
 220
 221 /* If ESCAPE is false (the default setting), a character used for
 222    quoting cannot itself be embedded within a quoted field.  If
 223    ESCAPE is true, then a quote character can be embedded within
 224    a quoted field by doubling it.
 225
 226    This setting affects parsing of DP_DELIMITED files only, and
 227    only when at least one quote character has been set (with
 228    data_parser_set_quotes). */
 229 void
 230 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
 231 {
 232   parser->quote_escape = escape;
 233 }
 234
 235 /* Sets PARSER's soft delimiters to DELIMITERS.  Soft delimiters
 236    separate fields, but consecutive soft delimiters do not yield
 237    empty fields.  (Ordinarily, only white space characters are
 238    appropriate soft delimiters.)
 239
 240    The caller retains ownership of DELIMITERS.
 241
 242    This setting affects parsing of DP_DELIMITED files only. */
 243 void
 244 data_parser_set_soft_delimiters (struct data_parser *parser,
 245                                  struct substring delimiters)
 246 {
 247   ss_dealloc (&parser->soft_seps);
 248   ss_alloc_substring (&parser->soft_seps, delimiters);
 249   set_any_sep (parser);
 250 }
 251
 252 /* Sets PARSER's hard delimiters to DELIMITERS.  Hard delimiters
 253    separate fields.  A consecutive pair of hard delimiters yield
 254    an empty field.
 255
 256    The caller retains ownership of DELIMITERS.
 257
 258    This setting affects parsing of DP_DELIMITED files only. */
 259 void
 260 data_parser_set_hard_delimiters (struct data_parser *parser,
 261                                  struct substring delimiters)
 262 {
 263   ss_dealloc (&parser->hard_seps);
 264   ss_alloc_substring (&parser->hard_seps, delimiters);
 265   set_any_sep (parser);
 266 }
 267
 268 /* Returns the number of records per case. */
 269 int
 270 data_parser_get_records (const struct data_parser *parser)
 271 {
 272   return parser->records_per_case;
 273 }
 274
 275 /* Sets the number of records per case to RECORDS_PER_CASE.
 276
 277    This setting affects parsing of DP_FIXED files only. */
 278 void
 279 data_parser_set_records (struct data_parser *parser, int records_per_case)
 280 {
 281   assert (records_per_case >= 0);
 282   assert (records_per_case >= parser->records_per_case);
 283   parser->records_per_case = records_per_case;
 284 }
 285
 286 static void
 287 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
 288            const char *name, int record, int first_column)
 289 {
 290   struct field *field;
 291
 292   if (p->n_fields == p->field_allocated)
 293     p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
 294   field = &p->fields[p->n_fields++];
 295   field->format = *format;
 296   field->case_idx = case_idx;
 297   field->name = xstrdup (name);
 298   field->record = record;
 299   field->first_column = first_column;
 300 }
 301
 302 /* Adds a delimited field to the field parsed by PARSER, which
 303    must be configured as a DP_DELIMITED parser.  The field is
 304    parsed as input format FORMAT.  Its data will be stored into case
 305    index CASE_INDEX.  Errors in input data will be reported
 306    against variable NAME. */
 307 void
 308 data_parser_add_delimited_field (struct data_parser *parser,
 309                                  const struct fmt_spec *format, int case_idx,
 310                                  const char *name)
 311 {
 312   assert (parser->type == DP_DELIMITED);
 313   add_field (parser, format, case_idx, name, 0, 0);
 314 }
 315
 316 /* Adds a fixed field to the field parsed by PARSER, which
 317    must be configured as a DP_FIXED parser.  The field is
 318    parsed as input format FORMAT.  Its data will be stored into case
 319    index CASE_INDEX.  Errors in input data will be reported
 320    against variable NAME.  The field will be drawn from the
 321    FORMAT->w columns in 1-based RECORD starting at 1-based
 322    column FIRST_COLUMN.
 323
 324    RECORD must be at least as great as that of any field already
 325    added; that is, fields must be added in increasing order of
 326    record number.  If RECORD is greater than the current number
 327    of records per case, the number of records per case are
 328    increased as needed.  */
 329 void
 330 data_parser_add_fixed_field (struct data_parser *parser,
 331                              const struct fmt_spec *format, int case_idx,
 332                              const char *name,
 333                              int record, int first_column)
 334 {
 335   assert (parser->type == DP_FIXED);
 336   assert (parser->n_fields == 0
 337           || record >= parser->fields[parser->n_fields - 1].record);
 338   if (record > parser->records_per_case)
 339     parser->records_per_case = record;
 340   add_field (parser, format, case_idx, name, record, first_column);
 341 }
 342
 343 /* Returns true if any fields have been added to PARSER, false
 344    otherwise. */
 345 bool
 346 data_parser_any_fields (const struct data_parser *parser)
 347 {
 348   return parser->n_fields > 0;
 349 }
 350
 351 static void
 352 set_any_sep (struct data_parser *parser)
 353 {
 354   ds_assign_substring (&parser->any_sep, parser->soft_seps);
 355   ds_put_substring (&parser->any_sep, parser->hard_seps);
 356 }
 357 \f
 358 static bool parse_delimited_span (const struct data_parser *,
 359                                   struct dfm_reader *,
 360                                   struct dictionary *, struct ccase *);
 361 static bool parse_delimited_no_span (const struct data_parser *,
 362                                      struct dfm_reader *,
 363                                      struct dictionary *, struct ccase *);
 364 static bool parse_fixed (const struct data_parser *, struct dfm_reader *,
 365                          struct dictionary *, struct ccase *);
 366
 367 /* Reads a case from DFM into C, which matches dictionary DICT, parsing it with
 368    PARSER.  Returns true if successful, false at end of file or on I/O error.
 369
 370    Case C must not be shared. */
 371 bool
 372 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
 373                    struct dictionary *dict, struct ccase *c)
 374 {
 375   bool retval;
 376
 377   assert (!case_is_shared (c));
 378   assert (data_parser_any_fields (parser));
 379
 380   /* Skip the requested number of records before reading the
 381      first case. */
 382   for (; parser->skip_records > 0; parser->skip_records--)
 383     {
 384       if (dfm_eof (reader))
 385         return false;
 386       dfm_forward_record (reader);
 387     }
 388
 389   /* Limit cases. */
 390   if (parser->type == DP_DELIMITED)
 391     {
 392       if (parser->span)
 393         retval = parse_delimited_span (parser, reader, dict, c);
 394       else
 395         retval = parse_delimited_no_span (parser, reader, dict, c);
 396     }
 397   else
 398     retval = parse_fixed (parser, reader, dict, c);
 399
 400   return retval;
 401 }
 402
 403 static void
 404 cut_field__ (const struct data_parser *parser, const struct substring *line,
 405              struct substring *p, size_t *n_columns,
 406              struct string *tmp, struct substring *field)
 407 {
 408   bool quoted = ss_find_byte (parser->quotes, ss_first (*p)) != SIZE_MAX;
 409   if (quoted)
 410     {
 411       /* Quoted field. */
 412       int quote = ss_get_byte (p);
 413       if (!ss_get_until (p, quote, field))
 414         msg (DW, _("Quoted string extends beyond end of line."));
 415       if (parser->quote_escape && ss_first (*p) == quote)
 416         {
 417           ds_assign_substring (tmp, *field);
 418           while (ss_match_byte (p, quote))
 419             {
 420               struct substring ss;
 421               ds_put_byte (tmp, quote);
 422               if (!ss_get_until (p, quote, &ss))
 423                 msg (DW, _("Quoted string extends beyond end of line."));
 424               ds_put_substring (tmp, ss);
 425             }
 426           *field = ds_ss (tmp);
 427         }
 428       *n_columns = ss_length (*line) - ss_length (*p);
 429     }
 430   else
 431     {
 432       /* Regular field. */
 433       ss_get_bytes (p, ss_cspan (*p, ds_ss (&parser->any_sep)), field);
 434       *n_columns = ss_length (*field);
 435     }
 436
 437   /* Skip trailing soft separator and a single hard separator if present. */
 438   size_t length_before_separators = ss_length (*p);
 439   ss_ltrim (p, parser->soft_seps);
 440   if (!ss_is_empty (*p)
 441       && ss_find_byte (parser->hard_seps, ss_first (*p)) != SIZE_MAX)
 442     {
 443       ss_advance (p, 1);
 444       ss_ltrim (p, parser->soft_seps);
 445     }
 446
 447   if (!ss_is_empty (*p) && quoted && length_before_separators == ss_length (*p))
 448     msg (DW, _("Missing delimiter following quoted string."));
 449 }
 450
 451 /* Extracts a delimited field from the current position in the
 452    current record according to PARSER, reading data from READER.
 453
 454    *FIELD is set to the field content.  The caller must not or
 455    destroy this constant string.
 456
 457    Sets *FIRST_COLUMN to the 1-based column number of the start of
 458    the extracted field, and *LAST_COLUMN to the end of the extracted
 459    field.
 460
 461    Returns true on success, false on failure. */
 462 static bool
 463 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
 464            int *first_column, int *last_column, struct string *tmp,
 465            struct substring *field)
 466 {
 467   struct substring line, p;
 468
 469   if (dfm_eof (reader))
 470     return false;
 471   if (ss_is_empty (parser->hard_seps))
 472     dfm_expand_tabs (reader);
 473   line = p = dfm_get_record (reader);
 474
 475   /* Skip leading soft separators. */
 476   ss_ltrim (&p, parser->soft_seps);
 477
 478   /* Handle empty or completely consumed lines. */
 479   if (ss_is_empty (p))
 480     {
 481       if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
 482         return false;
 483       else
 484         {
 485           *field = p;
 486           *first_column = dfm_column_start (reader);
 487           *last_column = *first_column + 1;
 488           dfm_forward_columns (reader, 1);
 489           return true;
 490         }
 491     }
 492
 493   size_t n_columns;
 494   cut_field__ (parser, &line, &p, &n_columns, tmp, field);
 495   *first_column = dfm_column_start (reader);
 496   *last_column = *first_column + n_columns;
 497
 498   if (ss_is_empty (p))
 499     dfm_forward_columns (reader, 1);
 500   dfm_forward_columns (reader, ss_length (line) - ss_length (p));
 501
 502   return true;
 503 }
 504
 505 static void
 506 parse_error (const struct dfm_reader *reader, const struct field *field,
 507              int first_column, int last_column, char *error)
 508 {
 509   int line_number = dfm_get_line_number (reader);
 510   struct msg_location *location = xmalloc (sizeof *location);
 511   *location = (struct msg_location) {
 512     .file_name = intern_new (dfm_get_file_name (reader)),
 513     .start = { .line = line_number, .column = first_column },
 514     .end = { .line = line_number, .column = last_column - 1 },
 515   };
 516   struct msg *m = xmalloc (sizeof *m);
 517   *m = (struct msg) {
 518     .category = MSG_C_DATA,
 519     .severity = MSG_S_WARNING,
 520     .location = location,
 521     .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
 522                        field->name, fmt_name (field->format.type), error),
 523   };
 524   msg_emit (m);
 525
 526   free (error);
 527 }
 528
 529 /* Reads a case from READER into C, which matches DICT, parsing it according to
 530    fixed-format syntax rules in PARSER.  Returns true if successful, false at
 531    end of file or on I/O error. */
 532 static bool
 533 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
 534              struct dictionary *dict, struct ccase *c)
 535 {
 536   const char *input_encoding = dfm_reader_get_encoding (reader);
 537   const char *output_encoding = dict_get_encoding (dict);
 538   struct field *f;
 539   int row;
 540
 541   if (dfm_eof (reader))
 542     return false;
 543
 544   f = parser->fields;
 545   for (row = 1; row <= parser->records_per_case; row++)
 546     {
 547       struct substring line;
 548
 549       if (dfm_eof (reader))
 550         {
 551           msg (DW, _("Partial case of %d of %d records discarded."),
 552                row - 1, parser->records_per_case);
 553           return false;
 554         }
 555       dfm_expand_tabs (reader);
 556       line = dfm_get_record (reader);
 557
 558       for (; f < &parser->fields[parser->n_fields] && f->record == row; f++)
 559         {
 560           struct substring s = ss_substr (line, f->first_column - 1,
 561                                           f->format.w);
 562           union value *value = case_data_rw_idx (c, f->case_idx);
 563           char *error = data_in (s, input_encoding, f->format.type,
 564                                  settings_get_fmt_settings (),
 565                                  value, fmt_var_width (&f->format),
 566                                  output_encoding);
 567
 568           if (error == NULL)
 569             data_in_imply_decimals (s, input_encoding, f->format.type,
 570                                     f->format.d, settings_get_fmt_settings (),
 571                                     value);
 572           else
 573             parse_error (reader, f, f->first_column,
 574                          f->first_column + f->format.w, error);
 575         }
 576
 577       dfm_forward_record (reader);
 578     }
 579
 580   return true;
 581 }
 582
 583 /* Splits the data line in LINE into individual text fields and returns the
 584    number of fields.  If SA is nonnull, appends each field to SA; the caller
 585    retains ownership of SA and its contents.  */
 586 size_t
 587 data_parser_split (const struct data_parser *parser,
 588                    struct substring line, struct string_array *sa)
 589 {
 590   size_t n = 0;
 591
 592   struct string tmp = DS_EMPTY_INITIALIZER;
 593   for (;;)
 594     {
 595       struct substring p = line;
 596       ss_ltrim (&p, parser->soft_seps);
 597       if (ss_is_empty (p))
 598         {
 599           ds_destroy (&tmp);
 600           return n;
 601         }
 602
 603       size_t n_columns;
 604       struct substring field;
 605
 606       msg_disable ();
 607       cut_field__ (parser, &line, &p, &n_columns, &tmp, &field);
 608       msg_enable ();
 609
 610       if (sa)
 611         string_array_append_nocopy (sa, ss_xstrdup (field));
 612       n++;
 613       line = p;
 614     }
 615 }
 616
 617 /* Reads a case from READER into C, which matches dictionary DICT, parsing it
 618    according to free-format syntax rules in PARSER.  Returns true if
 619    successful, false at end of file or on I/O error. */
 620 static bool
 621 parse_delimited_span (const struct data_parser *parser,
 622                       struct dfm_reader *reader,
 623                       struct dictionary *dict, struct ccase *c)
 624 {
 625   const char *output_encoding = dict_get_encoding (dict);
 626   struct string tmp = DS_EMPTY_INITIALIZER;
 627   struct field *f;
 628
 629   for (f = parser->fields; f < &parser->fields[parser->n_fields]; f++)
 630     {
 631       struct substring s;
 632       int first_column, last_column;
 633       char *error;
 634
 635       /* Cut out a field and read in a new record if necessary. */
 636       while (!cut_field (parser, reader,
 637                          &first_column, &last_column, &tmp, &s))
 638         {
 639           if (!dfm_eof (reader))
 640             dfm_forward_record (reader);
 641           if (dfm_eof (reader))
 642             {
 643               if (f > parser->fields)
 644                 msg (DW, _("Partial case discarded.  The first variable "
 645                            "missing was %s."), f->name);
 646               ds_destroy (&tmp);
 647               return false;
 648             }
 649         }
 650
 651       const char *input_encoding = dfm_reader_get_encoding (reader);
 652       error = data_in (s, input_encoding, f->format.type,
 653                        settings_get_fmt_settings (),
 654                        case_data_rw_idx (c, f->case_idx),
 655                        fmt_var_width (&f->format), output_encoding);
 656       if (error != NULL)
 657         parse_error (reader, f, first_column, last_column, error);
 658     }
 659   ds_destroy (&tmp);
 660   return true;
 661 }
 662
 663 /* Reads a case from READER into C, which matches dictionary DICT, parsing it
 664    according to delimited syntax rules with one case per record in PARSER.
 665    Returns true if successful, false at end of file or on I/O error. */
 666 static bool
 667 parse_delimited_no_span (const struct data_parser *parser,
 668                          struct dfm_reader *reader,
 669                          struct dictionary *dict, struct ccase *c)
 670 {
 671   const char *output_encoding = dict_get_encoding (dict);
 672   struct string tmp = DS_EMPTY_INITIALIZER;
 673   struct substring s;
 674   struct field *f, *end;
 675
 676   if (dfm_eof (reader))
 677     return false;
 678
 679   end = &parser->fields[parser->n_fields];
 680   for (f = parser->fields; f < end; f++)
 681     {
 682       int first_column, last_column;
 683       char *error;
 684
 685       if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
 686         {
 687           if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields)
 688             msg (DW, _("Missing value(s) for all variables from %s onward.  "
 689                        "These will be filled with the system-missing value "
 690                        "or blanks, as appropriate."),
 691                  f->name);
 692           for (; f < end; f++)
 693             value_set_missing (case_data_rw_idx (c, f->case_idx),
 694                                fmt_var_width (&f->format));
 695           goto exit;
 696         }
 697
 698       const char *input_encoding = dfm_reader_get_encoding (reader);
 699       error = data_in (s, input_encoding, f->format.type,
 700                        settings_get_fmt_settings (),
 701                        case_data_rw_idx (c, f->case_idx),
 702                        fmt_var_width (&f->format), output_encoding);
 703       if (error != NULL)
 704         parse_error (reader, f, first_column, last_column, error);
 705     }
 706
 707   s = dfm_get_record (reader);
 708   ss_ltrim (&s, parser->soft_seps);
 709   if (!ss_is_empty (s))
 710     msg (DW, _("Record ends in data not part of any field."));
 711
 712 exit:
 713   dfm_forward_record (reader);
 714   ds_destroy (&tmp);
 715   return true;
 716 }
 717 \f
 718 /* Displays a table giving information on fixed-format variable
 719    parsing on DATA LIST. */
 720 static void
 721 dump_fixed_table (const struct data_parser *parser,
 722                   const struct file_handle *fh)
 723 {
 724   /* XXX This should not be preformatted. */
 725   char *title = xasprintf (ngettext ("Reading %d record from %s.",
 726                                      "Reading %d records from %s.",
 727                                      parser->records_per_case),
 728                            parser->records_per_case, fh_get_name (fh));
 729   struct pivot_table *table = pivot_table_create__ (
 730     pivot_value_new_user_text (title, -1), "Fixed Data Records");
 731   free (title);
 732
 733   pivot_dimension_create (
 734     table, PIVOT_AXIS_COLUMN, N_("Attributes"),
 735     N_("Record"), N_("Columns"), N_("Format"));
 736
 737   struct pivot_dimension *variables = pivot_dimension_create (
 738     table, PIVOT_AXIS_ROW, N_("Variable"));
 739   variables->root->show_label = true;
 740   for (size_t i = 0; i < parser->n_fields; i++)
 741     {
 742       struct field *f = &parser->fields[i];
 743
 744       /* XXX It would be better to have the actual variable here. */
 745       int variable_idx = pivot_category_create_leaf (
 746         variables->root, pivot_value_new_user_text (f->name, -1));
 747
 748       pivot_table_put2 (table, 0, variable_idx,
 749                         pivot_value_new_integer (f->record));
 750
 751       int first_column = f->first_column;
 752       int last_column = f->first_column + f->format.w - 1;
 753       char *columns = xasprintf ("%d-%d", first_column, last_column);
 754       pivot_table_put2 (table, 1, variable_idx,
 755                         pivot_value_new_user_text (columns, -1));
 756       free (columns);
 757
 758       char str[FMT_STRING_LEN_MAX + 1];
 759       pivot_table_put2 (table, 2, variable_idx,
 760                         pivot_value_new_user_text (
 761                           fmt_to_string (&f->format, str), -1));
 762
 763     }
 764
 765   pivot_table_submit (table);
 766 }
 767
 768 /* Displays a table giving information on free-format variable parsing
 769    on DATA LIST. */
 770 static void
 771 dump_delimited_table (const struct data_parser *parser,
 772                       const struct file_handle *fh)
 773 {
 774   struct pivot_table *table = pivot_table_create__ (
 775     pivot_value_new_text_format (N_("Reading free-form data from %s."),
 776                                  fh_get_name (fh)),
 777     "Free-Form Data Records");
 778
 779   pivot_dimension_create (
 780     table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format"));
 781
 782   struct pivot_dimension *variables = pivot_dimension_create (
 783     table, PIVOT_AXIS_ROW, N_("Variable"));
 784   variables->root->show_label = true;
 785   for (size_t i = 0; i < parser->n_fields; i++)
 786     {
 787       struct field *f = &parser->fields[i];
 788
 789       /* XXX It would be better to have the actual variable here. */
 790       int variable_idx = pivot_category_create_leaf (
 791         variables->root, pivot_value_new_user_text (f->name, -1));
 792
 793       char str[FMT_STRING_LEN_MAX + 1];
 794       pivot_table_put2 (table, 0, variable_idx,
 795                         pivot_value_new_user_text (
 796                           fmt_to_string (&f->format, str), -1));
 797     }
 798
 799   pivot_table_submit (table);
 800 }
 801
 802 /* Displays a table giving information on how PARSER will read
 803    data from FH. */
 804 void
 805 data_parser_output_description (struct data_parser *parser,
 806                                 const struct file_handle *fh)
 807 {
 808   if (parser->type == DP_FIXED)
 809     dump_fixed_table (parser, fh);
 810   else
 811     dump_delimited_table (parser, fh);
 812 }
 813 \f
 814 /* Data parser input program. */
 815 struct data_parser_casereader
 816   {
 817     struct data_parser *parser; /* Parser. */
 818     struct dictionary *dict;    /* Dictionary. */
 819     struct dfm_reader *reader;  /* Data file reader. */
 820     struct caseproto *proto;    /* Format of cases. */
 821   };
 822
 823 static const struct casereader_class data_parser_casereader_class;
 824
 825 /* Replaces DS's active dataset by an input program that reads data
 826    from READER according to the rules in PARSER, using DICT as
 827    the underlying dictionary.  Ownership of PARSER and READER is
 828    transferred to the input program, and ownership of DICT is
 829    transferred to the dataset. */
 830 void
 831 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
 832                                struct dfm_reader *reader,
 833                               struct dictionary *dict,
 834                                struct casereader* (*func)(struct casereader *,
 835                                                           const struct dictionary *,
 836                                                           void *),
 837                                void *ud)
 838 {
 839   struct data_parser_casereader *r;
 840   struct casereader *casereader0;
 841   struct casereader *casereader1;
 842
 843   r = xmalloc (sizeof *r);
 844   r->parser = parser;
 845   r->dict = dict_ref (dict);
 846   r->reader = reader;
 847   r->proto = caseproto_ref (dict_get_proto (dict));
 848   casereader0 = casereader_create_sequential (NULL, r->proto,
 849                                              CASENUMBER_MAX,
 850                                              &data_parser_casereader_class, r);
 851
 852   if (func)
 853     casereader1 = func (casereader0, dict, ud);
 854   else
 855     casereader1 = casereader0;
 856
 857   dataset_set_dict (ds, dict);
 858   dataset_set_source (ds, casereader1);
 859 }
 860
 861
 862 static struct ccase *
 863 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
 864 {
 865   struct data_parser_casereader *r = r_;
 866   struct ccase *c = case_create (r->proto);
 867   if (data_parser_parse (r->parser, r->reader, r->dict, c))
 868     return c;
 869   else
 870     {
 871       case_unref (c);
 872       return NULL;
 873     }
 874 }
 875
 876 static void
 877 data_parser_casereader_destroy (struct casereader *reader, void *r_)
 878 {
 879   struct data_parser_casereader *r = r_;
 880   if (dfm_reader_error (r->reader))
 881     casereader_force_error (reader);
 882   dfm_close_reader (r->reader);
 883   caseproto_unref (r->proto);
 884   dict_unref (r->dict);
 885   data_parser_destroy (r->parser);
 886   free (r);
 887 }
 888
 889 static const struct casereader_class data_parser_casereader_class =
 890   {
 891     data_parser_casereader_read,
 892     data_parser_casereader_destroy,
 893     NULL,
 894     NULL,
 895   };