X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=7684acbc576a40c77c78c7a2ea6b7d57360127ac;hb=refs%2Fheads%2Fctables7;hp=b49ccf785675639aa48dffd77a279e1e1d501a82;hpb=3dd0f6ae0d5eb73a2270a243e443c4ae03c2c16e;p=pspp diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index b49ccf7856..7684acbc57 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2000, 2006-2007, 2009-2016, 2021 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -200,9 +200,10 @@ struct sfm_reader enum integer_format integer_format; /* On-disk integer format. */ enum float_format float_format; /* On-disk floating point format. */ struct sfm_var *sfm_vars; /* Variables. */ - size_t sfm_var_cnt; /* Number of variables. */ - int case_cnt; /* Number of cases */ + size_t sfm_n_vars; /* Number of variables. */ + int n_cases; /* Number of cases */ const char *encoding; /* String encoding. */ + bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */ /* Decompression. */ enum any_compression compression; @@ -233,10 +234,6 @@ sfm_reader_cast (const struct any_reader *r_) static bool sfm_close (struct any_reader *); -static struct variable *lookup_var_by_index (struct sfm_reader *, off_t, - const struct sfm_var_record *, - size_t n, int idx); - static void sys_msg (struct sfm_reader *r, off_t, int class, const char *format, va_list args) PRINTF_FORMAT (4, 0); @@ -282,7 +279,7 @@ static bool read_variable_record (struct sfm_reader *, struct sfm_var_record *); static bool read_value_label_record (struct sfm_reader *, struct sfm_value_label_record *); -static struct sfm_document_record *read_document_record (struct sfm_reader *); +static bool read_document_record (struct sfm_reader *); static bool read_extension_record (struct sfm_reader *, int subtype, struct sfm_extension_record **); static bool skip_extension_record (struct sfm_reader *, int subtype); @@ -334,7 +331,7 @@ static bool parse_variable_records (struct sfm_reader *, struct dictionary *, struct sfm_var_record *, size_t n); static void parse_format_spec (struct sfm_reader *, off_t pos, unsigned int format, enum which_format, - struct variable *, int *format_warning_cnt); + struct variable *, int *format_n_warnings); static void parse_document (struct dictionary *, struct sfm_document_record *); static void parse_display_parameters (struct sfm_reader *, const struct sfm_extension_record *, @@ -357,10 +354,10 @@ static void parse_long_var_name_map (struct sfm_reader *, static bool parse_long_string_map (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); -static bool parse_value_labels (struct sfm_reader *, struct dictionary *, - const struct sfm_var_record *, - size_t n_var_recs, - const struct sfm_value_label_record *); +static void parse_value_labels (struct sfm_reader *, struct dictionary *); +static struct variable *parse_weight_var (struct sfm_reader *, + const struct sfm_var_record *, size_t n_var_recs, + int idx); static void parse_data_file_attributes (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); @@ -394,10 +391,9 @@ static struct any_reader * sfm_open (struct file_handle *fh) { size_t allocated_mrsets = 0; - struct sfm_reader *r; /* Create and initialize reader. */ - r = xzalloc (sizeof *r); + struct sfm_reader *r = XZALLOC (struct sfm_reader); r->any_reader.klass = &sys_file_reader_class; r->pool = pool_create (); pool_register (r->pool, free, r); @@ -496,12 +492,8 @@ read_record (struct sfm_reader *r, int type, case 6: if (r->document != NULL) - { - sys_error (r, r->pos, _("Duplicate type 6 (document) record.")); - return false; - } - r->document = read_document_record (r); - return r->document != NULL; + sys_warn (r, r->pos, _("Duplicate type 6 (document) record.")); + return read_document_record (r); case 7: if (!read_int (r, &subtype)) @@ -523,7 +515,7 @@ read_record (struct sfm_reader *r, int type, 18. I'm surprised that SPSS puts up with this. */ struct sfm_extension_record *ext; bool ok = read_extension_record (r, subtype, &ext); - if (ok) + if (ok && ext) ll_push_tail (&r->var_attrs, &ext->ll); return ok; } @@ -647,6 +639,13 @@ add_id (struct get_strings_aux *aux, const char *id, const char *title, ...) va_end (args); } +static const char * +skip_prefix (const char *s, const char *prefix) +{ + size_t prefix_len = strlen (prefix); + return !strncmp (s, prefix, prefix_len) ? s + prefix_len : s; +} + /* Retrieves significant string data from R in its raw format, to allow the caller to try to detect the encoding in use. @@ -696,7 +695,7 @@ sfm_get_strings (const struct any_reader *r_, struct pool *pool, add_string (&aux, r->header.creation_date, _("Creation Date")); add_string (&aux, r->header.creation_time, _("Creation Time")); - add_string (&aux, r->header.eye_catcher, _("Product")); + add_string (&aux, skip_prefix (r->header.eye_catcher, "@(#) "), _("Product")); add_string (&aux, r->header.file_label, _("File Label")); if (r->extensions[EXT_PRODUCT_INFO]) @@ -761,7 +760,6 @@ sfm_decode (struct any_reader *r_, const char *encoding, { struct sfm_reader *r = sfm_reader_cast (r_); struct dictionary *dict; - size_t i; if (encoding == NULL) { @@ -808,25 +806,10 @@ sfm_decode (struct any_reader *r_, const char *encoding, /* Parse value labels and the weight variable immediately after the variable records. These records use indexes into var_recs[], so we must parse them before those indexes become invalidated by very long string variables. */ - for (i = 0; i < r->n_labels; i++) - if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i])) - goto error; + parse_value_labels (r, dict); if (r->header.weight_idx != 0) - { - struct variable *weight_var; - - weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars, - r->header.weight_idx); - if (weight_var != NULL) - { - if (var_is_numeric (weight_var)) - dict_set_weight (dict, weight_var); - else - sys_warn (r, -1, _("Ignoring string variable `%s' set " - "as weighting variable."), - var_get_name (weight_var)); - } - } + dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars, + r->header.weight_idx)); if (r->extensions[EXT_DISPLAY] != NULL) parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict); @@ -862,7 +845,7 @@ sfm_decode (struct any_reader *r_, const char *encoding, amount that the header claims. SPSS version 13 gets this wrong when very long strings are involved, so don't warn in that case. */ - if (r->header.nominal_case_size != -1 + if (r->header.nominal_case_size > 0 && r->header.nominal_case_size != r->n_vars && r->info.version_major != 13) sys_warn (r, -1, _("File header claims %d variable positions but " @@ -873,7 +856,7 @@ sfm_decode (struct any_reader *r_, const char *encoding, sfm_read_case to use. We cannot use the `struct variable's from the dictionary we created, because the caller owns the dictionary and may destroy or modify its variables. */ - sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt); + sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_n_vars); pool_register (r->pool, free, r->sfm_vars); r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool); @@ -885,13 +868,12 @@ sfm_decode (struct any_reader *r_, const char *encoding, } return casereader_create_sequential - (NULL, r->proto, - r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt, - &sys_file_casereader_class, r); + (NULL, r->proto, r->n_cases == -1 ? CASENUMBER_MAX : r->n_cases, + &sys_file_casereader_class, r); error: sfm_close (r_); - dict_destroy (dict); + dict_unref (dict); *dictp = NULL; return NULL; } @@ -968,6 +950,8 @@ read_header (struct sfm_reader *r, struct any_read_info *info, if (!read_string (r, header->magic, sizeof header->magic) || !read_string (r, header->eye_catcher, sizeof header->eye_catcher)) return false; + r->written_by_readstat = strstr (header->eye_catcher, + "https://github.com/WizardMac/ReadStat"); if (!strcmp (ASCII_MAGIC, header->magic) || !strcmp (EBCDIC_MAGIC, header->magic)) @@ -1009,7 +993,7 @@ read_header (struct sfm_reader *r, struct any_read_info *info, r->compression = ANY_COMP_NONE; else if (compressed == 1) r->compression = ANY_COMP_SIMPLE; - else if (compressed != 0) + else { sys_error (r, 0, "System file header has invalid compression " "value %d.", compressed); @@ -1031,10 +1015,10 @@ read_header (struct sfm_reader *r, struct any_read_info *info, if (!read_int (r, &header->weight_idx)) return false; - if (!read_int (r, &r->case_cnt)) + if (!read_int (r, &r->n_cases)) return false; - if ( r->case_cnt > INT_MAX / 2) - r->case_cnt = -1; + if (r->n_cases > INT_MAX / 2) + r->n_cases = -1; /* Identify floating-point format and obtain compression bias. */ if (!read_bytes (r, raw_bias, sizeof raw_bias)) @@ -1073,7 +1057,7 @@ read_header (struct sfm_reader *r, struct any_read_info *info, info->integer_format = r->integer_format; info->float_format = r->float_format; info->compression = r->compression; - info->case_cnt = r->case_cnt; + info->n_cases = r->n_cases; return true; } @@ -1229,33 +1213,35 @@ read_value_label_record (struct sfm_reader *r, return true; } -/* Reads a document record from R and returns it. */ -static struct sfm_document_record * +/* Reads a document record from R. Returns true if successful, false on + error. */ +static bool read_document_record (struct sfm_reader *r) { - struct sfm_document_record *record; int n_lines; - - record = pool_malloc (r->pool, sizeof *record); - record->pos = r->pos; - if (!read_int (r, &n_lines)) - return NULL; - if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH) + return false; + else if (n_lines == 0) + return true; + else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH) { - sys_error (r, record->pos, + sys_error (r, r->pos, _("Number of document lines (%d) " "must be greater than 0 and less than %d."), n_lines, INT_MAX / DOC_LINE_LENGTH); - return NULL; + return false; } + struct sfm_document_record *record; + record = pool_malloc (r->pool, sizeof *record); + record->pos = r->pos; record->n_lines = n_lines; record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines); if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines)) - return NULL; + return false; - return record; + r->document = record; + return true; } static bool @@ -1408,6 +1394,15 @@ parse_header (struct sfm_reader *r, const struct sfm_header_record *header, info->product = ss_xstrdup (product); } +static struct variable * +add_var_with_generated_name (struct dictionary *dict, int width) +{ + char *name = dict_make_unique_var_name (dict, NULL, NULL); + struct variable *var = dict_create_var_assert (dict, name, width); + free (name); + return var; +} + /* Reads a variable (type 2) record from R and adds the corresponding variable to DICT. Also skips past additional variable records for long string @@ -1420,9 +1415,8 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, struct sfm_var_record *rec; int n_warnings = 0; - for (rec = var_recs; rec < &var_recs[n_var_recs]; ) + for (rec = var_recs; rec < &var_recs[n_var_recs];) { - struct variable *var; size_t n_values; char *name; size_t i; @@ -1431,13 +1425,6 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, rec->name, -1, r->pool); name[strcspn (name, " ")] = '\0'; - if (!dict_id_is_valid (dict, name, false) - || name[0] == '$' || name[0] == '#') - { - sys_error (r, rec->pos, _("Invalid variable name `%s'."), name); - return false; - } - if (rec->width < 0 || rec->width > 255) { sys_error (r, rec->pos, @@ -1445,19 +1432,30 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, return false; } - var = rec->var = dict_create_var (dict, name, rec->width); - if (var == NULL) + struct variable *var; + if (!dict_id_is_valid (dict, name, false) + || name[0] == '$' || name[0] == '#') { - char *new_name = dict_make_unique_var_name (dict, NULL, NULL); - sys_warn (r, rec->pos, _("Renaming variable with duplicate name " - "`%s' to `%s'."), - name, new_name); - var = rec->var = dict_create_var_assert (dict, new_name, rec->width); - free (new_name); + var = add_var_with_generated_name (dict, rec->width); + sys_warn (r, rec->pos, _("Renaming variable with invalid name " + "`%s' to `%s'."), name, var_get_name (var)); + } + else + { + var = dict_create_var (dict, name, rec->width); + if (var == NULL) + { + var = add_var_with_generated_name (dict, rec->width); + sys_warn (r, rec->pos, _("Renaming variable with duplicate name " + "`%s' to `%s'."), + name, var_get_name (var)); + } } + rec->var = var; - /* Set the short name the same as the long name. */ - var_set_short_name (var, 0, name); + /* Set the short name the same as the long name (even if we renamed + it). */ + var_set_short_name (var, 0, var_get_name (var)); /* Get variable label, if any. */ if (rec->label) @@ -1538,22 +1536,9 @@ parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format, int *n_warnings) { const int max_warnings = 8; - uint8_t raw_type = format >> 16; - uint8_t w = format >> 8; - uint8_t d = format; struct fmt_spec f; - bool ok; - - f.w = w; - f.d = d; - msg_disable (); - ok = (fmt_from_io (raw_type, &f.type) - && fmt_check_output (&f) - && fmt_check_width_compat (&f, var_get_width (v))); - msg_enable (); - - if (ok) + if (fmt_from_u32 (format, var_get_width (v), false, &f)) { if (which == PRINT_FORMAT) var_set_print_format (v, &f); @@ -1707,9 +1692,9 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, text = open_text_record (r, record, false); for (;;) { - struct sfm_mrset *mrset; - size_t allocated_vars; - char delimiter; + struct sfm_mrset *mrset = NULL; + size_t allocated_vars = 0; + char delimiter = '4'; /* Skip extra line feeds if present. */ while (text_match (text, '\n')) @@ -1756,7 +1741,12 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, } number = text_get_token (text, ss_cstr (" "), NULL); - if (!strcmp (number, "11")) + if (!number) + sys_warn (r, record->pos, + _("Missing label source value " + "following `E' at offset %zu in MRSETS record."), + text_pos (text)); + else if (!strcmp (number, "11")) mrset->label_from_var_label = true; else if (strcmp (number, "1")) sys_warn (r, record->pos, @@ -1827,10 +1817,9 @@ decode_mrsets (struct sfm_reader *r, struct dictionary *dict) size_t i; name = recode_string ("UTF-8", r->encoding, s->name, -1); - if (name[0] != '$') + if (!mrset_is_valid_name (name, dict_get_encoding (dict), false)) { - sys_warn (r, -1, _("Multiple response set name `%s' does not begin " - "with `$'."), + sys_warn (r, -1, _("Invalid multiple response set name `%s'."), name); free (name); continue; @@ -1928,7 +1917,7 @@ parse_display_parameters (struct sfm_reader *r, size_t ofs; size_t i; - n_vars = dict_get_var_cnt (dict); + n_vars = dict_get_n_vars (dict); if (record->count == 3 * n_vars) includes_width = true; else if (record->count == 2 * n_vars) @@ -1992,8 +1981,9 @@ parse_display_parameters (struct sfm_reader *r, } static void -rename_var_and_save_short_names (struct dictionary *dict, struct variable *var, - const char *new_name) +rename_var_and_save_short_names (struct sfm_reader *r, off_t pos, + struct dictionary *dict, + struct variable *var, const char *new_name) { size_t n_short_names; char **short_names; @@ -2002,16 +1992,17 @@ rename_var_and_save_short_names (struct dictionary *dict, struct variable *var, /* Renaming a variable may clear its short names, but we want to retain them, so we save them and re-set them afterward. */ - n_short_names = var_get_short_name_cnt (var); + n_short_names = var_get_n_short_names (var); short_names = xnmalloc (n_short_names, sizeof *short_names); for (i = 0; i < n_short_names; i++) { const char *s = var_get_short_name (var, i); - short_names[i] = s != NULL ? xstrdup (s) : NULL; + short_names[i] = xstrdup_if_nonnull (s); } /* Set long name. */ - dict_rename_var (dict, var, new_name); + if (!dict_try_rename_var (dict, var, new_name)) + sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name); /* Restore short names. */ for (i = 0; i < n_short_names; i++) @@ -2039,13 +2030,13 @@ parse_long_var_name_map (struct sfm_reader *r, converted to lowercase, as the long variable names. */ size_t i; - for (i = 0; i < dict_get_var_cnt (dict); i++) + for (i = 0; i < dict_get_n_vars (dict); i++) { struct variable *var = dict_get_var (dict, i); char *new_name; new_name = utf8_to_lower (var_get_name (var)); - rename_var_and_save_short_names (dict, var, new_name); + rename_var_and_save_short_names (r, -1, dict, var, new_name); free (new_name); } @@ -2070,16 +2061,7 @@ parse_long_var_name_map (struct sfm_reader *r, continue; } - /* Identify any duplicates. */ - if (utf8_strcasecmp (var_get_short_name (var, 0), long_name) - && dict_lookup_var (dict, long_name) != NULL) - { - sys_warn (r, record->pos, - _("Duplicate long variable name `%s'."), long_name); - continue; - } - - rename_var_and_save_short_names (dict, var, long_name); + rename_var_and_save_short_names (r, record->pos, dict, var, long_name); } close_text_record (r, text); } @@ -2100,7 +2082,6 @@ parse_long_string_map (struct sfm_reader *r, { size_t idx = var_get_dict_index (var); long int length; - int segment_cnt; int i; /* Get length. */ @@ -2115,8 +2096,8 @@ parse_long_string_map (struct sfm_reader *r, } /* Check segments. */ - segment_cnt = sfm_width_to_segments (length); - if (segment_cnt == 1) + int n_segments = sfm_width_to_segments (length); + if (n_segments == 1) { sys_warn (r, record->pos, _("%s listed in very long string record with width %s, " @@ -2124,7 +2105,7 @@ parse_long_string_map (struct sfm_reader *r, var_get_name (var), length_s); continue; } - if (idx + segment_cnt > dict_get_var_cnt (dict)) + if (idx + n_segments > dict_get_n_vars (dict)) { sys_error (r, record->pos, _("Very long string %s overflows dictionary."), @@ -2134,7 +2115,7 @@ parse_long_string_map (struct sfm_reader *r, /* Get the short names from the segments and check their lengths. */ - for (i = 0; i < segment_cnt; i++) + for (i = 0; i < n_segments; i++) { struct variable *seg = dict_get_var (dict, idx + i); int alloc_width = sfm_segment_alloc_width (length, i); @@ -2151,7 +2132,7 @@ parse_long_string_map (struct sfm_reader *r, return false; } } - dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1); + dict_delete_consecutive_vars (dict, idx + 1, n_segments - 1); var_set_width (var, length); } close_text_record (r, text); @@ -2160,61 +2141,99 @@ parse_long_string_map (struct sfm_reader *r, return true; } -static bool -parse_value_labels (struct sfm_reader *r, struct dictionary *dict, - const struct sfm_var_record *var_recs, size_t n_var_recs, - const struct sfm_value_label_record *record) +#define MAX_LABEL_WARNINGS 5 + +/* Displays a warning for offset OFFSET in the file. */ +static void +value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings, + const char *format, ...) { - struct variable **vars; - char **utf8_labels; - size_t i; + if (++*n_label_warnings > MAX_LABEL_WARNINGS) + return; - utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels); - for (i = 0; i < record->n_labels; i++) + va_list args; + + va_start (args, format); + sys_msg (r, offset, MW, format, args); + va_end (args); +} + +#define MAX_LABEL_WARNINGS 5 + +static void +parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict, + const struct sfm_var_record *var_recs, + size_t n_var_recs, + const struct sfm_value_label_record *record, + int *n_label_warnings) +{ + char **utf8_labels + = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels); + for (size_t i = 0; i < record->n_labels; i++) utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict), record->labels[i].label, -1, r->pool); - vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars); - for (i = 0; i < record->n_vars; i++) + struct variable **vars = pool_nmalloc (r->pool, + record->n_vars, sizeof *vars); + unsigned int n_vars = 0; + for (size_t i = 0; i < record->n_vars; i++) { - vars[i] = lookup_var_by_index (r, record->pos, - var_recs, n_var_recs, record->vars[i]); - if (vars[i] == NULL) - return false; + int idx = record->vars[i]; + if (idx < 1 || idx > n_var_recs) + { + value_label_warning ( + r, record->pos, n_label_warnings, + _("Value label variable index %d not in valid range 1...%zu."), + idx, n_var_recs); + continue; + } + + const struct sfm_var_record *rec = &var_recs[idx - 1]; + if (rec->var == NULL) + { + value_label_warning ( + r, record->pos, n_label_warnings, + _("Value label variable index %d " + "refers to long string continuation."), idx); + continue; + } + + vars[n_vars++] = rec->var; } + if (!n_vars) + return; - for (i = 1; i < record->n_vars; i++) + for (size_t i = 1; i < n_vars; i++) if (var_get_type (vars[i]) != var_get_type (vars[0])) { - sys_error (r, record->pos, - _("Variables associated with value label are not all of " - "identical type. Variable %s is %s, but variable " - "%s is %s."), - var_get_name (vars[0]), - var_is_numeric (vars[0]) ? _("numeric") : _("string"), - var_get_name (vars[i]), - var_is_numeric (vars[i]) ? _("numeric") : _("string")); - return false; + value_label_warning ( + r, record->pos, n_label_warnings, + _("Variables associated with value label are not all of " + "identical type. Variable %s is %s, but variable " + "%s is %s."), + var_get_name (vars[0]), + var_is_numeric (vars[0]) ? _("numeric") : _("string"), + var_get_name (vars[i]), + var_is_numeric (vars[i]) ? _("numeric") : _("string")); + return; } - for (i = 0; i < record->n_vars; i++) + for (size_t i = 0; i < n_vars; i++) { struct variable *var = vars[i]; - int width; - size_t j; - - width = var_get_width (var); + int width = var_get_width (var); if (width > 8) { - sys_error (r, record->pos, - _("Value labels may not be added to long string " - "variables (e.g. %s) using records types 3 and 4."), - var_get_name (var)); - return false; + value_label_warning ( + r, record->pos, n_label_warnings, + _("Value labels may not be added to long string " + "variables (e.g. %s) using records types 3 and 4."), + var_get_name (var)); + continue; } - for (j = 0; j < record->n_labels; j++) + for (size_t j = 0; j < record->n_labels; j++) { struct sfm_value_label *label = &record->labels[j]; union value value; @@ -2223,19 +2242,27 @@ parse_value_labels (struct sfm_reader *r, struct dictionary *dict, if (width == 0) value.f = parse_float (r, label->value, 0); else - memcpy (value_str_rw (&value, width), label->value, width); + memcpy (value.s, label->value, width); if (!var_add_value_label (var, &value, utf8_labels[j])) { - if (var_is_numeric (var)) - sys_warn (r, record->pos, - _("Duplicate value label for %g on %s."), - value.f, var_get_name (var)); + if (r->written_by_readstat) + { + /* Ignore the problem. ReadStat is buggy and emits value + labels whose values are longer than string variables' + widths, that are identical in the actual width of the + variable, e.g. both values "ABC123" and "ABC456" for a + string variable with width 3. */ + } + else if (var_is_numeric (var)) + value_label_warning (r, record->pos, n_label_warnings, + _("Duplicate value label for %g on %s."), + value.f, var_get_name (var)); else - sys_warn (r, record->pos, - _("Duplicate value label for `%.*s' on %s."), - width, value_str (&value, width), - var_get_name (var)); + value_label_warning ( + r, record->pos, n_label_warnings, + _("Duplicate value label for `%.*s' on %s."), + width, value.s, var_get_name (var)); } value_destroy (&value, width); @@ -2243,38 +2270,59 @@ parse_value_labels (struct sfm_reader *r, struct dictionary *dict, } pool_free (r->pool, vars); - for (i = 0; i < record->n_labels; i++) + for (size_t i = 0; i < record->n_labels; i++) pool_free (r->pool, utf8_labels[i]); pool_free (r->pool, utf8_labels); +} - return true; +static void +parse_value_labels (struct sfm_reader *r, struct dictionary *dict) +{ + int n_label_warnings = 0; + for (size_t i = 0; i < r->n_labels; i++) + parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i], + &n_label_warnings); + if (n_label_warnings > MAX_LABEL_WARNINGS) + sys_warn (r, -1, + _("Suppressed %d additional warnings for value labels."), + n_label_warnings - MAX_LABEL_WARNINGS); } static struct variable * -lookup_var_by_index (struct sfm_reader *r, off_t offset, - const struct sfm_var_record *var_recs, size_t n_var_recs, - int idx) +parse_weight_var (struct sfm_reader *r, + const struct sfm_var_record *var_recs, size_t n_var_recs, + int idx) { - const struct sfm_var_record *rec; + off_t offset = 76; /* Offset to variable index in header. */ if (idx < 1 || idx > n_var_recs) { - sys_error (r, offset, - _("Variable index %d not in valid range 1...%zu."), - idx, n_var_recs); + sys_warn (r, offset, + _("Weight variable index %d not in valid range 1...%zu. " + "Treating file as unweighted."), + idx, n_var_recs); return NULL; } - rec = &var_recs[idx - 1]; + const struct sfm_var_record *rec = &var_recs[idx - 1]; if (rec->var == NULL) { - sys_error (r, offset, - _("Variable index %d refers to long string continuation."), - idx); + sys_warn (r, offset, + _("Weight variable index %d refers to long string " + "continuation. Treating file as unweighted."), idx); + return NULL; + } + + struct variable *weight_var = rec->var; + if (!var_is_numeric (weight_var)) + { + sys_warn (r, offset, _("Ignoring string variable `%s' set " + "as weighting variable."), + var_get_name (weight_var)); return NULL; } - return rec->var; + return weight_var; } /* Parses a set of custom attributes from TEXT into ATTRS. @@ -2328,8 +2376,15 @@ parse_attributes (struct sfm_reader *r, struct text_record *text, if (text_match (text, ')')) break; } - if (attrs != NULL) - attrset_add (attrs, attr); + if (attrs != NULL && attribute_get_n_values (attr) > 0) + { + if (!attrset_try_add (attrs, attr)) + { + text_warn (r, text, _("Duplicate attribute %s."), + attribute_get_name (attr)); + attribute_destroy (attr); + } + } else attribute_destroy (attr); } @@ -2370,12 +2425,12 @@ assign_variable_roles (struct sfm_reader *r, struct dictionary *dict) size_t n_warnings = 0; size_t i; - for (i = 0; i < dict_get_var_cnt (dict); i++) + for (i = 0; i < dict_get_n_vars (dict); i++) { struct variable *var = dict_get_var (dict, i); struct attrset *attrs = var_get_attributes (var); const struct attribute *attr = attrset_lookup (attrs, "$@Role"); - if (attr != NULL) + if (attr != NULL && attribute_get_n_values (attr) > 0) { int value = atoi (attribute_get_value (attr, 0)); enum var_role role; @@ -2463,7 +2518,8 @@ parse_long_string_value_labels (struct sfm_reader *r, ofs += 4; /* Parse variable name, width, and number of labels. */ - if (!check_overflow (r, record, ofs, var_name_len + 8)) + if (!check_overflow (r, record, ofs, var_name_len) + || !check_overflow (r, record, ofs, var_name_len + 8)) return; var_name = recode_string_pool ("UTF-8", dict_encoding, (const char *) record->data + ofs, @@ -2514,8 +2570,7 @@ parse_long_string_value_labels (struct sfm_reader *r, if (!skip) { if (value_length == width) - memcpy (value_str_rw (&value, width), - (const uint8_t *) record->data + ofs, width); + memcpy (value.s, (const uint8_t *) record->data + ofs, width); else { sys_warn (r, record->pos + ofs, @@ -2547,8 +2602,7 @@ parse_long_string_value_labels (struct sfm_reader *r, if (!var_add_value_label (var, &value, label)) sys_warn (r, record->pos + ofs, _("Duplicate value label for `%.*s' on %s."), - width, value_str (&value, width), - var_get_name (var)); + width, value.s, var_get_name (var)); pool_free (r->pool, label); } ofs += label_length; @@ -2581,7 +2635,8 @@ parse_long_string_missing_values (struct sfm_reader *r, ofs += 4; /* Parse variable name. */ - if (!check_overflow (r, record, ofs, var_name_len + 1)) + if (!check_overflow (r, record, ofs, var_name_len) + || !check_overflow (r, record, ofs, var_name_len + 1)) return; var_name = recode_string_pool ("UTF-8", dict_encoding, (const char *) record->data + ofs, @@ -2667,12 +2722,12 @@ sys_file_casereader_read (struct casereader *reader, void *r_) int retval; int i; - if (r->error || !r->sfm_var_cnt) + if (r->error || !r->sfm_n_vars) return NULL; c = case_create (r->proto); - for (i = 0; i < r->sfm_var_cnt; i++) + for (i = 0; i < r->sfm_n_vars; i++) { struct sfm_var *sv = &r->sfm_vars[i]; union value *v = case_data_rw_idx (c, sv->case_index); @@ -2681,8 +2736,7 @@ sys_file_casereader_read (struct casereader *reader, void *r_) retval = read_case_number (r, &v->f); else { - uint8_t *s = value_str_rw (v, sv->var_width); - retval = read_case_string (r, s + sv->offset, sv->segment_width); + retval = read_case_string (r, v->s + sv->offset, sv->segment_width); if (retval == 1) { retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)); @@ -2699,7 +2753,7 @@ sys_file_casereader_read (struct casereader *reader, void *r_) eof: if (i != 0) partial_record (r); - if (r->case_cnt != -1) + if (r->n_cases != -1) read_error (reader, r); case_unref (c); return NULL; @@ -2972,7 +3026,7 @@ open_text_record (struct sfm_reader *r, } /* Closes TEXT, frees its storage, and issues a final warning - about suppressed warnings if necesary. */ + about suppressed warnings if necessary. */ static void close_text_record (struct sfm_reader *r, struct text_record *text) { @@ -3069,7 +3123,11 @@ text_get_token (struct text_record *text, struct substring delimiters, char *end; if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token)) - return NULL; + { + if (delimiter != NULL) + *delimiter = ss_data (text->buffer)[text->pos-1]; + return NULL; + } end = &ss_data (token)[ss_length (token)]; if (delimiter != NULL) @@ -3173,7 +3231,6 @@ static void sys_msg (struct sfm_reader *r, off_t offset, int class, const char *format, va_list args) { - struct msg m; struct string text; ds_init_empty (&text); @@ -3184,16 +3241,13 @@ sys_msg (struct sfm_reader *r, off_t offset, ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh)); ds_put_vformat (&text, format, args); - m.category = msg_class_to_category (class); - m.severity = msg_class_to_severity (class); - m.file_name = NULL; - m.first_line = 0; - m.last_line = 0; - m.first_column = 0; - m.last_column = 0; - m.text = ds_cstr (&text); - - msg_emit (&m); + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { + .category = msg_class_to_category (class), + .severity = msg_class_to_severity (class), + .text = ds_steal_cstr (&text), + }; + msg_emit (m); } /* Displays a warning for offset OFFSET in the file. */ @@ -3228,11 +3282,11 @@ sys_error (struct sfm_reader *r, off_t offset, const char *format, ...) an error. */ static inline int read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, - void *buf, size_t byte_cnt) + void *buf, size_t n_bytes) { - size_t bytes_read = fread (buf, 1, byte_cnt, r->file); + size_t bytes_read = fread (buf, 1, n_bytes, r->file); r->pos += bytes_read; - if (bytes_read == byte_cnt) + if (bytes_read == n_bytes) return 1; else if (ferror (r->file)) { @@ -3252,9 +3306,9 @@ read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, Returns true if successful. Returns false upon I/O error or if end-of-file is encountered. */ static bool -read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { - return read_bytes_internal (r, false, buf, byte_cnt) == 1; + return read_bytes_internal (r, false, buf, n_bytes) == 1; } /* Reads BYTE_CNT bytes into BUF. @@ -3262,9 +3316,9 @@ read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) Returns 0 if an immediate end-of-file is encountered. Returns -1 if an I/O error or a partial read occurs. */ static int -try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +try_read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { - return read_bytes_internal (r, true, buf, byte_cnt); + return read_bytes_internal (r, true, buf, n_bytes); } /* Reads a 32-bit signed integer from R and stores its value in host format in @@ -3489,7 +3543,7 @@ read_ztrailer (struct sfm_reader *r, if (fstat (fileno (r->file), &s)) { - sys_error (ME, 0, _("%s: stat failed (%s)."), + sys_error (r, 0, _("%s: stat failed (%s)."), fh_get_file_name (r->fh), strerror (errno)); return false; } @@ -3655,11 +3709,11 @@ close_zstream (struct sfm_reader *r) } static int -read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) +read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t n_bytes) { uint8_t *buf = buf_; - if (byte_cnt == 0) + if (n_bytes == 0) return 1; for (;;) @@ -3669,13 +3723,13 @@ read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) /* Use already inflated data if there is any. */ if (r->zout_pos < r->zout_end) { - unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos); + unsigned int n = MIN (n_bytes, r->zout_end - r->zout_pos); memcpy (buf, &r->zout_buf[r->zout_pos], n); r->zout_pos += n; - byte_cnt -= n; + n_bytes -= n; buf += n; - if (byte_cnt == 0) + if (n_bytes == 0) return 1; } @@ -3722,13 +3776,13 @@ read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) } static int -read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { if (r->compression == ANY_COMP_SIMPLE) - return read_bytes (r, buf, byte_cnt); + return read_bytes (r, buf, n_bytes); else { - int retval = read_bytes_zlib (r, buf, byte_cnt); + int retval = read_bytes_zlib (r, buf, n_bytes); if (retval == 0) sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data.")); return retval; @@ -3736,12 +3790,12 @@ read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) } static int -try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { if (r->compression == ANY_COMP_SIMPLE) - return try_read_bytes (r, buf, byte_cnt); + return try_read_bytes (r, buf, n_bytes); else - return read_bytes_zlib (r, buf, byte_cnt); + return read_bytes_zlib (r, buf, n_bytes); } /* Reads a 64-bit floating-point number from R and returns its