X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=491df7f996c349a853c3492d0a49ca575e717161;hb=36bba0ffbec3b8432d4ececb720bf033053f5d46;hp=109b0dd16efd0d980a5a0e16225ae77a57e3650d;hpb=8f7af0acaf8a9253242d89fcdb26e285841f7833;p=pspp diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 109b0dd16e..491df7f996 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -41,6 +41,7 @@ #include "data/value-labels.h" #include "data/value.h" #include "data/variable.h" +#include "data/varset.h" #include "libpspp/array.h" #include "libpspp/assertion.h" #include "libpspp/compiler.h" @@ -200,8 +201,8 @@ struct sfm_reader enum integer_format integer_format; /* On-disk integer format. */ enum float_format float_format; /* On-disk floating point format. */ struct sfm_var *sfm_vars; /* Variables. */ - size_t sfm_var_cnt; /* Number of variables. */ - int case_cnt; /* Number of cases */ + size_t sfm_n_vars; /* Number of variables. */ + int n_cases; /* Number of cases */ const char *encoding; /* String encoding. */ bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */ @@ -331,7 +332,7 @@ static bool parse_variable_records (struct sfm_reader *, struct dictionary *, struct sfm_var_record *, size_t n); static void parse_format_spec (struct sfm_reader *, off_t pos, unsigned int format, enum which_format, - struct variable *, int *format_warning_cnt); + struct variable *, int *format_n_warnings); static void parse_document (struct dictionary *, struct sfm_document_record *); static void parse_display_parameters (struct sfm_reader *, const struct sfm_extension_record *, @@ -371,6 +372,9 @@ static void parse_long_string_value_labels (struct sfm_reader *, static void parse_long_string_missing_values ( struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); +static void parse_var_sets (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); /* Frees the strings inside INFO. */ void @@ -744,7 +748,7 @@ sfm_get_strings (const struct any_reader *r_, struct pool *pool, return aux.n; } -/* Decodes the dictionary read from R, saving it into into *DICT. Character +/* Decodes the dictionary read from R, saving it into *DICT. Character strings in R are decoded using ENCODING, or an encoding obtained from R if ENCODING is null, or the locale encoding if R specifies no encoding. @@ -840,6 +844,8 @@ sfm_decode (struct any_reader *r_, const char *encoding, if (r->extensions[EXT_LONG_MISSING] != NULL) parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING], dict); + if (r->extensions[EXT_VAR_SETS]) + parse_var_sets (r, r->extensions[EXT_VAR_SETS], dict); /* Warn if the actual amount of data per case differs from the amount that the header claims. SPSS version 13 gets this @@ -856,7 +862,7 @@ sfm_decode (struct any_reader *r_, const char *encoding, sfm_read_case to use. We cannot use the `struct variable's from the dictionary we created, because the caller owns the dictionary and may destroy or modify its variables. */ - sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt); + sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_n_vars); pool_register (r->pool, free, r->sfm_vars); r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool); @@ -868,9 +874,8 @@ sfm_decode (struct any_reader *r_, const char *encoding, } return casereader_create_sequential - (NULL, r->proto, - r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt, - &sys_file_casereader_class, r); + (NULL, r->proto, r->n_cases == -1 ? CASENUMBER_MAX : r->n_cases, + &sys_file_casereader_class, r); error: sfm_close (r_); @@ -1016,10 +1021,10 @@ read_header (struct sfm_reader *r, struct any_read_info *info, if (!read_int (r, &header->weight_idx)) return false; - if (!read_int (r, &r->case_cnt)) + if (!read_int (r, &r->n_cases)) return false; - if (r->case_cnt > INT_MAX / 2) - r->case_cnt = -1; + if (r->n_cases > INT_MAX / 2) + r->n_cases = -1; /* Identify floating-point format and obtain compression bias. */ if (!read_bytes (r, raw_bias, sizeof raw_bias)) @@ -1058,7 +1063,7 @@ read_header (struct sfm_reader *r, struct any_read_info *info, info->integer_format = r->integer_format; info->float_format = r->float_format; info->compression = r->compression; - info->case_cnt = r->case_cnt; + info->n_cases = r->n_cases; return true; } @@ -1285,6 +1290,7 @@ read_extension_record (struct sfm_reader *r, int subtype, /* Implemented record types. */ { EXT_INTEGER, 4, 8 }, { EXT_FLOAT, 8, 3 }, + { EXT_VAR_SETS, 1, 0 }, { EXT_MRSETS, 1, 0 }, { EXT_PRODUCT_INFO, 1, 0 }, { EXT_DISPLAY, 4, 0 }, @@ -1299,7 +1305,6 @@ read_extension_record (struct sfm_reader *r, int subtype, { EXT_LONG_MISSING, 1, 0 }, /* Ignored record types. */ - { EXT_VAR_SETS, 0, 0 }, { EXT_DATE, 0, 0 }, { EXT_DATA_ENTRY, 0, 0 }, { EXT_DATAVIEW, 0, 0 }, @@ -1434,8 +1439,7 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, } struct variable *var; - if (!dict_id_is_valid (dict, name, false) - || name[0] == '$' || name[0] == '#') + if (!dict_id_is_valid (dict, name) || name[0] == '$' || name[0] == '#') { var = add_var_with_generated_name (dict, rec->width); sys_warn (r, rec->pos, _("Renaming variable with invalid name " @@ -1542,9 +1546,9 @@ parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format, if (fmt_from_u32 (format, var_get_width (v), false, &f)) { if (which == PRINT_FORMAT) - var_set_print_format (v, &f); + var_set_print_format (v, f); else - var_set_write_format (v, &f); + var_set_write_format (v, f); } else if (format == 0) { @@ -1818,7 +1822,7 @@ decode_mrsets (struct sfm_reader *r, struct dictionary *dict) size_t i; name = recode_string ("UTF-8", r->encoding, s->name, -1); - if (!mrset_is_valid_name (name, dict_get_encoding (dict), false)) + if (!mrset_is_valid_name (name, dict_get_encoding (dict))) { sys_warn (r, -1, _("Invalid multiple response set name `%s'."), name); @@ -1918,7 +1922,7 @@ parse_display_parameters (struct sfm_reader *r, size_t ofs; size_t i; - n_vars = dict_get_var_cnt (dict); + n_vars = dict_get_n_vars (dict); if (record->count == 3 * n_vars) includes_width = true; else if (record->count == 2 * n_vars) @@ -1993,7 +1997,7 @@ rename_var_and_save_short_names (struct sfm_reader *r, off_t pos, /* Renaming a variable may clear its short names, but we want to retain them, so we save them and re-set them afterward. */ - n_short_names = var_get_short_name_cnt (var); + n_short_names = var_get_n_short_names (var); short_names = xnmalloc (n_short_names, sizeof *short_names); for (i = 0; i < n_short_names; i++) { @@ -2031,7 +2035,7 @@ parse_long_var_name_map (struct sfm_reader *r, converted to lowercase, as the long variable names. */ size_t i; - for (i = 0; i < dict_get_var_cnt (dict); i++) + for (i = 0; i < dict_get_n_vars (dict); i++) { struct variable *var = dict_get_var (dict, i); char *new_name; @@ -2052,7 +2056,7 @@ parse_long_var_name_map (struct sfm_reader *r, while (read_variable_to_value_pair (r, dict, text, &var, &long_name)) { /* Validate long name. */ - if (!dict_id_is_valid (dict, long_name, false) + if (!dict_id_is_valid (dict, long_name) || long_name[0] == '$' || long_name[0] == '#') { sys_warn (r, record->pos, @@ -2083,7 +2087,6 @@ parse_long_string_map (struct sfm_reader *r, { size_t idx = var_get_dict_index (var); long int length; - int segment_cnt; int i; /* Get length. */ @@ -2098,8 +2101,8 @@ parse_long_string_map (struct sfm_reader *r, } /* Check segments. */ - segment_cnt = sfm_width_to_segments (length); - if (segment_cnt == 1) + int n_segments = sfm_width_to_segments (length); + if (n_segments == 1) { sys_warn (r, record->pos, _("%s listed in very long string record with width %s, " @@ -2107,7 +2110,7 @@ parse_long_string_map (struct sfm_reader *r, var_get_name (var), length_s); continue; } - if (idx + segment_cnt > dict_get_var_cnt (dict)) + if (idx + n_segments > dict_get_n_vars (dict)) { sys_error (r, record->pos, _("Very long string %s overflows dictionary."), @@ -2117,7 +2120,7 @@ parse_long_string_map (struct sfm_reader *r, /* Get the short names from the segments and check their lengths. */ - for (i = 0; i < segment_cnt; i++) + for (i = 0; i < n_segments; i++) { struct variable *seg = dict_get_var (dict, idx + i); int alloc_width = sfm_segment_alloc_width (length, i); @@ -2134,11 +2137,10 @@ parse_long_string_map (struct sfm_reader *r, return false; } } - dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1); + dict_delete_consecutive_vars (dict, idx + 1, n_segments - 1); var_set_width (var, length); } close_text_record (r, text); - dict_compact_values (dict); return true; } @@ -2427,7 +2429,7 @@ assign_variable_roles (struct sfm_reader *r, struct dictionary *dict) size_t n_warnings = 0; size_t i; - for (i = 0; i < dict_get_var_cnt (dict); i++) + for (i = 0; i < dict_get_n_vars (dict); i++) { struct variable *var = dict_get_var (dict, i); struct attrset *attrs = var_get_attributes (var); @@ -2480,21 +2482,28 @@ assign_variable_roles (struct sfm_reader *r, struct dictionary *dict) } static bool -check_overflow (struct sfm_reader *r, - const struct sfm_extension_record *record, - size_t ofs, size_t length) +check_overflow__ (const struct sfm_extension_record *record, + size_t ofs, size_t length) { size_t end = record->size * record->count; if (length >= end || ofs + length > end) - { - sys_warn (r, record->pos + end, - _("Extension record subtype %d ends unexpectedly."), - record->subtype); - return false; - } + return false; return true; } +static bool +check_overflow (struct sfm_reader *r, + const struct sfm_extension_record *record, + size_t ofs, size_t length) +{ + bool ok = check_overflow__ (record, ofs, length); + if (!ok) + sys_warn (r, record->pos + record->size * record->count, + _("Extension record subtype %d ends unexpectedly."), + record->subtype); + return ok; +} + static void parse_long_string_value_labels (struct sfm_reader *r, const struct sfm_extension_record *record, @@ -2621,6 +2630,7 @@ parse_long_string_missing_values (struct sfm_reader *r, size_t end = record->size * record->count; size_t ofs = 0; + bool warned = false; while (ofs < end) { struct missing_values mv; @@ -2669,17 +2679,32 @@ parse_long_string_missing_values (struct sfm_reader *r, var = NULL; } + /* Parse value length. */ + if (!check_overflow (r, record, ofs, 4)) + return; + size_t value_length = parse_int (r, record->data, ofs); + ofs += 4; + /* Parse values. */ mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8); for (i = 0; i < n_missing_values; i++) { - size_t value_length; - - /* Parse value length. */ - if (!check_overflow (r, record, ofs, 4)) - return; - value_length = parse_int (r, record->data, ofs); - ofs += 4; + /* Tolerate files written by old, buggy versions of PSPP where we + believed that the value_length was repeated before each missing + value. */ + if (check_overflow__ (record, ofs, value_length) + && parse_int (r, record->data, ofs) == 8) + { + if (!warned) + { + sys_warn (r, record->pos + ofs, + _("This file has corrupted metadata written by a " + "buggy version of PSPP. To fix it, save a new " + "copy of the file.")); + warned = true; + } + ofs += 4; + } /* Parse value. */ if (!check_overflow (r, record, ofs, value_length)) @@ -2699,6 +2724,57 @@ parse_long_string_missing_values (struct sfm_reader *r, var_set_missing_values (var, &mv); } } + +static void +parse_var_sets (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text = open_text_record (r, record, true); + for (;;) + { + char *varset_name = text_get_token (text, ss_cstr ("="), NULL); + if (!varset_name) + break; + + struct varset *varset = xmalloc (sizeof *varset); + *varset = (struct varset) { + .name = xstrdup (varset_name), + }; + + text_match (text, ' '); + + size_t allocated_vars = 0; + char delimiter; + do + { + char *var_name = text_get_token (text, ss_cstr (" \n"), &delimiter); + if (!var_name) + break; + + size_t len = strlen (var_name); + if (len > 0 && var_name[len - 1] == '\r') + var_name[len - 1] = '\0'; + + struct variable *var = dict_lookup_var (dict, var_name); + if (var) + { + if (varset->n_vars >= allocated_vars) + varset->vars = x2nrealloc (varset->vars, &allocated_vars, + sizeof *varset->vars); + varset->vars[varset->n_vars++] = var; + } + else + sys_warn (r, record->pos, + _("Variable set %s contains unknown variable %s."), + varset_name, var_name); + } + while (delimiter == ' '); + + dict_add_varset (dict, varset); + } + close_text_record (r, text); +} /* Case reader. */ @@ -2724,12 +2800,12 @@ sys_file_casereader_read (struct casereader *reader, void *r_) int retval; int i; - if (r->error || !r->sfm_var_cnt) + if (r->error || !r->sfm_n_vars) return NULL; c = case_create (r->proto); - for (i = 0; i < r->sfm_var_cnt; i++) + for (i = 0; i < r->sfm_n_vars; i++) { struct sfm_var *sv = &r->sfm_vars[i]; union value *v = case_data_rw_idx (c, sv->case_index); @@ -2755,7 +2831,7 @@ sys_file_casereader_read (struct casereader *reader, void *r_) eof: if (i != 0) partial_record (r); - if (r->case_cnt != -1) + if (r->n_cases != -1) read_error (reader, r); case_unref (c); return NULL; @@ -3284,11 +3360,11 @@ sys_error (struct sfm_reader *r, off_t offset, const char *format, ...) an error. */ static inline int read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, - void *buf, size_t byte_cnt) + void *buf, size_t n_bytes) { - size_t bytes_read = fread (buf, 1, byte_cnt, r->file); + size_t bytes_read = fread (buf, 1, n_bytes, r->file); r->pos += bytes_read; - if (bytes_read == byte_cnt) + if (bytes_read == n_bytes) return 1; else if (ferror (r->file)) { @@ -3308,9 +3384,9 @@ read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, Returns true if successful. Returns false upon I/O error or if end-of-file is encountered. */ static bool -read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { - return read_bytes_internal (r, false, buf, byte_cnt) == 1; + return read_bytes_internal (r, false, buf, n_bytes) == 1; } /* Reads BYTE_CNT bytes into BUF. @@ -3318,9 +3394,9 @@ read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) Returns 0 if an immediate end-of-file is encountered. Returns -1 if an I/O error or a partial read occurs. */ static int -try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +try_read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { - return read_bytes_internal (r, true, buf, byte_cnt); + return read_bytes_internal (r, true, buf, n_bytes); } /* Reads a 32-bit signed integer from R and stores its value in host format in @@ -3711,11 +3787,11 @@ close_zstream (struct sfm_reader *r) } static int -read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) +read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t n_bytes) { uint8_t *buf = buf_; - if (byte_cnt == 0) + if (n_bytes == 0) return 1; for (;;) @@ -3725,13 +3801,13 @@ read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) /* Use already inflated data if there is any. */ if (r->zout_pos < r->zout_end) { - unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos); + unsigned int n = MIN (n_bytes, r->zout_end - r->zout_pos); memcpy (buf, &r->zout_buf[r->zout_pos], n); r->zout_pos += n; - byte_cnt -= n; + n_bytes -= n; buf += n; - if (byte_cnt == 0) + if (n_bytes == 0) return 1; } @@ -3778,13 +3854,13 @@ read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) } static int -read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { if (r->compression == ANY_COMP_SIMPLE) - return read_bytes (r, buf, byte_cnt); + return read_bytes (r, buf, n_bytes); else { - int retval = read_bytes_zlib (r, buf, byte_cnt); + int retval = read_bytes_zlib (r, buf, n_bytes); if (retval == 0) sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data.")); return retval; @@ -3792,12 +3868,12 @@ read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) } static int -try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes) { if (r->compression == ANY_COMP_SIMPLE) - return try_read_bytes (r, buf, byte_cnt); + return try_read_bytes (r, buf, n_bytes); else - return read_bytes_zlib (r, buf, byte_cnt); + return read_bytes_zlib (r, buf, n_bytes); } /* Reads a 64-bit floating-point number from R and returns its