X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=f63a122fe83b96776b632cef57b51e49c83c4bd1;hb=3bbb4370239deb29ebbf813d258aef6249e2a431;hp=cff462ac143df5184aeeec231d1ad388ef278cde;hpb=6b562f8a8263930b8d1ed1862efec76f2511ed08;p=pspp-builds.git diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index cff462ac..f63a122f 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -87,6 +88,7 @@ struct sfm_reader double bias; /* Compression bias, usually 100.0. */ uint8_t opcodes[8]; /* Current block of opcodes. */ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + bool corruption_warning; /* Warned about possible corruption? */ }; static const struct casereader_class sys_file_casereader_class; @@ -186,6 +188,62 @@ static void read_long_string_value_labels (struct sfm_reader *, size_t size, size_t count, struct dictionary *); +/* Convert all the strings in DICT from the dict encoding to UTF8 */ +static void +recode_strings (struct dictionary *dict) +{ + int i; + + const char *enc = dict_get_encoding (dict); + + if ( NULL == enc) + enc = get_default_encoding (); + + for (i = 0 ; i < dict_get_var_cnt (dict); ++i) + { + /* Convert the long variable name */ + struct variable *var = dict_get_var (dict, i); + const char *native_name = var_get_name (var); + char *utf8_name = recode_string (UTF8, enc, native_name, -1); + if ( 0 != strcmp (utf8_name, native_name)) + { + if ( NULL == dict_lookup_var (dict, utf8_name)) + dict_rename_var (dict, var, utf8_name); + else + msg (MW, + _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name); + } + + free (utf8_name); + + /* Convert the variable label */ + if (var_has_label (var)) + { + char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1); + var_set_label (var, utf8_label); + free (utf8_label); + } + + if (var_has_value_labels (var)) + { + const struct val_lab *vl = NULL; + const struct val_labs *vlabs = var_get_value_labels (var); + + for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl)) + { + const union value *val = val_lab_get_value (vl); + const char *label = val_lab_get_label (vl); + char *new_label = NULL; + + new_label = recode_string (UTF8, enc, label, -1); + + var_replace_value_label (var, val, new_label); + free (new_label); + } + } + } +} + /* Opens the system file designated by file handle FH for reading. Reads the system file's dictionary into *DICT. If INFO is non-null, then it receives additional info about the @@ -213,6 +271,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, r->oct_cnt = 0; r->has_long_var_names = false; r->opcode_idx = sizeof r->opcodes; + r->corruption_warning = false; /* TRANSLATORS: this fragment will be interpolated into messages in fh_lock() that identify types of files. */ @@ -303,6 +362,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, r->has_long_var_names = true; } + recode_strings (*dict); + /* Read record 999 data, which is just filler. */ read_int (r); @@ -446,9 +507,21 @@ read_header (struct sfm_reader *r, struct dictionary *dict, read_bytes (r, raw_bias, sizeof raw_bias); if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0) { - sys_warn (r, _("Compression bias is not the usual " - "value of 100, or system file uses unrecognized " - "floating-point format.")); + uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (memcmp (raw_bias, zero_bias, 8)) + sys_warn (r, _("Compression bias is not the usual " + "value of 100, or system file uses unrecognized " + "floating-point format.")); + else + { + /* Some software is known to write all-zeros to this + field. Such software also writes floating-point + numbers in the format that we expect by default + (it seems that all software most likely does, in + reality), so don't warn in this case. */ + } + if (r->integer_format == INTEGER_MSB_FIRST) r->float_format = FLOAT_IEEE_DOUBLE_BE; else @@ -518,7 +591,7 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, /* Create variable. */ if (width < 0 || width > 255) - sys_error (r, _("Bad variable width %d."), width); + sys_error (r, _("Bad width %d for variable %s."), width, name); var = dict_create_var (dict, name, width); if (var == NULL) sys_error (r, @@ -552,7 +625,7 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, struct missing_values mv; int i; - mv_init (&mv, var_get_width (var)); + mv_init_pool (r->pool, &mv, var_get_width (var)); if (var_is_numeric (var)) { if (missing_value_code < -3 || missing_value_code > 3 @@ -571,21 +644,24 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, } else { + int mv_width = MAX (width, 8); + union value value; + if (missing_value_code < 1 || missing_value_code > 3) sys_error (r, _("String missing value indicator field is not " "0, 1, 2, or 3.")); - if (var_is_long_string (var)) - sys_warn (r, _("Ignoring missing values on long string variable " - "%s, which PSPP does not yet support."), name); + + value_init (&value, mv_width); + value_set_missing (&value, mv_width); for (i = 0; i < missing_value_code; i++) { - char string[9]; - read_string (r, string, sizeof string); - mv_add_str (&mv, string); + uint8_t *s = value_str_rw (&value, mv_width); + read_bytes (r, s, 8); + mv_add_str (&mv, s); } + value_destroy (&value, mv_width); } - if (!var_is_long_string (var)) - var_set_missing_values (var, &mv); + var_set_missing_values (var, &mv); } /* Set formats. */ @@ -784,7 +860,7 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict, /* New in SPSS 16. Contains a single string that describes the character encoding, e.g. "windows-1252". */ { - char *encoding = xcalloc (size, count + 1); + char *encoding = pool_calloc (r->pool, size, count + 1); read_string (r, encoding, count + 1); dict_set_encoding (dict, encoding); return; @@ -857,7 +933,7 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count, NOT_REACHED (); if (integer_representation != expected_integer_format) { - static const char *const endian[] = {N_("little-endian"), N_("big-endian")}; + static const char *const endian[] = {N_("Little Endian"), N_("Big Endian")}; sys_warn (r, _("Integer format indicated by system file (%s) " "differs from expected (%s)."), gettext (endian[integer_representation == 1]), @@ -919,11 +995,16 @@ read_machine_float_info (struct sfm_reader *r, size_t size, size_t count) size, count); if (sysmis != SYSMIS) - sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis); + sys_warn (r, _("File specifies unexpected value %g as %s."), + sysmis, "SYSMIS"); + if (highest != HIGHEST) - sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest); + sys_warn (r, _("File specifies unexpected value %g as %s."), + highest, "HIGHEST"); + if (lowest != LOWEST) - sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest); + sys_warn (r, _("File specifies unexpected value %g as %s."), + lowest, "LOWEST"); } /* Read record type 7, subtype 11, which specifies how variables @@ -1130,7 +1211,7 @@ read_value_labels (struct sfm_reader *r, struct label { - char raw_value[8]; /* Value as uninterpreted bytes. */ + uint8_t raw_value[8]; /* Value as uninterpreted bytes. */ union value value; /* Value. */ char *label; /* Null-terminated label string. */ }; @@ -1203,7 +1284,7 @@ read_value_labels (struct sfm_reader *r, for (i = 0; i < var_cnt; i++) { var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r)); - if (var_is_long_string (var[i])) + if (var_get_width (var[i]) > 8) sys_error (r, _("Value labels may not be added to long string " "variables (e.g. %s) using records types 3 and 4."), var_get_name (var[i])); @@ -1228,7 +1309,7 @@ read_value_labels (struct sfm_reader *r, value_init_pool (subpool, &label->value, max_width); if (var_is_alpha (var[0])) - buf_copy_rpad (value_str_rw (&label->value, max_width), max_width, + u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width, label->raw_value, sizeof label->raw_value, ' '); else label->value.f = float_get_double (r->float_format, label->raw_value); @@ -1408,7 +1489,7 @@ read_long_string_value_labels (struct sfm_reader *r, /* Read value. */ value_length = read_int (r); if (value_length == width) - read_string (r, value_str_rw (&value, width), width + 1); + read_bytes (r, value_str_rw (&value, width), width); else { sys_warn (r, _("Ignoring long string value %zu for variable %s, " @@ -1465,11 +1546,11 @@ static void partial_record (struct sfm_reader *r) static void read_error (struct casereader *, const struct sfm_reader *); static bool read_case_number (struct sfm_reader *, double *); -static bool read_case_string (struct sfm_reader *, char *, size_t); +static bool read_case_string (struct sfm_reader *, uint8_t *, size_t); static int read_opcode (struct sfm_reader *); static bool read_compressed_number (struct sfm_reader *, double *); -static bool read_compressed_string (struct sfm_reader *, char *); -static bool read_whole_strings (struct sfm_reader *, char *, size_t); +static bool read_compressed_string (struct sfm_reader *, uint8_t *); +static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t); static bool skip_whole_strings (struct sfm_reader *, size_t); /* Reads and returns one case from READER's file. Returns a null @@ -1504,7 +1585,7 @@ sys_file_casereader_read (struct casereader *reader, void *r_) } else { - char *s = value_str_rw (v, sv->var_width); + uint8_t *s = value_str_rw (v, sv->var_width); if (!read_case_string (r, s + sv->offset, sv->segment_width)) goto eof; if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8))) @@ -1566,7 +1647,7 @@ read_case_number (struct sfm_reader *r, double *d) Returns true if successful, false if end of file is reached immediately. */ static bool -read_case_string (struct sfm_reader *r, char *s, size_t length) +read_case_string (struct sfm_reader *r, uint8_t *s, size_t length) { size_t whole = ROUND_DOWN (length, 8); size_t partial = length % 8; @@ -1579,7 +1660,7 @@ read_case_string (struct sfm_reader *r, char *s, size_t length) if (partial) { - char bounce[8]; + uint8_t bounce[8]; if (!read_whole_strings (r, bounce, sizeof bounce)) { if (whole) @@ -1631,7 +1712,14 @@ read_compressed_number (struct sfm_reader *r, double *d) break; case 254: - sys_error (r, _("Compressed data is corrupt.")); + float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d); + if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, _("Possible compressed data corruption: " + "compressed spaces appear in numeric field.")); + } + break; case 255: *d = SYSMIS; @@ -1650,9 +1738,10 @@ read_compressed_number (struct sfm_reader *r, double *d) Returns true if successful, false if end of file is reached immediately. */ static bool -read_compressed_string (struct sfm_reader *r, char *dst) +read_compressed_string (struct sfm_reader *r, uint8_t *dst) { - switch (read_opcode (r)) + int opcode = read_opcode (r); + switch (opcode) { case -1: case 252: @@ -1667,7 +1756,25 @@ read_compressed_string (struct sfm_reader *r, char *dst) break; default: - sys_error (r, _("Compressed data is corrupt.")); + { + double value = opcode - r->bias; + float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst); + if (value == 0.0) + { + /* This has actually been seen "in the wild". The submitter of the + file that showed that the contents decoded as spaces, but they + were at the end of the field so it's possible that the null + bytes just acted as null terminators. */ + } + else if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, _("Possible compressed data corruption: " + "string contains compressed integer (opcode %d)"), + opcode); + } + } + break; } return true; @@ -1679,7 +1786,7 @@ read_compressed_string (struct sfm_reader *r, char *dst) Returns true if successful, false if end of file is reached immediately. */ static bool -read_whole_strings (struct sfm_reader *r, char *s, size_t length) +read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length) { assert (length % 8 == 0); if (!r->compressed) @@ -1707,7 +1814,7 @@ read_whole_strings (struct sfm_reader *r, char *s, size_t length) static bool skip_whole_strings (struct sfm_reader *r, size_t length) { - char buffer[1024]; + uint8_t buffer[1024]; assert (length < sizeof buffer); return read_whole_strings (r, buffer, length); }