X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=5a8402791464b663f596596b0675098d02202370;hb=17c302ddd46683b6251f52e1d6dd6de5099d5abb;hp=c80bd5f557246cead2e3cc0feeb743834a48d0f5;hpb=c3ac5a8af9c449072c7e872ca70a78c1755ae309;p=pspp-builds.git diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index c80bd5f5..5a840279 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -71,7 +71,7 @@ struct sfm_reader struct fh_lock *lock; /* Mutual exclusion for file handle. */ FILE *file; /* File stream. */ bool error; /* I/O or corruption error? */ - size_t value_cnt; /* Number of "union value"s in struct case. */ + struct caseproto *proto; /* Format of output cases. */ /* File format. */ enum integer_format integer_format; /* On-disk integer format. */ @@ -162,7 +162,9 @@ static void read_extension_record (struct sfm_reader *, struct dictionary *, struct sfm_read_info *); static void read_machine_integer_info (struct sfm_reader *, size_t size, size_t count, - struct sfm_read_info *); + struct sfm_read_info *, + struct dictionary * + ); static void read_machine_float_info (struct sfm_reader *, size_t size, size_t count); static void read_display_parameters (struct sfm_reader *, @@ -180,6 +182,9 @@ static void read_data_file_attributes (struct sfm_reader *, static void read_variable_attributes (struct sfm_reader *, size_t size, size_t count, struct dictionary *); +static void read_long_string_value_labels (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); /* Opens the system file designated by file handle FH for reading. Reads the system file's dictionary into *DICT. @@ -317,11 +322,11 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, dictionary and may destroy or modify its variables. */ sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt); pool_register (r->pool, free, r->sfm_vars); + r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool); pool_free (r->pool, var_by_value_idx); - r->value_cnt = dict_get_next_value_idx (*dict); return casereader_create_sequential - (NULL, r->value_cnt, + (NULL, r->proto, r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt, &sys_file_casereader_class, r); @@ -725,7 +730,7 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict, switch (subtype) { case 3: - read_machine_integer_info (r, size, count, info); + read_machine_integer_info (r, size, count, info, dict); return; case 4: @@ -744,7 +749,11 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict, break; case 7: - /* Unknown purpose. */ + /* Used by the MRSETS command. */ + break; + + case 8: + /* Used by the SPSS Data Entry software. */ break; case 11: @@ -774,17 +783,22 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict, case 20: /* New in SPSS 16. Contains a single string that describes the character encoding, e.g. "windows-1252". */ - break; + { + char *encoding = pool_calloc (r->pool, size, count + 1); + read_string (r, encoding, count + 1); + dict_set_encoding (dict, encoding); + return; + } case 21: /* New in SPSS 16. Encodes value labels for long string variables. */ - sys_warn (r, _("Ignoring value labels for long string variables, " - "which PSPP does not yet support.")); - break; + read_long_string_value_labels (r, size, count, dict); + return; default: - sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype); + sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"), + subtype, PACKAGE_BUGREPORT); break; } @@ -794,7 +808,8 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict, /* Read record type 7, subtype 3. */ static void read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count, - struct sfm_read_info *info) + struct sfm_read_info *info, + struct dictionary *dict) { int version_major = read_int (r); int version_minor = read_int (r); @@ -803,7 +818,7 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count, int float_representation = read_int (r); int compression_code UNUSED = read_int (r); int integer_representation = read_int (r); - int character_code UNUSED = read_int (r); + int character_code = read_int (r); int expected_float_format; int expected_integer_format; @@ -848,6 +863,47 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count, gettext (endian[integer_representation == 1]), gettext (endian[expected_integer_format == 1])); } + + + /* + Record 7 (20) provides a much more reliable way of + setting the encoding. + The character_code is used as a fallback only. + */ + if ( NULL == dict_get_encoding (dict)) + { + switch (character_code) + { + case 1: + dict_set_encoding (dict, "EBCDIC-US"); + break; + case 2: + case 3: + /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + respectively. However, there are known to be many files + in the wild with character code 2, yet have data which are + clearly not ascii. + Therefore we ignore these values. + */ + return; + case 4: + dict_set_encoding (dict, "MS_KANJI"); + break; + case 65000: + dict_set_encoding (dict, "UTF-7"); + break; + case 65001: + dict_set_encoding (dict, "UTF-8"); + break; + default: + { + char enc[100]; + snprintf (enc, 100, "CP%d", character_code); + dict_set_encoding (dict, enc); + } + break; + }; + } } /* Read record type 7, subtype 4. */ @@ -1084,6 +1140,7 @@ read_value_labels (struct sfm_reader *r, struct variable **var = NULL; /* Associated variables. */ int var_cnt; /* Number of associated variables. */ + int max_width; /* Maximum width of string variables. */ int i; @@ -1142,12 +1199,15 @@ read_value_labels (struct sfm_reader *r, /* Read the list of variables. */ var = pool_nalloc (subpool, var_cnt, sizeof *var); + max_width = 0; for (i = 0; i < var_cnt; i++) { var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r)); if (var_is_long_string (var[i])) - sys_error (r, _("Value labels are not allowed on long string " - "variables (%s)."), var_get_name (var[i])); + sys_error (r, _("Value labels may not be added to long string " + "variables (e.g. %s) using records types 3 and 4."), + var_get_name (var[i])); + max_width = MAX (max_width, var_get_width (var[i])); } /* Type check the variables. */ @@ -1166,9 +1226,10 @@ read_value_labels (struct sfm_reader *r, { struct label *label = labels + i; + value_init_pool (subpool, &label->value, max_width); if (var_is_alpha (var[0])) - buf_copy_rpad (label->value.s, sizeof label->value.s, - label->raw_value, sizeof label->raw_value); + buf_copy_rpad (value_str_rw (&label->value, max_width), max_width, + label->raw_value, sizeof label->raw_value, ' '); else label->value.f = float_get_double (r->float_format, label->raw_value); } @@ -1190,7 +1251,7 @@ read_value_labels (struct sfm_reader *r, label->value.f, var_get_name (v)); else sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."), - var_get_width (v), label->value.s, + max_width, value_str (&label->value, max_width), var_get_name (v)); } } @@ -1270,6 +1331,113 @@ read_data_file_attributes (struct sfm_reader *r, close_text_record (r, text); } +static void +skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels) +{ + size_t i; + + for (i = 0; i < n_labels; i++) + { + size_t value_length, label_length; + + value_length = read_int (r); + skip_bytes (r, value_length); + label_length = read_int (r); + skip_bytes (r, label_length); + } +} + +static void +read_long_string_value_labels (struct sfm_reader *r, + size_t size, size_t count, + struct dictionary *d) +{ + const off_t start = ftello (r->file); + while (ftello (r->file) - start < size * count) + { + char var_name[VAR_NAME_LEN + 1]; + size_t n_labels, i; + struct variable *v; + union value value; + int var_name_len; + int width; + + /* Read header. */ + var_name_len = read_int (r); + if (var_name_len > VAR_NAME_LEN) + sys_error (r, _("Variable name length in long string value label " + "record (%d) exceeds %d-byte limit."), + var_name_len, VAR_NAME_LEN); + read_string (r, var_name, var_name_len + 1); + width = read_int (r); + n_labels = read_int (r); + + v = dict_lookup_var (d, var_name); + if (v == NULL) + { + sys_warn (r, _("Ignoring long string value record for " + "unknown variable %s."), var_name); + skip_long_string_value_labels (r, n_labels); + continue; + } + if (var_is_numeric (v)) + { + sys_warn (r, _("Ignoring long string value record for " + "numeric variable %s."), var_name); + skip_long_string_value_labels (r, n_labels); + continue; + } + if (width != var_get_width (v)) + { + sys_warn (r, _("Ignoring long string value record for variable %s " + "because the record's width (%d) does not match the " + "variable's width (%d)"), + var_name, width, var_get_width (v)); + skip_long_string_value_labels (r, n_labels); + continue; + } + + /* Read values. */ + value_init_pool (r->pool, &value, width); + for (i = 0; i < n_labels; i++) + { + size_t value_length, label_length; + char label[256]; + bool skip = false; + + /* Read value. */ + value_length = read_int (r); + if (value_length == width) + read_string (r, value_str_rw (&value, width), width + 1); + else + { + sys_warn (r, _("Ignoring long string value %zu for variable %s, " + "with width %d, that has bad value width %zu."), + i, var_get_name (v), width, value_length); + skip_bytes (r, value_length); + skip = true; + } + + /* Read label. */ + label_length = read_int (r); + read_string (r, label, MIN (sizeof label, label_length + 1)); + if (label_length >= sizeof label) + { + /* Skip and silently ignore label text after the + first 255 bytes. The maximum documented length + of a label is 120 bytes so this is more than + generous. */ + skip_bytes (r, sizeof label - (label_length + 1)); + } + + if (!skip && !var_add_value_label (v, &value, label)) + sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."), + width, value_str (&value, width), var_get_name (v)); + } + } +} + + /* Reads record type 7, subtype 18, which lists custom attributes on individual variables. */ static void @@ -1304,24 +1472,24 @@ static bool read_compressed_string (struct sfm_reader *, char *); static bool read_whole_strings (struct sfm_reader *, char *, size_t); static bool skip_whole_strings (struct sfm_reader *, size_t); -/* Reads one case from READER's file into C. Returns true only - if successful. */ -static bool -sys_file_casereader_read (struct casereader *reader, void *r_, - struct ccase *c) +/* Reads and returns one case from READER's file. Returns a null + pointer if not successful. */ +static struct ccase * +sys_file_casereader_read (struct casereader *reader, void *r_) { struct sfm_reader *r = r_; + struct ccase *volatile c; int i; if (r->error) - return false; + return NULL; - case_create (c, r->value_cnt); + c = case_create (r->proto); if (setjmp (r->bail_out)) { casereader_force_error (reader); - case_destroy (c); - return false; + case_unref (c); + return NULL; } for (i = 0; i < r->sfm_var_cnt; i++) @@ -1329,28 +1497,29 @@ sys_file_casereader_read (struct casereader *reader, void *r_, struct sfm_var *sv = &r->sfm_vars[i]; union value *v = case_data_rw_idx (c, sv->case_index); - if (sv->width == 0) + if (sv->var_width == 0) { if (!read_case_number (r, &v->f)) goto eof; } else { - if (!read_case_string (r, v->s + sv->offset, sv->width)) + char *s = value_str_rw (v, sv->var_width); + if (!read_case_string (r, s + sv->offset, sv->segment_width)) goto eof; if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8))) partial_record (r); } } - return true; + return c; eof: - case_destroy (c); + case_unref (c); if (i != 0) partial_record (r); if (r->case_cnt != -1) read_error (reader, r); - return false; + return NULL; } /* Issues an error that R ends in a partial record. */