X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=src%2Fdata%2Fsys-file-reader.c;h=35f40d3dbc4755f3365b8549fd2966f121f1f259;hb=8f5194875a0a3d41fef91825fd8378bb004d6f51;hp=024b4ae1827994d1d6d60a7820e4535295190f5d;hpb=dff37440177a355bfc0cf9ff56428114e29f5106;p=pspp diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 024b4ae182..35f40d3dbc 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -50,6 +50,7 @@ #include "libpspp/str.h" #include "libpspp/stringi-set.h" +#include "gl/c-strtod.h" #include "gl/c-ctype.h" #include "gl/inttostr.h" #include "gl/localcharset.h" @@ -312,15 +313,20 @@ sfm_read_info_destroy (struct sfm_read_info *info) /* Opens the system file designated by file handle FH for reading. Reads the system file's dictionary into *DICT. + Ordinarily the reader attempts to automatically detect the character + encoding based on the file's contents. This isn't always possible, + especially for files written by old versions of SPSS or PSPP, so specifying + a nonnull ENCODING overrides the choice of character encoding. + If INFO is non-null, then it receives additional info about the system file, which the caller must eventually free with sfm_read_info_destroy() when it is no longer needed. */ struct casereader * -sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, - struct sfm_read_info *infop) +sfm_open_reader (struct file_handle *fh, const char *volatile encoding, + struct dictionary **dictp, struct sfm_read_info *infop) { struct sfm_reader *volatile r = NULL; - struct sfm_read_info info; + struct sfm_read_info *volatile info; struct sfm_header_record header; @@ -334,7 +340,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, struct sfm_extension_record *extensions[32]; - struct dictionary *dict = NULL; + struct dictionary *volatile dict = NULL; size_t i; /* Create and initialize reader. */ @@ -347,7 +353,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, r->opcode_idx = sizeof r->opcodes; r->corruption_warning = false; - memset (&info, 0, sizeof info); + info = infop ? infop : xmalloc (sizeof *info); + memset (info, 0, sizeof *info); /* TRANSLATORS: this fragment will be interpolated into messages in fh_lock() that identify types of files. */ @@ -367,7 +374,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, goto error; /* Read header. */ - read_header (r, &info, &header); + read_header (r, info, &header); vars = NULL; n_vars = allocated_vars = 0; @@ -454,8 +461,10 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, First, figure out the correct character encoding, because this determines how the rest of the header data is to be interpreted. */ - dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER], - extensions[EXT_ENCODING])); + dict = dict_create (encoding + ? encoding + : choose_encoding (r, &header, extensions[EXT_INTEGER], + extensions[EXT_ENCODING])); r->encoding = dict_get_encoding (dict); /* These records don't use variables at all. */ @@ -463,7 +472,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, parse_document (dict, document); if (extensions[EXT_INTEGER] != NULL) - parse_machine_integer_info (r, extensions[EXT_INTEGER], &info); + parse_machine_integer_info (r, extensions[EXT_INTEGER], info); if (extensions[EXT_FLOAT] != NULL) parse_machine_float_info (r, extensions[EXT_FLOAT]); @@ -471,7 +480,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, if (extensions[EXT_FILE_ATTRS] != NULL) parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict); - parse_header (r, &header, &info, dict); + parse_header (r, &header, info, dict); /* Parse the variable records, the basis of almost everything else. */ parse_variable_records (r, dict, vars, n_vars); @@ -524,7 +533,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, wrong when very long strings are involved, so don't warn in that case. */ if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars - && info.version_major != 13) + && info->version_major != 13) sys_warn (r, -1, _("File header claims %d variable positions but " "%zu were read from file."), header.nominal_case_size, n_vars); @@ -538,10 +547,11 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool); *dictp = dict; - if (infop) - *infop = info; - else - sfm_read_info_destroy (&info); + if (infop != info) + { + sfm_read_info_destroy (info); + free (info); + } return casereader_create_sequential (NULL, r->proto, @@ -549,7 +559,12 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, &sys_file_casereader_class, r); error: - sfm_read_info_destroy (&info); + if (infop != info) + { + sfm_read_info_destroy (info); + free (info); + } + close_reader (r); dict_destroy (dict); *dictp = NULL; @@ -1414,6 +1429,7 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, _("MRSET %s has only %zu variables."), mrset->name, mrset->n_vars); mrset_destroy (mrset); + stringi_set_destroy (&var_names); continue; } @@ -1422,7 +1438,7 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, mrset->width = width; value_init (&mrset->counted, width); if (width == 0) - mrset->counted.f = strtod (counted, NULL); + mrset->counted.f = c_strtod (counted, NULL); else value_copy_str_rpad (&mrset->counted, width, (const uint8_t *) counted, ' '); @@ -1557,7 +1573,8 @@ parse_long_var_name_map (struct sfm_reader *r, if (record == NULL) { - /* Convert variable names to lowercase. */ + /* There are no long variable names. Use the short variable names, + converted to lowercase, as the long variable names. */ size_t i; for (i = 0; i < dict_get_var_cnt (dict); i++) @@ -1565,11 +1582,8 @@ parse_long_var_name_map (struct sfm_reader *r, struct variable *var = dict_get_var (dict, i); char *new_name; - new_name = xstrdup (var_get_name (var)); - str_lowercase (new_name); - + new_name = utf8_to_lower (var_get_name (var)); rename_var_and_save_short_names (dict, var, new_name); - free (new_name); } @@ -1584,7 +1598,6 @@ parse_long_var_name_map (struct sfm_reader *r, while (read_variable_to_value_pair (r, dict, text, &var, &long_name)) { /* Validate long name. */ - /* XXX need to reencode name to UTF-8 */ if (!dict_id_is_valid (dict, long_name, false)) { sys_warn (r, record->pos, @@ -1595,7 +1608,7 @@ parse_long_var_name_map (struct sfm_reader *r, } /* Identify any duplicates. */ - if (strcasecmp (var_get_short_name (var, 0), long_name) + if (utf8_strcasecmp (var_get_short_name (var, 0), long_name) && dict_lookup_var (dict, long_name) != NULL) { sys_warn (r, record->pos, @@ -2432,7 +2445,7 @@ text_parse_counted_string (struct sfm_reader *r, struct text_record *text) start = text->pos; n = 0; - for (;;) + while (text->pos < text->buffer.length) { int c = text->buffer.string[text->pos]; if (c < '0' || c > '9') @@ -2440,7 +2453,7 @@ text_parse_counted_string (struct sfm_reader *r, struct text_record *text) n = (n * 10) + (c - '0'); text->pos++; } - if (start == text->pos) + if (text->pos >= text->buffer.length || start == text->pos) { sys_warn (r, text->start, _("Expecting digit at offset %zu in MRSETS record."),