X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=d553b3a0e5796b4393a4f37d03c38057a14501c8;hb=refs%2Fbuilds%2F20131017030503%2Fpspp;hp=35f40d3dbc4755f3365b8549fd2966f121f1f259;hpb=6f3865480503c571963d8a2d1af858a4d72d4e88;p=pspp diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 35f40d3dbc..d553b3a0e5 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc. + Copyright (C) 1997-2000, 2006-2007, 2009-2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -72,7 +72,8 @@ enum EXT_DATE = 6, /* DATE. */ EXT_MRSETS = 7, /* Multiple response sets. */ EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */ - /* subtypes 9-10 unknown */ + /* subtype 9 unknown */ + EXT_PRODUCT_INFO = 10, /* Extra product info text. */ EXT_DISPLAY = 11, /* Variable display parameters. */ /* subtype 12 unknown */ EXT_LONG_NAMES = 13, /* Long variable names. */ @@ -83,7 +84,9 @@ enum EXT_VAR_ATTRS = 18, /* Variable attributes. */ EXT_MRSETS2 = 19, /* Multiple response sets (extended). */ EXT_ENCODING = 20, /* Character encoding. */ - EXT_LONG_LABELS = 21 /* Value labels for long strings. */ + EXT_LONG_LABELS = 21, /* Value labels for long strings. */ + EXT_LONG_MISSING = 22, /* Missing values for long strings. */ + EXT_DATAVIEW = 24 /* "Format properties in dataview table". */ }; /* Fields from the top-level header record. */ @@ -139,6 +142,7 @@ struct sfm_document_record struct sfm_extension_record { + int subtype; /* Record subtype. */ off_t pos; /* Starting offset in file. */ size_t size; /* Size of data elements. */ size_t count; /* Number of data elements. */ @@ -200,6 +204,8 @@ static double read_float (struct sfm_reader *); static void read_string (struct sfm_reader *, char *, size_t); static void skip_bytes (struct sfm_reader *, size_t); +static char *fix_line_ends (const char *); + static int parse_int (struct sfm_reader *, const void *data, size_t ofs); static double parse_float (struct sfm_reader *, const void *data, size_t ofs); @@ -245,6 +251,7 @@ static bool text_read_short_name (struct sfm_reader *, struct dictionary *, static const char *text_parse_counted_string (struct sfm_reader *, struct text_record *); static size_t text_pos (const struct text_record *); +static const char *text_get_all (const struct text_record *); static bool close_reader (struct sfm_reader *r); @@ -275,6 +282,9 @@ static void parse_machine_integer_info (struct sfm_reader *, struct sfm_read_info *); static void parse_machine_float_info (struct sfm_reader *, const struct sfm_extension_record *); +static void parse_extra_product_info (struct sfm_reader *, + const struct sfm_extension_record *, + struct sfm_read_info *); static void parse_mrsets (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); @@ -294,9 +304,13 @@ static void parse_data_file_attributes (struct sfm_reader *, static void parse_variable_attributes (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); +static void assign_variable_roles (struct sfm_reader *, struct dictionary *); static void parse_long_string_value_labels (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); +static void parse_long_string_missing_values (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); /* Frees the strings inside INFO. */ void @@ -307,6 +321,7 @@ sfm_read_info_destroy (struct sfm_read_info *info) free (info->creation_date); free (info->creation_time); free (info->product); + free (info->product_ext); } } @@ -477,6 +492,9 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding, if (extensions[EXT_FLOAT] != NULL) parse_machine_float_info (r, extensions[EXT_FLOAT]); + if (extensions[EXT_PRODUCT_INFO] != NULL) + parse_extra_product_info (r, extensions[EXT_PRODUCT_INFO], info); + if (extensions[EXT_FILE_ATTRS] != NULL) parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict); @@ -523,10 +541,17 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding, /* The following records use long names, so they need to follow renaming. */ if (extensions[EXT_VAR_ATTRS] != NULL) - parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict); + { + parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict); + + /* Roles use the $@Role attribute. */ + assign_variable_roles (r, dict); + } if (extensions[EXT_LONG_LABELS] != NULL) parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict); + if (extensions[EXT_LONG_MISSING] != NULL) + parse_long_string_missing_values (r, extensions[EXT_LONG_MISSING], dict); /* Warn if the actual amount of data per case differs from the amount that the header claims. SPSS version 13 gets this @@ -844,6 +869,7 @@ static void read_extension_record_header (struct sfm_reader *r, int subtype, struct sfm_extension_record *record) { + record->subtype = subtype; record->pos = r->pos; record->size = read_int (r); record->count = read_int (r); @@ -873,6 +899,7 @@ read_extension_record (struct sfm_reader *r, int subtype) { EXT_INTEGER, 4, 8 }, { EXT_FLOAT, 8, 3 }, { EXT_MRSETS, 1, 0 }, + { EXT_PRODUCT_INFO, 1, 0 }, { EXT_DISPLAY, 4, 0 }, { EXT_LONG_NAMES, 1, 0 }, { EXT_LONG_STRINGS, 1, 0 }, @@ -882,11 +909,13 @@ read_extension_record (struct sfm_reader *r, int subtype) { EXT_MRSETS2, 1, 0 }, { EXT_ENCODING, 1, 0 }, { EXT_LONG_LABELS, 1, 0 }, + { EXT_LONG_MISSING, 1, 0 }, /* Ignored record types. */ { EXT_VAR_SETS, 0, 0 }, { EXT_DATE, 0, 0 }, { EXT_DATA_ENTRY, 0, 0 }, + { EXT_DATAVIEW, 0, 0 }, }; const struct extension_record_type *type; @@ -951,13 +980,16 @@ parse_header (struct sfm_reader *r, const struct sfm_header_record *header, const char *dict_encoding = dict_get_encoding (dict); struct substring product; struct substring label; + char *fixed_label; /* Convert file label to UTF-8 and put it into DICT. */ label = recode_substring_pool ("UTF-8", dict_encoding, ss_cstr (header->file_label), r->pool); ss_trim (&label, ss_cstr (" ")); label.string[label.length] = '\0'; - dict_set_label (dict, label.string); + fixed_label = fix_line_ends (label.string); + dict_set_label (dict, fixed_label); + free (fixed_label); /* Put creation date and time in UTF-8 into INFO. */ info->creation_date = recode_string ("UTF-8", dict_encoding, @@ -1040,6 +1072,11 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, { double low = parse_float (r, rec->missing, 0); double high = parse_float (r, rec->missing, 8); + + /* Deal with SPSS 21 change in representation. */ + if (low == SYSMIS) + low = LOWEST; + mv_add_range (&mv, low, high); ofs += 16; } @@ -1057,11 +1094,7 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, value_init_pool (r->pool, &value, width); value_set_missing (&value, width); for (i = 0; i < rec->missing_value_code; i++) - { - uint8_t *s = value_str_rw (&value, width); - memcpy (s, rec->missing + 8 * i, MIN (width, 8)); - mv_add_str (&mv, s); - } + mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8)); } var_set_missing_values (var, &mv); } @@ -1258,16 +1291,38 @@ parse_machine_float_info (struct sfm_reader *r, double lowest = parse_float (r, record->data, 16); if (sysmis != SYSMIS) - sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."), - sysmis, "SYSMIS"); + sys_warn (r, record->pos, + _("File specifies unexpected value %g (%a) as %s, " + "instead of %g (%a)."), + sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS); if (highest != HIGHEST) - sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."), - highest, "HIGHEST"); + sys_warn (r, record->pos, + _("File specifies unexpected value %g (%a) as %s, " + "instead of %g (%a)."), + highest, highest, "HIGHEST", HIGHEST, HIGHEST); + + /* SPSS before version 21 used a unique value just bigger than SYSMIS as + LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only + appears in a context (missing values) where SYSMIS cannot. */ + if (lowest != LOWEST && lowest != SYSMIS) + sys_warn (r, record->pos, + _("File specifies unexpected value %g (%a) as %s, " + "instead of %g (%a) or %g (%a)."), + lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS); +} - if (lowest != LOWEST) - sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."), - lowest, "LOWEST"); +/* Parses record type 7, subtype 10. */ +static void +parse_extra_product_info (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct sfm_read_info *info) +{ + struct text_record *text; + + text = open_text_record (r, record, true); + info->product_ext = fix_line_ends (text_get_all (text)); + close_text_record (r, text); } /* Parses record type 7, subtype 7 or 19. */ @@ -1498,9 +1553,8 @@ parse_display_parameters (struct sfm_reader *r, align = parse_int (r, record->data, ofs); ofs += 4; - /* SPSS 14 sometimes seems to set string variables' measure - to zero. */ - if (0 == measure && var_is_alpha (v)) + /* SPSS sometimes seems to set variables' measure to zero. */ + if (0 == measure) measure = 1; if (measure < 1 || measure > 3 || align < 0 || align > 2) @@ -1698,7 +1752,7 @@ parse_value_labels (struct sfm_reader *r, struct dictionary *dict, char **utf8_labels; size_t i; - utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels); + utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels); for (i = 0; i < record->n_labels; i++) utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict), record->labels[i].label, -1, @@ -1881,6 +1935,64 @@ parse_variable_attributes (struct sfm_reader *r, close_text_record (r, text); } +static void +assign_variable_roles (struct sfm_reader *r, struct dictionary *dict) +{ + size_t n_warnings = 0; + size_t i; + + for (i = 0; i < dict_get_var_cnt (dict); i++) + { + struct variable *var = dict_get_var (dict, i); + struct attrset *attrs = var_get_attributes (var); + const struct attribute *attr = attrset_lookup (attrs, "$@Role"); + if (attr != NULL) + { + int value = atoi (attribute_get_value (attr, 0)); + enum var_role role; + + switch (value) + { + case 0: + role = ROLE_INPUT; + break; + + case 1: + role = ROLE_TARGET; + break; + + case 2: + role = ROLE_BOTH; + break; + + case 3: + role = ROLE_NONE; + break; + + case 4: + role = ROLE_PARTITION; + break; + + case 5: + role = ROLE_SPLIT; + break; + + default: + role = ROLE_INPUT; + if (n_warnings++ == 0) + sys_warn (r, -1, _("Invalid role for variable %s."), + var_get_name (var)); + } + + var_set_role (var, role); + } + } + + if (n_warnings > 1) + sys_warn (r, -1, _("%zu other variables had invalid roles."), + n_warnings - 1); +} + static void check_overflow (struct sfm_reader *r, const struct sfm_extension_record *record, @@ -1889,7 +2001,8 @@ check_overflow (struct sfm_reader *r, size_t end = record->size * record->count; if (length >= end || ofs + length > end) sys_error (r, record->pos + end, - _("Long string value label record ends unexpectedly.")); + _("Extension record subtype %d ends unexpectedly."), + record->subtype); } static void @@ -1928,20 +2041,20 @@ parse_long_string_value_labels (struct sfm_reader *r, var = dict_lookup_var (dict, var_name); if (var == NULL) sys_warn (r, record->pos + ofs, - _("Ignoring long string value record for " + _("Ignoring long string value label record for " "unknown variable %s."), var_name); else if (var_is_numeric (var)) { sys_warn (r, record->pos + ofs, - _("Ignoring long string value record for " + _("Ignoring long string value label record for " "numeric variable %s."), var_name); var = NULL; } else if (width != var_get_width (var)) { sys_warn (r, record->pos + ofs, - _("Ignoring long string value record for variable %s " - "because the record's width (%d) does not match the " + _("Ignoring long string value label record for variable " + "%s because the record's width (%d) does not match the " "variable's width (%d)."), var_name, width, var_get_width (var)); var = NULL; @@ -1969,8 +2082,8 @@ parse_long_string_value_labels (struct sfm_reader *r, else { sys_warn (r, record->pos + ofs, - _("Ignoring long string value %zu for variable " - "%s, with width %d, that has bad value " + _("Ignoring long string value label %zu for " + "variable %s, with width %d, that has bad value " "width %zu."), i, var_get_name (var), width, value_length); skip = true; @@ -2003,6 +2116,89 @@ parse_long_string_value_labels (struct sfm_reader *r, } } } + +static void +parse_long_string_missing_values (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + size_t end = record->size * record->count; + size_t ofs = 0; + + while (ofs < end) + { + struct missing_values mv; + char *var_name; + struct variable *var; + int n_missing_values; + int var_name_len; + size_t i; + + /* Parse variable name length. */ + check_overflow (r, record, ofs, 4); + var_name_len = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse variable name. */ + check_overflow (r, record, ofs, var_name_len + 1); + var_name = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + var_name_len, r->pool); + ofs += var_name_len; + + /* Parse number of missing values. */ + n_missing_values = ((const uint8_t *) record->data)[ofs]; + if (n_missing_values < 1 || n_missing_values > 3) + sys_warn (r, record->pos + ofs, + _("Long string missing values record says variable %s " + "has %d missing values, but only 1 to 3 missing values " + "are allowed."), + var_name, n_missing_values); + ofs++; + + /* Look up 'var' and validate. */ + var = dict_lookup_var (dict, var_name); + if (var == NULL) + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value record for " + "unknown variable %s."), var_name); + else if (var_is_numeric (var)) + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value record for " + "numeric variable %s."), var_name); + var = NULL; + } + + /* Parse values. */ + mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8); + for (i = 0; i < n_missing_values; i++) + { + size_t value_length; + + /* Parse value length. */ + check_overflow (r, record, ofs, 4); + value_length = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse value. */ + check_overflow (r, record, ofs, value_length); + if (var != NULL + && i < 3 + && !mv_add_str (&mv, (const uint8_t *) record->data + ofs, + value_length)) + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value %zu for variable " + "%s, with width %d, that has bad value width %zu."), + i, var_get_name (var), var_get_width (var), + value_length); + ofs += value_length; + } + if (var != NULL) + var_set_missing_values (var, &mv); + } +} /* Case reader. */ @@ -2510,6 +2706,12 @@ text_pos (const struct text_record *text) { return text->pos; } + +static const char * +text_get_all (const struct text_record *text) +{ + return text->buffer.string; +} /* Messages. */ @@ -2662,6 +2864,35 @@ skip_bytes (struct sfm_reader *r, size_t bytes) bytes -= chunk; } } + +/* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have + been replaced by LFs. + + (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system + files that use CR-only line ends in the file label and extra product + info.) */ +static char * +fix_line_ends (const char *s) +{ + char *dst, *d; + + d = dst = xmalloc (strlen (s) + 1); + while (*s != '\0') + { + if (*s == '\r') + { + s++; + if (*s == '\n') + s++; + *d++ = '\n'; + } + else + *d++ = *s++; + } + *d = '\0'; + + return dst; +} static const struct casereader_class sys_file_casereader_class = {