+ if (memcmp (raw_bias, zero_bias, 8))
+ sys_warn (r, r->pos - 8,
+ _("Compression bias is not the usual "
+ "value of 100, or system file uses unrecognized "
+ "floating-point format."));
+ else
+ {
+ /* Some software is known to write all-zeros to this
+ field. Such software also writes floating-point
+ numbers in the format that we expect by default
+ (it seems that all software most likely does, in
+ reality), so don't warn in this case. */
+ }
+
+ if (r->integer_format == INTEGER_MSB_FIRST)
+ r->float_format = FLOAT_IEEE_DOUBLE_BE;
+ else
+ r->float_format = FLOAT_IEEE_DOUBLE_LE;
+ }
+ float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
+
+ if (!read_string (r, header->creation_date, sizeof header->creation_date)
+ || !read_string (r, header->creation_time, sizeof header->creation_time)
+ || !read_string (r, header->file_label, sizeof header->file_label)
+ || !skip_bytes (r, 3))
+ return false;
+
+ info->integer_format = r->integer_format;
+ info->float_format = r->float_format;
+ info->compression = r->compression;
+ info->case_cnt = r->case_cnt;
+
+ return true;
+}
+
+/* Reads a variable (type 2) record from R into RECORD. */
+static bool
+read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
+{
+ int has_variable_label;
+
+ memset (record, 0, sizeof *record);
+
+ record->pos = r->pos;
+ if (!read_int (r, &record->width)
+ || !read_int (r, &has_variable_label)
+ || !read_int (r, &record->missing_value_code)
+ || !read_int (r, &record->print_format)
+ || !read_int (r, &record->write_format)
+ || !read_string (r, record->name, sizeof record->name))
+ return false;
+
+ if (has_variable_label == 1)
+ {
+ enum { MAX_LABEL_LEN = 65536 };
+ unsigned int len, read_len;
+
+ if (!read_uint (r, &len))
+ return false;
+
+ /* Read up to MAX_LABEL_LEN bytes of label. */
+ read_len = MIN (MAX_LABEL_LEN, len);
+ record->label = pool_malloc (r->pool, read_len + 1);
+ if (!read_string (r, record->label, read_len + 1))
+ return false;
+
+ /* Skip unread label bytes. */
+ if (!skip_bytes (r, len - read_len))
+ return false;
+
+ /* Skip label padding up to multiple of 4 bytes. */
+ if (!skip_bytes (r, ROUND_UP (len, 4) - len))
+ return false;
+ }
+ else if (has_variable_label != 0)
+ {
+ sys_error (r, record->pos,
+ _("Variable label indicator field is not 0 or 1."));
+ return false;
+ }
+
+ /* Set missing values. */
+ if (record->missing_value_code != 0)
+ {
+ int code = record->missing_value_code;
+ if (record->width == 0)
+ {
+ if (code < -3 || code > 3 || code == -1)
+ {
+ sys_error (r, record->pos,
+ _("Numeric missing value indicator field is not "
+ "-3, -2, 0, 1, 2, or 3."));
+ return false;
+ }
+ }
+ else
+ {
+ if (code < 1 || code > 3)
+ {
+ sys_error (r, record->pos,
+ _("String missing value indicator field is not "
+ "0, 1, 2, or 3."));
+ return false;
+ }
+ }
+
+ if (!read_bytes (r, record->missing, 8 * abs (code)))
+ return false;
+ }
+
+ return true;
+}
+
+/* Reads value labels from R into RECORD. */
+static bool
+read_value_label_record (struct sfm_reader *r,
+ struct sfm_value_label_record *record)
+{
+ size_t i;
+ int type;
+
+ /* Read type 3 record. */
+ record->pos = r->pos;
+ if (!read_uint (r, &record->n_labels))
+ return false;
+ if (record->n_labels > UINT_MAX / sizeof *record->labels)
+ {
+ sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
+ record->n_labels);
+ return false;
+ }
+ record->labels = pool_nmalloc (r->pool, record->n_labels,
+ sizeof *record->labels);
+ for (i = 0; i < record->n_labels; i++)
+ {
+ struct sfm_value_label *label = &record->labels[i];
+ unsigned char label_len;
+ size_t padded_len;
+
+ if (!read_bytes (r, label->value, sizeof label->value))
+ return false;
+
+ /* Read label length. */
+ if (!read_bytes (r, &label_len, sizeof label_len))
+ return false;
+ padded_len = ROUND_UP (label_len + 1, 8);
+
+ /* Read label, padding. */
+ label->label = pool_malloc (r->pool, padded_len + 1);
+ if (!read_bytes (r, label->label, padded_len - 1))
+ return false;
+ label->label[label_len] = '\0';
+ }
+
+ /* Read record type of type 4 record. */
+ if (!read_int (r, &type))
+ return false;
+ if (type != 4)
+ {
+ sys_error (r, r->pos - 4,
+ _("Variable index record (type 4) does not immediately "
+ "follow value label record (type 3) as it should."));
+ return false;
+ }
+
+ /* Read number of variables associated with value label from type 4
+ record. */
+ if (!read_uint (r, &record->n_vars))
+ return false;
+ if (record->n_vars < 1 || record->n_vars > r->n_vars)
+ {
+ sys_error (r, r->pos - 4,
+ _("Number of variables associated with a value label (%u) "
+ "is not between 1 and the number of variables (%zu)."),
+ record->n_vars, r->n_vars);
+ return false;
+ }
+
+ record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
+ for (i = 0; i < record->n_vars; i++)
+ if (!read_int (r, &record->vars[i]))
+ return false;
+
+ return true;
+}
+
+/* Reads a document record from R. Returns true if successful, false on
+ error. */
+static bool
+read_document_record (struct sfm_reader *r)
+{
+ int n_lines;
+ if (!read_int (r, &n_lines))
+ return false;
+ else if (n_lines == 0)
+ return true;
+ else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
+ {
+ sys_error (r, r->pos,
+ _("Number of document lines (%d) "
+ "must be greater than 0 and less than %d."),
+ n_lines, INT_MAX / DOC_LINE_LENGTH);
+ return false;
+ }
+
+ struct sfm_document_record *record;
+ record = pool_malloc (r->pool, sizeof *record);
+ record->pos = r->pos;
+ record->n_lines = n_lines;
+ record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
+ if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
+ return false;
+
+ r->document = record;
+ return true;
+}
+
+static bool
+read_extension_record_header (struct sfm_reader *r, int subtype,
+ struct sfm_extension_record *record)
+{
+ record->subtype = subtype;
+ record->pos = r->pos;
+ if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
+ return false;
+
+ /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
+ allows an extra byte for a null terminator, used by some
+ extension processing routines. */
+ if (record->size != 0
+ && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
+ {
+ sys_error (r, record->pos, "Record type 7 subtype %d too large.",
+ subtype);
+ return false;
+ }
+
+ return true;
+}
+
+/* Reads an extension record from R into RECORD. */
+static bool
+read_extension_record (struct sfm_reader *r, int subtype,
+ struct sfm_extension_record **recordp)
+{
+ struct extension_record_type
+ {
+ int subtype;
+ int size;
+ int count;
+ };
+
+ static const struct extension_record_type types[] =
+ {
+ /* Implemented record types. */
+ { EXT_INTEGER, 4, 8 },
+ { EXT_FLOAT, 8, 3 },
+ { EXT_MRSETS, 1, 0 },
+ { EXT_PRODUCT_INFO, 1, 0 },
+ { EXT_DISPLAY, 4, 0 },
+ { EXT_LONG_NAMES, 1, 0 },
+ { EXT_LONG_STRINGS, 1, 0 },
+ { EXT_NCASES, 8, 2 },
+ { EXT_FILE_ATTRS, 1, 0 },
+ { EXT_VAR_ATTRS, 1, 0 },
+ { EXT_MRSETS2, 1, 0 },
+ { EXT_ENCODING, 1, 0 },
+ { EXT_LONG_LABELS, 1, 0 },
+ { EXT_LONG_MISSING, 1, 0 },
+
+ /* Ignored record types. */
+ { EXT_VAR_SETS, 0, 0 },
+ { EXT_DATE, 0, 0 },
+ { EXT_DATA_ENTRY, 0, 0 },
+ { EXT_DATAVIEW, 0, 0 },
+ };
+
+ const struct extension_record_type *type;
+ struct sfm_extension_record *record;
+ size_t n_bytes;
+
+ *recordp = NULL;
+ record = pool_malloc (r->pool, sizeof *record);
+ if (!read_extension_record_header (r, subtype, record))
+ return false;
+ n_bytes = record->count * record->size;
+
+ for (type = types; type < &types[sizeof types / sizeof *types]; type++)
+ if (subtype == type->subtype)
+ {
+ if (type->size > 0 && record->size != type->size)
+ sys_warn (r, record->pos,
+ _("Record type 7, subtype %d has bad size %u "
+ "(expected %d)."), subtype, record->size, type->size);
+ else if (type->count > 0 && record->count != type->count)
+ sys_warn (r, record->pos,
+ _("Record type 7, subtype %d has bad count %u "
+ "(expected %d)."), subtype, record->count, type->count);
+ else if (type->count == 0 && type->size == 0)
+ {
+ /* Ignore this record. */
+ }
+ else
+ {
+ char *data = pool_malloc (r->pool, n_bytes + 1);
+ data[n_bytes] = '\0';
+
+ record->data = data;
+ if (!read_bytes (r, record->data, n_bytes))
+ return false;
+ *recordp = record;
+ return true;
+ }
+
+ goto skip;
+ }
+
+ sys_warn (r, record->pos,
+ _("Unrecognized record type 7, subtype %d. For help, please "
+ "send this file to %s and mention that you were using %s."),
+ subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
+
+skip:
+ return skip_bytes (r, n_bytes);
+}
+
+static bool
+skip_extension_record (struct sfm_reader *r, int subtype)
+{
+ struct sfm_extension_record record;
+
+ return (read_extension_record_header (r, subtype, &record)
+ && skip_bytes (r, record.count * record.size));
+}
+
+static void
+parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
+ struct any_read_info *info, struct dictionary *dict)
+{
+ const char *dict_encoding = dict_get_encoding (dict);
+ struct substring product;
+ struct substring label;
+ char *fixed_label;
+
+ /* Convert file label to UTF-8 and put it into DICT. */
+ label = recode_substring_pool ("UTF-8", dict_encoding,
+ ss_cstr (header->file_label), r->pool);
+ ss_trim (&label, ss_cstr (" "));
+ label.string[label.length] = '\0';
+ fixed_label = fix_line_ends (label.string);
+ dict_set_label (dict, fixed_label);
+ free (fixed_label);
+
+ /* Put creation date and time in UTF-8 into INFO. */
+ info->creation_date = recode_string ("UTF-8", dict_encoding,
+ header->creation_date, -1);
+ info->creation_time = recode_string ("UTF-8", dict_encoding,
+ header->creation_time, -1);
+
+ /* Put product name into INFO, dropping eye-catcher string if present. */
+ product = recode_substring_pool ("UTF-8", dict_encoding,
+ ss_cstr (header->eye_catcher), r->pool);
+ ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
+ ss_trim (&product, ss_cstr (" "));
+ info->product = ss_xstrdup (product);
+}
+
+/* Reads a variable (type 2) record from R and adds the
+ corresponding variable to DICT.
+ Also skips past additional variable records for long string
+ variables. */
+static bool
+parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
+ struct sfm_var_record *var_recs, size_t n_var_recs)
+{
+ const char *dict_encoding = dict_get_encoding (dict);
+ struct sfm_var_record *rec;
+ int n_warnings = 0;
+
+ for (rec = var_recs; rec < &var_recs[n_var_recs]; )
+ {
+ struct variable *var;
+ size_t n_values;
+ char *name;
+ size_t i;
+
+ name = recode_string_pool ("UTF-8", dict_encoding,
+ rec->name, -1, r->pool);
+ name[strcspn (name, " ")] = '\0';
+
+ if (!dict_id_is_valid (dict, name, false)
+ || name[0] == '$' || name[0] == '#')
+ {
+ sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
+ return false;
+ }
+
+ if (rec->width < 0 || rec->width > 255)
+ {
+ sys_error (r, rec->pos,
+ _("Bad width %d for variable %s."), rec->width, name);
+ return false;
+ }
+
+ var = rec->var = dict_create_var (dict, name, rec->width);
+ if (var == NULL)
+ {
+ char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
+ sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
+ "`%s' to `%s'."),
+ name, new_name);
+ var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
+ free (new_name);
+ }
+
+ /* Set the short name the same as the long name. */
+ var_set_short_name (var, 0, name);
+
+ /* Get variable label, if any. */
+ if (rec->label)
+ {
+ char *utf8_label;
+
+ utf8_label = recode_string_pool ("UTF-8", dict_encoding,
+ rec->label, -1, r->pool);
+ var_set_label (var, utf8_label);
+ }
+
+ /* Set missing values. */
+ if (rec->missing_value_code != 0)
+ {
+ int width = var_get_width (var);
+ struct missing_values mv;
+
+ mv_init_pool (r->pool, &mv, width);
+ if (var_is_numeric (var))
+ {
+ bool has_range = rec->missing_value_code < 0;
+ int n_discrete = (has_range
+ ? rec->missing_value_code == -3
+ : rec->missing_value_code);
+ int ofs = 0;
+
+ if (has_range)
+ {
+ double low = parse_float (r, rec->missing, 0);
+ double high = parse_float (r, rec->missing, 8);
+
+ /* Deal with SPSS 21 change in representation. */
+ if (low == SYSMIS)
+ low = LOWEST;
+
+ mv_add_range (&mv, low, high);
+ ofs += 16;
+ }
+
+ for (i = 0; i < n_discrete; i++)
+ {
+ mv_add_num (&mv, parse_float (r, rec->missing, ofs));
+ ofs += 8;
+ }
+ }
+ else
+ for (i = 0; i < rec->missing_value_code; i++)
+ mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
+ var_set_missing_values (var, &mv);
+ }
+
+ /* Set formats. */
+ parse_format_spec (r, rec->pos + 12, rec->print_format,
+ PRINT_FORMAT, var, &n_warnings);
+ parse_format_spec (r, rec->pos + 16, rec->write_format,
+ WRITE_FORMAT, var, &n_warnings);
+
+ /* Account for values.
+ Skip long string continuation records, if any. */
+ n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
+ for (i = 1; i < n_values; i++)
+ if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
+ {
+ sys_error (r, rec->pos, _("Missing string continuation record."));
+ return false;
+ }
+ rec += n_values;
+ }
+
+ return true;
+}
+
+/* Translates the format spec from sysfile format to internal
+ format. */
+static void
+parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
+ enum which_format which, struct variable *v,
+ int *n_warnings)
+{
+ const int max_warnings = 8;
+ uint8_t raw_type = format >> 16;
+ uint8_t w = format >> 8;
+ uint8_t d = format;
+ struct fmt_spec f;
+ bool ok;
+
+ f.w = w;
+ f.d = d;
+
+ msg_disable ();
+ ok = (fmt_from_io (raw_type, &f.type)
+ && fmt_check_output (&f)
+ && fmt_check_width_compat (&f, var_get_width (v)));
+ msg_enable ();
+
+ if (ok)
+ {
+ if (which == PRINT_FORMAT)
+ var_set_print_format (v, &f);
+ else
+ var_set_write_format (v, &f);
+ }
+ else if (format == 0)
+ {
+ /* Actually observed in the wild. No point in warning about it. */
+ }
+ else if (++*n_warnings <= max_warnings)
+ {
+ if (which == PRINT_FORMAT)
+ sys_warn (r, pos, _("Variable %s with width %d has invalid print "
+ "format 0x%x."),
+ var_get_name (v), var_get_width (v), format);
+ else
+ sys_warn (r, pos, _("Variable %s with width %d has invalid write "
+ "format 0x%x."),
+ var_get_name (v), var_get_width (v), format);
+
+ if (*n_warnings == max_warnings)
+ sys_warn (r, -1, _("Suppressing further invalid format warnings."));
+ }
+}
+
+static void
+parse_document (struct dictionary *dict, struct sfm_document_record *record)
+{
+ const char *p;
+
+ for (p = record->documents;
+ p < record->documents + DOC_LINE_LENGTH * record->n_lines;
+ p += DOC_LINE_LENGTH)
+ {
+ struct substring line;
+
+ line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
+ ss_buffer (p, DOC_LINE_LENGTH), NULL);
+ ss_rtrim (&line, ss_cstr (" "));
+ line.string[line.length] = '\0';
+
+ dict_add_document_line (dict, line.string, false);
+
+ ss_dealloc (&line);
+ }
+}
+
+/* Parses record type 7, subtype 3. */
+static bool
+parse_machine_integer_info (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct any_read_info *info)
+{
+ int float_representation, expected_float_format;
+ int integer_representation, expected_integer_format;
+
+ /* Save version info. */
+ info->version_major = parse_int (r, record->data, 0);
+ info->version_minor = parse_int (r, record->data, 4);
+ info->version_revision = parse_int (r, record->data, 8);
+
+ /* Check floating point format. */
+ float_representation = parse_int (r, record->data, 16);
+ if (r->float_format == FLOAT_IEEE_DOUBLE_BE
+ || r->float_format == FLOAT_IEEE_DOUBLE_LE)
+ expected_float_format = 1;
+ else if (r->float_format == FLOAT_Z_LONG)
+ expected_float_format = 2;
+ else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
+ expected_float_format = 3;
+ else
+ NOT_REACHED ();
+ if (float_representation != expected_float_format)
+ {
+ sys_error (r, record->pos,
+ _("Floating-point representation indicated by "
+ "system file (%d) differs from expected (%d)."),
+ float_representation, expected_float_format);
+ return false;
+ }
+
+ /* Check integer format. */
+ integer_representation = parse_int (r, record->data, 24);
+ if (r->integer_format == INTEGER_MSB_FIRST)
+ expected_integer_format = 1;
+ else if (r->integer_format == INTEGER_LSB_FIRST)
+ expected_integer_format = 2;
+ else
+ NOT_REACHED ();
+ if (integer_representation != expected_integer_format)
+ sys_warn (r, record->pos,
+ _("Integer format indicated by system file (%d) "
+ "differs from expected (%d)."),
+ integer_representation, expected_integer_format);
+
+ return true;
+}
+
+/* Parses record type 7, subtype 4. */
+static void
+parse_machine_float_info (struct sfm_reader *r,
+ const struct sfm_extension_record *record)
+{
+ double sysmis = parse_float (r, record->data, 0);
+ double highest = parse_float (r, record->data, 8);
+ double lowest = parse_float (r, record->data, 16);
+
+ if (sysmis != SYSMIS)
+ sys_warn (r, record->pos,
+ _("File specifies unexpected value %g (%a) as %s, "
+ "instead of %g (%a)."),
+ sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
+
+ if (highest != HIGHEST)
+ sys_warn (r, record->pos,
+ _("File specifies unexpected value %g (%a) as %s, "
+ "instead of %g (%a)."),
+ highest, highest, "HIGHEST", HIGHEST, HIGHEST);
+
+ /* SPSS before version 21 used a unique value just bigger than SYSMIS as
+ LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
+ appears in a context (missing values) where SYSMIS cannot. */
+ if (lowest != LOWEST && lowest != SYSMIS)
+ sys_warn (r, record->pos,
+ _("File specifies unexpected value %g (%a) as %s, "
+ "instead of %g (%a) or %g (%a)."),
+ lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
+}
+
+/* Parses record type 7, subtype 10. */
+static void
+parse_extra_product_info (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct any_read_info *info)
+{
+ struct text_record *text;
+
+ text = open_text_record (r, record, true);
+ info->product_ext = fix_line_ends (text_get_all (text));
+ close_text_record (r, text);
+}
+
+/* Parses record type 7, subtype 7 or 19. */
+static void
+parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
+ size_t *allocated_mrsets)
+{
+ struct text_record *text;
+
+ text = open_text_record (r, record, false);
+ for (;;)
+ {
+ struct sfm_mrset *mrset;
+ size_t allocated_vars;
+ char delimiter;
+
+ /* Skip extra line feeds if present. */
+ while (text_match (text, '\n'))
+ continue;
+
+ if (r->n_mrsets >= *allocated_mrsets)
+ r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
+ sizeof *r->mrsets);
+ mrset = &r->mrsets[r->n_mrsets];
+ memset(mrset, 0, sizeof *mrset);
+
+ mrset->name = text_get_token (text, ss_cstr ("="), NULL);
+ if (mrset->name == NULL)
+ break;
+
+ if (text_match (text, 'C'))
+ {
+ mrset->type = MRSET_MC;
+ if (!text_match (text, ' '))
+ {
+ sys_warn (r, record->pos,
+ _("Missing space following `%c' at offset %zu "
+ "in MRSETS record."), 'C', text_pos (text));
+ break;
+ }
+ }
+ else if (text_match (text, 'D'))
+ {
+ mrset->type = MRSET_MD;
+ mrset->cat_source = MRSET_VARLABELS;
+ }
+ else if (text_match (text, 'E'))
+ {
+ char *number;
+
+ mrset->type = MRSET_MD;
+ mrset->cat_source = MRSET_COUNTEDVALUES;
+ if (!text_match (text, ' '))
+ {
+ sys_warn (r, record->pos,
+ _("Missing space following `%c' at offset %zu "
+ "in MRSETS record."), 'E', text_pos (text));
+ break;
+ }
+
+ number = text_get_token (text, ss_cstr (" "), NULL);
+ if (!strcmp (number, "11"))
+ mrset->label_from_var_label = true;
+ else if (strcmp (number, "1"))
+ sys_warn (r, record->pos,
+ _("Unexpected label source value following `E' "
+ "at offset %zu in MRSETS record."),
+ text_pos (text));
+ }
+ else
+ {
+ sys_warn (r, record->pos,
+ _("Missing `C', `D', or `E' at offset %zu "
+ "in MRSETS record."),
+ text_pos (text));
+ break;
+ }
+
+ if (mrset->type == MRSET_MD)
+ {
+ mrset->counted = text_parse_counted_string (r, text);
+ if (mrset->counted == NULL)
+ break;
+ }
+
+ mrset->label = text_parse_counted_string (r, text);
+ if (mrset->label == NULL)
+ break;
+
+ allocated_vars = 0;
+ do
+ {
+ const char *var;
+
+ var = text_get_token (text, ss_cstr (" \n"), &delimiter);
+ if (var == NULL)
+ {
+ if (delimiter != '\n')
+ sys_warn (r, record->pos,
+ _("Missing new-line parsing variable names "
+ "at offset %zu in MRSETS record."),
+ text_pos (text));
+ break;
+ }
+
+ if (mrset->n_vars >= allocated_vars)
+ mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
+ &allocated_vars,
+ sizeof *mrset->vars);
+ mrset->vars[mrset->n_vars++] = var;
+ }
+ while (delimiter != '\n');
+
+ r->n_mrsets++;
+ }
+ close_text_record (r, text);
+}
+
+static void
+decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
+{
+ const struct sfm_mrset *s;
+
+ for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
+ {
+ struct stringi_set var_names;
+ struct mrset *mrset;
+ char *name;
+ int width;
+ size_t i;
+
+ name = recode_string ("UTF-8", r->encoding, s->name, -1);
+ if (name[0] != '$')
+ {
+ sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
+ "with `$'."),
+ name);
+ free (name);
+ continue;
+ }
+
+ mrset = xzalloc (sizeof *mrset);
+ mrset->name = name;
+ mrset->type = s->type;
+ mrset->cat_source = s->cat_source;
+ mrset->label_from_var_label = s->label_from_var_label;
+ if (s->label[0] != '\0')
+ mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
+
+ stringi_set_init (&var_names);
+ mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
+ width = INT_MAX;
+ for (i = 0; i < s->n_vars; i++)
+ {
+ struct variable *var;
+ char *var_name;
+
+ var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
+
+ var = dict_lookup_var (dict, var_name);
+ if (var == NULL)
+ {
+ free (var_name);
+ continue;
+ }
+ if (!stringi_set_insert (&var_names, var_name))
+ {
+ sys_warn (r, -1,
+ _("MRSET %s contains duplicate variable name %s."),
+ mrset->name, var_name);
+ free (var_name);
+ continue;
+ }
+ free (var_name);
+
+ if (mrset->label == NULL && mrset->label_from_var_label
+ && var_has_label (var))
+ mrset->label = xstrdup (var_get_label (var));
+
+ if (mrset->n_vars
+ && var_get_type (var) != var_get_type (mrset->vars[0]))
+ {
+ sys_warn (r, -1,
+ _("MRSET %s contains both string and "
+ "numeric variables."), mrset->name);
+ continue;
+ }
+ width = MIN (width, var_get_width (var));
+
+ mrset->vars[mrset->n_vars++] = var;
+ }
+
+ if (mrset->n_vars < 2)
+ {
+ if (mrset->n_vars == 0)
+ sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
+ else
+ sys_warn (r, -1, _("MRSET %s has only one variable."),
+ mrset->name);
+ mrset_destroy (mrset);
+ stringi_set_destroy (&var_names);
+ continue;
+ }
+
+ if (mrset->type == MRSET_MD)
+ {
+ mrset->width = width;
+ value_init (&mrset->counted, width);
+ if (width == 0)
+ mrset->counted.f = c_strtod (s->counted, NULL);
+ else
+ value_copy_str_rpad (&mrset->counted, width,
+ (const uint8_t *) s->counted, ' ');
+ }
+
+ dict_add_mrset (dict, mrset);
+ stringi_set_destroy (&var_names);
+ }
+}
+
+/* Read record type 7, subtype 11, which specifies how variables
+ should be displayed in GUI environments. */
+static void
+parse_display_parameters (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ bool includes_width;
+ bool warned = false;
+ size_t n_vars;
+ size_t ofs;
+ size_t i;
+
+ n_vars = dict_get_var_cnt (dict);
+ if (record->count == 3 * n_vars)
+ includes_width = true;
+ else if (record->count == 2 * n_vars)
+ includes_width = false;
+ else
+ {
+ sys_warn (r, record->pos,
+ _("Extension 11 has bad count %u (for %zu variables)."),
+ record->count, n_vars);
+ return;
+ }
+
+ ofs = 0;
+ for (i = 0; i < n_vars; ++i)
+ {
+ struct variable *v = dict_get_var (dict, i);
+ int measure, width, align;
+
+ measure = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ if (includes_width)
+ {
+ width = parse_int (r, record->data, ofs);
+ ofs += 4;
+ }
+ else
+ width = 0;
+
+ align = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* SPSS sometimes seems to set variables' measure to zero. */
+ if (0 == measure)
+ measure = 1;
+
+ if (measure < 1 || measure > 3 || align < 0 || align > 2)
+ {
+ if (!warned)
+ sys_warn (r, record->pos,
+ _("Invalid variable display parameters for variable "
+ "%zu (%s). Default parameters substituted."),
+ i, var_get_name (v));
+ warned = true;
+ continue;
+ }
+
+ var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
+ : measure == 2 ? MEASURE_ORDINAL
+ : MEASURE_SCALE));
+ var_set_alignment (v, (align == 0 ? ALIGN_LEFT
+ : align == 1 ? ALIGN_RIGHT
+ : ALIGN_CENTRE));
+
+ /* Older versions (SPSS 9.0) sometimes set the display
+ width to zero. This causes confusion in the GUI, so
+ only set the width if it is nonzero. */
+ if (width > 0)
+ var_set_display_width (v, width);
+ }
+}
+
+static void
+rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
+ const char *new_name)
+{
+ size_t n_short_names;
+ char **short_names;
+ size_t i;
+
+ /* Renaming a variable may clear its short names, but we
+ want to retain them, so we save them and re-set them
+ afterward. */
+ n_short_names = var_get_short_name_cnt (var);
+ short_names = xnmalloc (n_short_names, sizeof *short_names);
+ for (i = 0; i < n_short_names; i++)
+ {
+ const char *s = var_get_short_name (var, i);
+ short_names[i] = s != NULL ? xstrdup (s) : NULL;
+ }
+
+ /* Set long name. */
+ dict_rename_var (dict, var, new_name);
+
+ /* Restore short names. */
+ for (i = 0; i < n_short_names; i++)
+ {
+ var_set_short_name (var, i, short_names[i]);
+ free (short_names[i]);
+ }
+ free (short_names);
+}
+
+/* Parses record type 7, subtype 13, which gives the long name that corresponds
+ to each short name. Modifies variable names in DICT accordingly. */
+static void
+parse_long_var_name_map (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text;
+ struct variable *var;
+ char *long_name;
+
+ if (record == NULL)
+ {
+ /* There are no long variable names. Use the short variable names,
+ converted to lowercase, as the long variable names. */
+ size_t i;
+
+ for (i = 0; i < dict_get_var_cnt (dict); i++)
+ {
+ struct variable *var = dict_get_var (dict, i);
+ char *new_name;
+
+ new_name = utf8_to_lower (var_get_name (var));
+ rename_var_and_save_short_names (dict, var, new_name);
+ free (new_name);
+ }
+
+ return;
+ }
+
+ /* Rename each of the variables, one by one. (In a correctly constructed
+ system file, this cannot create any intermediate duplicate variable names,
+ because all of the new variable names are longer than any of the old
+ variable names and thus there cannot be any overlaps.) */
+ text = open_text_record (r, record, true);
+ while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
+ {
+ /* Validate long name. */
+ if (!dict_id_is_valid (dict, long_name, false)
+ || long_name[0] == '$' || long_name[0] == '#')
+ {
+ sys_warn (r, record->pos,
+ _("Long variable mapping from %s to invalid "
+ "variable name `%s'."),
+ var_get_name (var), long_name);
+ continue;
+ }
+
+ /* Identify any duplicates. */
+ if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
+ && dict_lookup_var (dict, long_name) != NULL)
+ {
+ sys_warn (r, record->pos,
+ _("Duplicate long variable name `%s'."), long_name);
+ continue;
+ }
+
+ rename_var_and_save_short_names (dict, var, long_name);
+ }
+ close_text_record (r, text);
+}
+
+/* Reads record type 7, subtype 14, which gives the real length
+ of each very long string. Rearranges DICT accordingly. */
+static bool
+parse_long_string_map (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text;
+ struct variable *var;
+ char *length_s;
+
+ text = open_text_record (r, record, true);
+ while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
+ {
+ size_t idx = var_get_dict_index (var);
+ long int length;
+ int segment_cnt;
+ int i;
+
+ /* Get length. */
+ length = strtol (length_s, NULL, 10);
+ if (length < 1 || length > MAX_STRING)
+ {
+ sys_warn (r, record->pos,
+ _("%s listed as string of invalid length %s "
+ "in very long string record."),
+ var_get_name (var), length_s);
+ continue;
+ }
+
+ /* Check segments. */
+ segment_cnt = sfm_width_to_segments (length);
+ if (segment_cnt == 1)
+ {
+ sys_warn (r, record->pos,
+ _("%s listed in very long string record with width %s, "
+ "which requires only one segment."),
+ var_get_name (var), length_s);
+ continue;
+ }
+ if (idx + segment_cnt > dict_get_var_cnt (dict))
+ {
+ sys_error (r, record->pos,
+ _("Very long string %s overflows dictionary."),
+ var_get_name (var));
+ return false;
+ }
+
+ /* Get the short names from the segments and check their
+ lengths. */
+ for (i = 0; i < segment_cnt; i++)
+ {
+ struct variable *seg = dict_get_var (dict, idx + i);
+ int alloc_width = sfm_segment_alloc_width (length, i);
+ int width = var_get_width (seg);
+
+ if (i > 0)
+ var_set_short_name (var, i, var_get_short_name (seg, 0));
+ if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
+ {
+ sys_error (r, record->pos,
+ _("Very long string with width %ld has segment %d "
+ "of width %d (expected %d)."),
+ length, i, width, alloc_width);
+ return false;
+ }
+ }
+ dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
+ var_set_width (var, length);
+ }
+ close_text_record (r, text);
+ dict_compact_values (dict);
+
+ return true;
+}
+
+static bool
+parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
+ const struct sfm_var_record *var_recs, size_t n_var_recs,
+ const struct sfm_value_label_record *record)
+{
+ struct variable **vars;
+ char **utf8_labels;
+ size_t i;
+
+ utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
+ for (i = 0; i < record->n_labels; i++)
+ utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
+ record->labels[i].label, -1,
+ r->pool);
+
+ vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
+ for (i = 0; i < record->n_vars; i++)
+ {
+ vars[i] = lookup_var_by_index (r, record->pos,
+ var_recs, n_var_recs, record->vars[i]);
+ if (vars[i] == NULL)
+ return false;
+ }
+
+ for (i = 1; i < record->n_vars; i++)
+ if (var_get_type (vars[i]) != var_get_type (vars[0]))
+ {
+ sys_error (r, record->pos,
+ _("Variables associated with value label are not all of "
+ "identical type. Variable %s is %s, but variable "
+ "%s is %s."),
+ var_get_name (vars[0]),
+ var_is_numeric (vars[0]) ? _("numeric") : _("string"),
+ var_get_name (vars[i]),
+ var_is_numeric (vars[i]) ? _("numeric") : _("string"));
+ return false;
+ }
+
+ for (i = 0; i < record->n_vars; i++)
+ {
+ struct variable *var = vars[i];
+ int width;
+ size_t j;
+
+ width = var_get_width (var);
+ if (width > 8)
+ {
+ sys_error (r, record->pos,
+ _("Value labels may not be added to long string "
+ "variables (e.g. %s) using records types 3 and 4."),
+ var_get_name (var));
+ return false;
+ }
+
+ for (j = 0; j < record->n_labels; j++)
+ {
+ struct sfm_value_label *label = &record->labels[j];
+ union value value;
+
+ value_init (&value, width);
+ if (width == 0)
+ value.f = parse_float (r, label->value, 0);
+ else
+ memcpy (value_str_rw (&value, width), label->value, width);
+
+ if (!var_add_value_label (var, &value, utf8_labels[j]))
+ {
+ if (r->written_by_readstat)
+ {
+ /* Ignore the problem. ReadStat is buggy and emits value
+ labels whose values are longer than string variables'
+ widths, that are identical in the actual width of the
+ variable, e.g. both values "ABC123" and "ABC456" for a
+ string variable with width 3. */
+ }
+ else if (var_is_numeric (var))
+ sys_warn (r, record->pos,
+ _("Duplicate value label for %g on %s."),
+ value.f, var_get_name (var));
+ else
+ sys_warn (r, record->pos,
+ _("Duplicate value label for `%.*s' on %s."),
+ width, value_str (&value, width),
+ var_get_name (var));
+ }
+
+ value_destroy (&value, width);
+ }
+ }
+
+ pool_free (r->pool, vars);
+ for (i = 0; i < record->n_labels; i++)
+ pool_free (r->pool, utf8_labels[i]);
+ pool_free (r->pool, utf8_labels);
+
+ return true;
+}
+
+static struct variable *
+lookup_var_by_index (struct sfm_reader *r, off_t offset,
+ const struct sfm_var_record *var_recs, size_t n_var_recs,
+ int idx)
+{
+ const struct sfm_var_record *rec;
+
+ if (idx < 1 || idx > n_var_recs)
+ {
+ sys_error (r, offset,
+ _("Variable index %d not in valid range 1...%zu."),
+ idx, n_var_recs);
+ return NULL;
+ }
+
+ rec = &var_recs[idx - 1];
+ if (rec->var == NULL)
+ {
+ sys_error (r, offset,
+ _("Variable index %d refers to long string continuation."),
+ idx);
+ return NULL;
+ }
+
+ return rec->var;
+}
+
+/* Parses a set of custom attributes from TEXT into ATTRS.
+ ATTRS may be a null pointer, in which case the attributes are
+ read but discarded. */
+static void
+parse_attributes (struct sfm_reader *r, struct text_record *text,
+ struct attrset *attrs)
+{
+ do
+ {
+ struct attribute *attr;
+ char *key;
+ int index;
+
+ /* Parse the key. */
+ key = text_get_token (text, ss_cstr ("("), NULL);
+ if (key == NULL)
+ return;
+
+ attr = attribute_create (key);
+ for (index = 1; ; index++)
+ {
+ /* Parse the value. */
+ char *value;
+ size_t length;
+
+ value = text_get_token (text, ss_cstr ("\n"), NULL);
+ if (value == NULL)
+ {
+ text_warn (r, text, _("Error parsing attribute value %s[%d]."),
+ key, index);
+ break;
+ }
+
+ length = strlen (value);
+ if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
+ {
+ value[length - 1] = '\0';
+ attribute_add_value (attr, value + 1);
+ }
+ else
+ {
+ text_warn (r, text,
+ _("Attribute value %s[%d] is not quoted: %s."),
+ key, index, value);
+ attribute_add_value (attr, value);
+ }
+
+ /* Was this the last value for this attribute? */
+ if (text_match (text, ')'))
+ break;
+ }
+ if (attrs != NULL)
+ attrset_add (attrs, attr);
+ else
+ attribute_destroy (attr);
+ }
+ while (!text_match (text, '/'));
+}
+
+/* Reads record type 7, subtype 17, which lists custom
+ attributes on the data file. */
+static void
+parse_data_file_attributes (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text = open_text_record (r, record, true);
+ parse_attributes (r, text, dict_get_attributes (dict));
+ close_text_record (r, text);
+}
+
+/* Parses record type 7, subtype 18, which lists custom
+ attributes on individual variables. */
+static void
+parse_variable_attributes (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text;
+ struct variable *var;
+
+ text = open_text_record (r, record, true);
+ while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
+ parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
+ close_text_record (r, text);
+}
+
+static void
+assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
+{
+ size_t n_warnings = 0;
+ size_t i;
+
+ for (i = 0; i < dict_get_var_cnt (dict); i++)
+ {
+ struct variable *var = dict_get_var (dict, i);
+ struct attrset *attrs = var_get_attributes (var);
+ const struct attribute *attr = attrset_lookup (attrs, "$@Role");
+ if (attr != NULL)
+ {
+ int value = atoi (attribute_get_value (attr, 0));
+ enum var_role role;
+
+ switch (value)
+ {
+ case 0:
+ role = ROLE_INPUT;
+ break;
+
+ case 1:
+ role = ROLE_TARGET;
+ break;
+
+ case 2:
+ role = ROLE_BOTH;
+ break;
+
+ case 3:
+ role = ROLE_NONE;
+ break;
+
+ case 4:
+ role = ROLE_PARTITION;
+ break;
+
+ case 5:
+ role = ROLE_SPLIT;
+ break;
+
+ default:
+ role = ROLE_INPUT;
+ if (n_warnings++ == 0)
+ sys_warn (r, -1, _("Invalid role for variable %s."),
+ var_get_name (var));
+ }
+
+ var_set_role (var, role);
+ }
+ }
+
+ if (n_warnings > 1)
+ sys_warn (r, -1, _("%zu other variables had invalid roles."),
+ n_warnings - 1);
+}
+
+static bool
+check_overflow (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ size_t ofs, size_t length)
+{
+ size_t end = record->size * record->count;
+ if (length >= end || ofs + length > end)
+ {
+ sys_warn (r, record->pos + end,
+ _("Extension record subtype %d ends unexpectedly."),
+ record->subtype);
+ return false;
+ }
+ return true;
+}
+
+static void
+parse_long_string_value_labels (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ const char *dict_encoding = dict_get_encoding (dict);
+ size_t end = record->size * record->count;
+ size_t ofs = 0;
+
+ while (ofs < end)
+ {
+ char *var_name;
+ size_t n_labels, i;
+ struct variable *var;
+ union value value;
+ int var_name_len;
+ int width;
+
+ /* Parse variable name length. */
+ if (!check_overflow (r, record, ofs, 4))
+ return;
+ var_name_len = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse variable name, width, and number of labels. */
+ if (!check_overflow (r, record, ofs, var_name_len)
+ || !check_overflow (r, record, ofs, var_name_len + 8))
+ return;
+ var_name = recode_string_pool ("UTF-8", dict_encoding,
+ (const char *) record->data + ofs,
+ var_name_len, r->pool);
+ width = parse_int (r, record->data, ofs + var_name_len);
+ n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
+ ofs += var_name_len + 8;
+
+ /* Look up 'var' and validate. */
+ var = dict_lookup_var (dict, var_name);
+ if (var == NULL)
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value label record for "
+ "unknown variable %s."), var_name);
+ else if (var_is_numeric (var))
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value label record for "
+ "numeric variable %s."), var_name);
+ var = NULL;
+ }
+ else if (width != var_get_width (var))
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value label record for variable "
+ "%s because the record's width (%d) does not match the "
+ "variable's width (%d)."),
+ var_name, width, var_get_width (var));
+ var = NULL;
+ }
+
+ /* Parse values. */
+ value_init_pool (r->pool, &value, width);
+ for (i = 0; i < n_labels; i++)
+ {
+ size_t value_length, label_length;
+ bool skip = var == NULL;
+
+ /* Parse value length. */
+ if (!check_overflow (r, record, ofs, 4))
+ return;
+ value_length = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse value. */
+ if (!check_overflow (r, record, ofs, value_length))
+ return;
+ if (!skip)
+ {
+ if (value_length == width)
+ memcpy (value_str_rw (&value, width),
+ (const uint8_t *) record->data + ofs, width);
+ else
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value label %zu for "
+ "variable %s, with width %d, that has bad value "
+ "width %zu."),
+ i, var_get_name (var), width, value_length);
+ skip = true;
+ }
+ }
+ ofs += value_length;
+
+ /* Parse label length. */
+ if (!check_overflow (r, record, ofs, 4))
+ return;
+ label_length = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse label. */
+ if (!check_overflow (r, record, ofs, label_length))
+ return;
+ if (!skip)
+ {
+ char *label;
+
+ label = recode_string_pool ("UTF-8", dict_encoding,
+ (const char *) record->data + ofs,
+ label_length, r->pool);
+ if (!var_add_value_label (var, &value, label))
+ sys_warn (r, record->pos + ofs,
+ _("Duplicate value label for `%.*s' on %s."),
+ width, value_str (&value, width),
+ var_get_name (var));
+ pool_free (r->pool, label);
+ }
+ ofs += label_length;
+ }
+ }
+}
+
+static void
+parse_long_string_missing_values (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ const char *dict_encoding = dict_get_encoding (dict);
+ size_t end = record->size * record->count;
+ size_t ofs = 0;
+
+ while (ofs < end)
+ {
+ struct missing_values mv;
+ char *var_name;
+ struct variable *var;
+ int n_missing_values;
+ int var_name_len;
+ size_t i;
+
+ /* Parse variable name length. */
+ if (!check_overflow (r, record, ofs, 4))
+ return;
+ var_name_len = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse variable name. */
+ if (!check_overflow (r, record, ofs, var_name_len)
+ || !check_overflow (r, record, ofs, var_name_len + 1))
+ return;
+ var_name = recode_string_pool ("UTF-8", dict_encoding,
+ (const char *) record->data + ofs,
+ var_name_len, r->pool);
+ ofs += var_name_len;
+
+ /* Parse number of missing values. */
+ n_missing_values = ((const uint8_t *) record->data)[ofs];
+ if (n_missing_values < 1 || n_missing_values > 3)
+ sys_warn (r, record->pos + ofs,
+ _("Long string missing values record says variable %s "
+ "has %d missing values, but only 1 to 3 missing values "
+ "are allowed."),
+ var_name, n_missing_values);
+ ofs++;
+
+ /* Look up 'var' and validate. */
+ var = dict_lookup_var (dict, var_name);
+ if (var == NULL)
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string missing value record for "
+ "unknown variable %s."), var_name);
+ else if (var_is_numeric (var))
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string missing value record for "
+ "numeric variable %s."), var_name);
+ var = NULL;
+ }
+
+ /* Parse values. */
+ mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
+ for (i = 0; i < n_missing_values; i++)
+ {
+ size_t value_length;
+
+ /* Parse value length. */
+ if (!check_overflow (r, record, ofs, 4))
+ return;
+ value_length = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse value. */
+ if (!check_overflow (r, record, ofs, value_length))
+ return;
+ if (var != NULL
+ && i < 3
+ && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
+ value_length))
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string missing value %zu for variable "
+ "%s, with width %d, that has bad value width %zu."),
+ i, var_get_name (var), var_get_width (var),
+ value_length);
+ ofs += value_length;
+ }
+ if (var != NULL)
+ var_set_missing_values (var, &mv);
+ }
+}
+\f
+/* Case reader. */
+
+static void partial_record (struct sfm_reader *);
+
+static void read_error (struct casereader *, const struct sfm_reader *);
+
+static bool read_case_number (struct sfm_reader *, double *);
+static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
+static int read_opcode (struct sfm_reader *);
+static bool read_compressed_number (struct sfm_reader *, double *);
+static int read_compressed_string (struct sfm_reader *, uint8_t *);
+static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
+static bool skip_whole_strings (struct sfm_reader *, size_t);
+
+/* Reads and returns one case from READER's file. Returns a null
+ pointer if not successful. */
+static struct ccase *
+sys_file_casereader_read (struct casereader *reader, void *r_)
+{
+ struct sfm_reader *r = r_;
+ struct ccase *c;
+ int retval;
+ int i;
+
+ if (r->error || !r->sfm_var_cnt)
+ return NULL;
+
+ c = case_create (r->proto);
+
+ for (i = 0; i < r->sfm_var_cnt; i++)
+ {
+ struct sfm_var *sv = &r->sfm_vars[i];
+ union value *v = case_data_rw_idx (c, sv->case_index);
+
+ if (sv->var_width == 0)
+ retval = read_case_number (r, &v->f);
+ else
+ {
+ uint8_t *s = value_str_rw (v, sv->var_width);
+ retval = read_case_string (r, s + sv->offset, sv->segment_width);
+ if (retval == 1)
+ {
+ retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
+ if (retval == 0)
+ sys_error (r, r->pos, _("File ends in partial string value."));
+ }
+ }
+
+ if (retval != 1)
+ goto eof;
+ }
+ return c;
+
+eof:
+ if (i != 0)
+ partial_record (r);
+ if (r->case_cnt != -1)
+ read_error (reader, r);
+ case_unref (c);
+ return NULL;
+}
+
+/* Issues an error that R ends in a partial record. */
+static void
+partial_record (struct sfm_reader *r)
+{
+ sys_error (r, r->pos, _("File ends in partial case."));
+}
+
+/* Issues an error that an unspecified error occurred SFM, and
+ marks R tainted. */
+static void
+read_error (struct casereader *r, const struct sfm_reader *sfm)
+{
+ msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
+ casereader_force_error (r);
+}
+
+/* Reads a number from R and stores its value in *D.
+ If R is compressed, reads a compressed number;
+ otherwise, reads a number in the regular way.
+ Returns true if successful, false if end of file is
+ reached immediately. */
+static bool
+read_case_number (struct sfm_reader *r, double *d)
+{
+ if (r->compression == ANY_COMP_NONE)
+ {
+ uint8_t number[8];
+ if (!try_read_bytes (r, number, sizeof number))
+ return false;
+ float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
+ return true;
+ }
+ else
+ return read_compressed_number (r, d);
+}
+
+/* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
+ bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
+ discarded without being written to S. Reads compressed strings if S is
+ compressed. Returns 1 if successful, 0 if end of file is reached
+ immediately, or -1 for some kind of error. */
+static int
+read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
+{
+ size_t whole = ROUND_DOWN (length, 8);
+ size_t partial = length % 8;
+
+ if (whole)
+ {
+ int retval = read_whole_strings (r, s, whole);
+ if (retval != 1)
+ return retval;
+ }
+
+ if (partial)
+ {
+ uint8_t bounce[8];
+ int retval = read_whole_strings (r, bounce, sizeof bounce);
+ if (retval == -1)
+ return -1;
+ else if (!retval)
+ {
+ if (whole)
+ {
+ partial_record (r);
+ return -1;
+ }
+ return 0;
+ }
+ memcpy (s + whole, bounce, partial);
+ }
+
+ return 1;
+}
+
+/* Reads and returns the next compression opcode from R. */
+static int
+read_opcode (struct sfm_reader *r)
+{
+ assert (r->compression != ANY_COMP_NONE);
+ for (;;)
+ {
+ int opcode;
+ if (r->opcode_idx >= sizeof r->opcodes)
+ {
+
+ int retval = try_read_compressed_bytes (r, r->opcodes,
+ sizeof r->opcodes);
+ if (retval != 1)
+ return -1;
+ r->opcode_idx = 0;
+ }
+ opcode = r->opcodes[r->opcode_idx++];
+
+ if (opcode != 0)
+ return opcode;
+ }
+}
+
+/* Reads a compressed number from R and stores its value in D.
+ Returns true if successful, false if end of file is
+ reached immediately. */
+static bool
+read_compressed_number (struct sfm_reader *r, double *d)
+{
+ int opcode = read_opcode (r);
+ switch (opcode)
+ {
+ case -1:
+ case 252:
+ return false;
+
+ case 253:
+ return read_compressed_float (r, d);
+
+ case 254:
+ float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
+ if (!r->corruption_warning)
+ {
+ r->corruption_warning = true;
+ sys_warn (r, r->pos,
+ _("Possible compressed data corruption: "
+ "compressed spaces appear in numeric field."));
+ }
+ break;
+
+ case 255:
+ *d = SYSMIS;
+ break;
+
+ default:
+ *d = opcode - r->bias;
+ break;
+ }
+
+ return true;
+}
+
+/* Reads a compressed 8-byte string segment from R and stores it in DST. */
+static int
+read_compressed_string (struct sfm_reader *r, uint8_t *dst)
+{
+ int opcode;
+ int retval;
+
+ opcode = read_opcode (r);
+ switch (opcode)
+ {
+ case -1:
+ case 252:
+ return 0;
+
+ case 253:
+ retval = read_compressed_bytes (r, dst, 8);
+ return retval == 1 ? 1 : -1;
+
+ case 254:
+ memset (dst, ' ', 8);
+ return 1;
+
+ default:
+ {
+ double value = opcode - r->bias;
+ float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
+ if (value == 0.0)
+ {
+ /* This has actually been seen "in the wild". The submitter of the
+ file that showed that the contents decoded as spaces, but they
+ were at the end of the field so it's possible that the null
+ bytes just acted as null terminators. */
+ }
+ else if (!r->corruption_warning)
+ {
+ r->corruption_warning = true;
+ sys_warn (r, r->pos,
+ _("Possible compressed data corruption: "
+ "string contains compressed integer (opcode %d)."),
+ opcode);
+ }
+ }
+ return 1;
+ }
+}
+
+/* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
+ Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
+ end of file is reached immediately, or -1 for some kind of error. */
+static int
+read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
+{
+ assert (length % 8 == 0);
+ if (r->compression == ANY_COMP_NONE)
+ return try_read_bytes (r, s, length);
+ else
+ {
+ size_t ofs;
+
+ for (ofs = 0; ofs < length; ofs += 8)
+ {
+ int retval = read_compressed_string (r, s + ofs);
+ if (retval != 1)
+ {
+ if (ofs != 0)
+ {
+ partial_record (r);
+ return -1;
+ }
+ return retval;
+ }
+ }
+ return 1;
+ }
+}
+
+/* Skips LENGTH string bytes from R.
+ LENGTH must be a multiple of 8.
+ (LENGTH is also limited to 1024, but that's only because the
+ current caller never needs more than that many bytes.)
+ Returns true if successful, false if end of file is
+ reached immediately. */
+static bool
+skip_whole_strings (struct sfm_reader *r, size_t length)
+{
+ uint8_t buffer[1024];
+ assert (length < sizeof buffer);
+ return read_whole_strings (r, buffer, length);
+}
+\f
+/* Helpers for reading records that contain structured text
+ strings. */
+
+/* Maximum number of warnings to issue for a single text
+ record. */
+#define MAX_TEXT_WARNINGS 5
+
+/* State. */
+struct text_record
+ {
+ struct substring buffer; /* Record contents. */
+ off_t start; /* Starting offset in file. */
+ size_t pos; /* Current position in buffer. */
+ int n_warnings; /* Number of warnings issued or suppressed. */
+ bool recoded; /* Recoded into UTF-8? */
+ };
+
+static struct text_record *
+open_text_record (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ bool recode_to_utf8)
+{
+ struct text_record *text;
+ struct substring raw;
+
+ text = pool_alloc (r->pool, sizeof *text);
+ raw = ss_buffer (record->data, record->size * record->count);
+ text->start = record->pos;
+ text->buffer = (recode_to_utf8
+ ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
+ : raw);
+ text->pos = 0;
+ text->n_warnings = 0;
+ text->recoded = recode_to_utf8;
+
+ return text;
+}
+
+/* Closes TEXT, frees its storage, and issues a final warning
+ about suppressed warnings if necesary. */
+static void
+close_text_record (struct sfm_reader *r, struct text_record *text)
+{
+ if (text->n_warnings > MAX_TEXT_WARNINGS)
+ sys_warn (r, -1, _("Suppressed %d additional related warnings."),
+ text->n_warnings - MAX_TEXT_WARNINGS);
+ if (text->recoded)
+ pool_free (r->pool, ss_data (text->buffer));
+}
+
+/* Reads a variable=value pair from TEXT.
+ Looks up the variable in DICT and stores it into *VAR.
+ Stores a null-terminated value into *VALUE. */
+static bool
+read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text,
+ struct variable **var, char **value)
+{
+ for (;;)
+ {
+ if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
+ return false;
+
+ *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
+ if (*value == NULL)
+ return false;
+
+ text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
+ ss_buffer ("\t\0", 2));
+
+ if (*var != NULL)
+ return true;
+ }
+}
+
+static bool
+text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text, struct substring delimiters,
+ struct variable **var)
+{
+ char *name;
+
+ name = text_get_token (text, delimiters, NULL);
+ if (name == NULL)
+ return false;
+
+ *var = dict_lookup_var (dict, name);
+ if (*var != NULL)
+ return true;