1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/message.h"
49 #include "libpspp/misc.h"
50 #include "libpspp/pool.h"
51 #include "libpspp/str.h"
52 #include "libpspp/stringi-set.h"
54 #include "gl/c-strtod.h"
55 #include "gl/c-ctype.h"
56 #include "gl/inttostr.h"
57 #include "gl/localcharset.h"
58 #include "gl/minmax.h"
59 #include "gl/unlocked-io.h"
60 #include "gl/xalloc.h"
61 #include "gl/xalloc-oversized.h"
65 #define _(msgid) gettext (msgid)
66 #define N_(msgid) (msgid)
70 /* subtypes 0-2 unknown */
71 EXT_INTEGER = 3, /* Machine integer info. */
72 EXT_FLOAT = 4, /* Machine floating-point info. */
73 EXT_VAR_SETS = 5, /* Variable sets. */
74 EXT_DATE = 6, /* DATE. */
75 EXT_MRSETS = 7, /* Multiple response sets. */
76 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
77 /* subtype 9 unknown */
78 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
79 EXT_DISPLAY = 11, /* Variable display parameters. */
80 /* subtype 12 unknown */
81 EXT_LONG_NAMES = 13, /* Long variable names. */
82 EXT_LONG_STRINGS = 14, /* Long strings. */
83 /* subtype 15 unknown */
84 EXT_NCASES = 16, /* Extended number of cases. */
85 EXT_FILE_ATTRS = 17, /* Data file attributes. */
86 EXT_VAR_ATTRS = 18, /* Variable attributes. */
87 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
88 EXT_ENCODING = 20, /* Character encoding. */
89 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
90 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
91 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
94 /* Fields from the top-level header record. */
95 struct sfm_header_record
97 char magic[5]; /* First 4 bytes of file, then null. */
98 int weight_idx; /* 0 if unweighted, otherwise a var index. */
99 int nominal_case_size; /* Number of var positions. */
101 /* These correspond to the members of struct sfm_file_info or a dictionary
102 but in the system file's encoding rather than ASCII. */
103 char creation_date[10]; /* "dd mmm yy". */
104 char creation_time[9]; /* "hh:mm:ss". */
105 char eye_catcher[61]; /* Eye-catcher string, then product name. */
106 char file_label[65]; /* File label. */
109 struct sfm_var_record
116 int missing_value_code;
119 struct variable *var;
122 struct sfm_value_label
128 struct sfm_value_label_record
131 struct sfm_value_label *labels;
132 unsigned int n_labels;
138 struct sfm_document_record
145 struct sfm_extension_record
147 int subtype; /* Record subtype. */
148 off_t pos; /* Starting offset in file. */
149 size_t size; /* Size of data elements. */
150 size_t count; /* Number of data elements. */
151 void *data; /* Contents. */
154 /* System file reader. */
157 /* Resource tracking. */
158 struct pool *pool; /* All system file state. */
161 struct file_handle *fh; /* File handle. */
162 struct fh_lock *lock; /* Mutual exclusion for file handle. */
163 FILE *file; /* File stream. */
164 off_t pos; /* Position in file. */
165 bool error; /* I/O or corruption error? */
166 struct caseproto *proto; /* Format of output cases. */
169 enum integer_format integer_format; /* On-disk integer format. */
170 enum float_format float_format; /* On-disk floating point format. */
171 struct sfm_var *sfm_vars; /* Variables. */
172 size_t sfm_var_cnt; /* Number of variables. */
173 int case_cnt; /* Number of cases */
174 const char *encoding; /* String encoding. */
177 enum sfm_compression compression;
178 double bias; /* Compression bias, usually 100.0. */
179 uint8_t opcodes[8]; /* Current block of opcodes. */
180 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
181 bool corruption_warning; /* Warned about possible corruption? */
183 /* ZLIB decompression. */
184 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
185 #define ZIN_BUF_SIZE 4096
186 uint8_t *zin_buf; /* Inflation input buffer. */
187 #define ZOUT_BUF_SIZE 16384
188 uint8_t *zout_buf; /* Inflation output buffer. */
189 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
190 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
191 z_stream zstream; /* ZLIB inflater. */
194 static const struct casereader_class sys_file_casereader_class;
196 static bool close_reader (struct sfm_reader *);
198 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
199 const struct sfm_var_record *,
202 static void sys_msg (struct sfm_reader *r, off_t, int class,
203 const char *format, va_list args)
204 PRINTF_FORMAT (4, 0);
205 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
206 PRINTF_FORMAT (3, 4);
207 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
208 PRINTF_FORMAT (3, 4);
210 static bool read_bytes (struct sfm_reader *, void *, size_t)
212 static int try_read_bytes (struct sfm_reader *, void *, size_t)
214 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
215 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
216 static bool read_int64 (struct sfm_reader *, long long int *)
218 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
220 static bool read_string (struct sfm_reader *, char *, size_t)
222 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
224 /* ZLIB compressed data handling. */
225 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
226 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
227 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
228 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
230 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
232 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
234 static bool read_compressed_float (struct sfm_reader *, double *)
237 static char *fix_line_ends (const char *);
239 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
240 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
242 static bool read_variable_record (struct sfm_reader *,
243 struct sfm_var_record *);
244 static bool read_value_label_record (struct sfm_reader *,
245 struct sfm_value_label_record *,
247 static struct sfm_document_record *read_document_record (struct sfm_reader *);
248 static bool read_extension_record (struct sfm_reader *, int subtype,
249 struct sfm_extension_record **);
250 static bool skip_extension_record (struct sfm_reader *, int subtype);
252 static const char *choose_encoding (
254 const struct sfm_header_record *,
255 const struct sfm_extension_record *ext_integer,
256 const struct sfm_extension_record *ext_encoding);
258 static struct text_record *open_text_record (
259 struct sfm_reader *, const struct sfm_extension_record *,
260 bool recode_to_utf8);
261 static void close_text_record (struct sfm_reader *,
262 struct text_record *);
263 static bool read_variable_to_value_pair (struct sfm_reader *,
265 struct text_record *,
266 struct variable **var, char **value);
267 static void text_warn (struct sfm_reader *r, struct text_record *text,
268 const char *format, ...)
269 PRINTF_FORMAT (3, 4);
270 static char *text_get_token (struct text_record *,
271 struct substring delimiters, char *delimiter);
272 static bool text_match (struct text_record *, char c);
273 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
274 struct text_record *,
275 struct substring delimiters,
277 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
278 struct text_record *,
279 struct substring delimiters,
281 static const char *text_parse_counted_string (struct sfm_reader *,
282 struct text_record *);
283 static size_t text_pos (const struct text_record *);
284 static const char *text_get_all (const struct text_record *);
286 static bool close_reader (struct sfm_reader *r);
288 /* Dictionary reader. */
296 static bool read_header (struct sfm_reader *, struct sfm_read_info *,
297 struct sfm_header_record *);
298 static void parse_header (struct sfm_reader *,
299 const struct sfm_header_record *,
300 struct sfm_read_info *, struct dictionary *);
301 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
302 struct sfm_var_record *, size_t n);
303 static void parse_format_spec (struct sfm_reader *, off_t pos,
304 unsigned int format, enum which_format,
305 struct variable *, int *format_warning_cnt);
306 static void parse_document (struct dictionary *, struct sfm_document_record *);
307 static void parse_display_parameters (struct sfm_reader *,
308 const struct sfm_extension_record *,
309 struct dictionary *);
310 static bool parse_machine_integer_info (struct sfm_reader *,
311 const struct sfm_extension_record *,
312 struct sfm_read_info *);
313 static void parse_machine_float_info (struct sfm_reader *,
314 const struct sfm_extension_record *);
315 static void parse_extra_product_info (struct sfm_reader *,
316 const struct sfm_extension_record *,
317 struct sfm_read_info *);
318 static void parse_mrsets (struct sfm_reader *,
319 const struct sfm_extension_record *,
320 struct dictionary *);
321 static void parse_long_var_name_map (struct sfm_reader *,
322 const struct sfm_extension_record *,
323 struct dictionary *);
324 static bool parse_long_string_map (struct sfm_reader *,
325 const struct sfm_extension_record *,
326 struct dictionary *);
327 static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
328 const struct sfm_var_record *,
330 const struct sfm_value_label_record *);
331 static void parse_data_file_attributes (struct sfm_reader *,
332 const struct sfm_extension_record *,
333 struct dictionary *);
334 static void parse_variable_attributes (struct sfm_reader *,
335 const struct sfm_extension_record *,
336 struct dictionary *);
337 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
338 static bool parse_long_string_value_labels (struct sfm_reader *,
339 const struct sfm_extension_record *,
340 struct dictionary *);
341 static bool parse_long_string_missing_values (
342 struct sfm_reader *, const struct sfm_extension_record *,
343 struct dictionary *);
345 /* Frees the strings inside INFO. */
347 sfm_read_info_destroy (struct sfm_read_info *info)
351 free (info->creation_date);
352 free (info->creation_time);
353 free (info->product);
354 free (info->product_ext);
358 /* Opens the system file designated by file handle FH for reading. Reads the
359 system file's dictionary into *DICT.
361 Ordinarily the reader attempts to automatically detect the character
362 encoding based on the file's contents. This isn't always possible,
363 especially for files written by old versions of SPSS or PSPP, so specifying
364 a nonnull ENCODING overrides the choice of character encoding.
366 If INFO is non-null, then it receives additional info about the system file,
367 which the caller must eventually free with sfm_read_info_destroy() when it
368 is no longer needed. */
370 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
371 struct dictionary **dictp, struct sfm_read_info *infop)
373 struct sfm_reader *r = NULL;
374 struct sfm_read_info *volatile info;
376 struct sfm_header_record header;
378 struct sfm_var_record *vars;
379 size_t n_vars, allocated_vars;
381 struct sfm_value_label_record *labels;
382 size_t n_labels, allocated_labels;
384 struct sfm_document_record *document;
386 struct sfm_extension_record *extensions[32];
388 struct dictionary *dict = NULL;
391 /* Create and initialize reader. */
392 r = pool_create_container (struct sfm_reader, pool);
398 r->opcode_idx = sizeof r->opcodes;
399 r->corruption_warning = false;
400 r->zin_buf = r->zout_buf = NULL;
402 info = infop ? infop : xmalloc (sizeof *info);
403 memset (info, 0, sizeof *info);
405 /* TRANSLATORS: this fragment will be interpolated into
406 messages in fh_lock() that identify types of files. */
407 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
411 r->file = fn_open (fh_get_file_name (fh), "rb");
414 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
415 fh_get_file_name (r->fh), strerror (errno));
420 if (!read_header (r, info, &header))
424 n_vars = allocated_vars = 0;
427 n_labels = allocated_labels = 0;
431 memset (extensions, 0, sizeof extensions);
439 if (!read_int (r, &type))
444 if (!read_int (r, &dummy))
452 if (n_vars >= allocated_vars)
453 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
455 ok = read_variable_record (r, &vars[n_vars++]);
459 if (n_labels >= allocated_labels)
460 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
462 ok = read_value_label_record (r, &labels[n_labels++], n_vars);
466 /* A Type 4 record is always immediately after a type 3 record,
467 so the code for type 3 records reads the type 4 record too. */
468 sys_error (r, r->pos, _("Misplaced type 4 record."));
473 if (document != NULL)
475 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
479 document = read_document_record (r);
480 ok = document != NULL;
484 if (!read_int (r, &subtype))
486 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
489 _("Unrecognized record type 7, subtype %d. Please "
490 "send a copy of this file, and the syntax which "
491 "created it to %s."),
492 subtype, PACKAGE_BUGREPORT);
493 ok = skip_extension_record (r, subtype);
495 else if (extensions[subtype] != NULL)
498 _("Record type 7, subtype %d found here has the same "
499 "type as the record found near offset 0x%llx. "
500 "Please send a copy of this file, and the syntax "
501 "which created it to %s."),
502 subtype, (long long int) extensions[subtype]->pos,
504 ok = skip_extension_record (r, subtype);
507 ok = read_extension_record (r, subtype, &extensions[subtype]);
511 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
519 if (r->compression == SFM_COMP_ZLIB && !read_zheader (r))
522 /* Now actually parse what we read.
524 First, figure out the correct character encoding, because this determines
525 how the rest of the header data is to be interpreted. */
526 dict = dict_create (encoding
528 : choose_encoding (r, &header, extensions[EXT_INTEGER],
529 extensions[EXT_ENCODING]));
530 r->encoding = dict_get_encoding (dict);
532 /* These records don't use variables at all. */
533 if (document != NULL)
534 parse_document (dict, document);
536 if (extensions[EXT_INTEGER] != NULL
537 && !parse_machine_integer_info (r, extensions[EXT_INTEGER], info))
540 if (extensions[EXT_FLOAT] != NULL)
541 parse_machine_float_info (r, extensions[EXT_FLOAT]);
543 if (extensions[EXT_PRODUCT_INFO] != NULL)
544 parse_extra_product_info (r, extensions[EXT_PRODUCT_INFO], info);
546 if (extensions[EXT_FILE_ATTRS] != NULL)
547 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
549 parse_header (r, &header, info, dict);
551 /* Parse the variable records, the basis of almost everything else. */
552 if (!parse_variable_records (r, dict, vars, n_vars))
555 /* Parse value labels and the weight variable immediately after the variable
556 records. These records use indexes into var_recs[], so we must parse them
557 before those indexes become invalidated by very long string variables. */
558 for (i = 0; i < n_labels; i++)
559 if (!parse_value_labels (r, dict, vars, n_vars, &labels[i]))
561 if (header.weight_idx != 0)
563 struct variable *weight_var = lookup_var_by_index (r, 76, vars, n_vars,
565 if (weight_var != NULL)
567 if (var_is_numeric (weight_var))
568 dict_set_weight (dict, weight_var);
570 sys_warn (r, -1, _("Ignoring string variable `%s' set "
571 "as weighting variable."),
572 var_get_name (weight_var));
576 if (extensions[EXT_DISPLAY] != NULL)
577 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
579 /* The following records use short names, so they need to be parsed before
580 parse_long_var_name_map() changes short names to long names. */
581 if (extensions[EXT_MRSETS] != NULL)
582 parse_mrsets (r, extensions[EXT_MRSETS], dict);
584 if (extensions[EXT_MRSETS2] != NULL)
585 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
587 if (extensions[EXT_LONG_STRINGS] != NULL
588 && !parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict))
591 /* Now rename variables to their long names. */
592 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
594 /* The following records use long names, so they need to follow renaming. */
595 if (extensions[EXT_VAR_ATTRS] != NULL)
597 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
599 /* Roles use the $@Role attribute. */
600 assign_variable_roles (r, dict);
603 if (extensions[EXT_LONG_LABELS] != NULL
604 && !parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS],
607 if (extensions[EXT_LONG_MISSING] != NULL
608 && !parse_long_string_missing_values (r, extensions[EXT_LONG_MISSING],
612 /* Warn if the actual amount of data per case differs from the
613 amount that the header claims. SPSS version 13 gets this
614 wrong when very long strings are involved, so don't warn in
616 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
617 && info->version_major != 13)
618 sys_warn (r, -1, _("File header claims %d variable positions but "
619 "%zu were read from file."),
620 header.nominal_case_size, n_vars);
622 /* Create an index of dictionary variable widths for
623 sfm_read_case to use. We cannot use the `struct variable's
624 from the dictionary we created, because the caller owns the
625 dictionary and may destroy or modify its variables. */
626 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
627 pool_register (r->pool, free, r->sfm_vars);
628 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
633 sfm_read_info_destroy (info);
637 return casereader_create_sequential
639 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
640 &sys_file_casereader_class, r);
645 sfm_read_info_destroy (info);
655 /* Closes a system file after we're done with it.
656 Returns true if an I/O error has occurred on READER, false
659 close_reader (struct sfm_reader *r)
668 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
670 msg (ME, _("Error closing system file `%s': %s."),
671 fh_get_file_name (r->fh), strerror (errno));
681 pool_destroy (r->pool);
686 /* Destroys READER. */
688 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
690 struct sfm_reader *r = r_;
694 /* Returns true if FILE is an SPSS system file,
697 sfm_detect (FILE *file)
701 if (fread (magic, 4, 1, file) != 1)
705 return (!strcmp (ASCII_MAGIC, magic)
706 || !strcmp (ASCII_ZMAGIC, magic)
707 || !strcmp (EBCDIC_MAGIC, magic));
710 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
711 except for the string fields in *INFO, which parse_header() will initialize
712 later once the file's encoding is known. */
714 read_header (struct sfm_reader *r, struct sfm_read_info *info,
715 struct sfm_header_record *header)
717 uint8_t raw_layout_code[4];
722 if (!read_string (r, header->magic, sizeof header->magic)
723 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
726 if (!strcmp (ASCII_MAGIC, header->magic)
727 || !strcmp (EBCDIC_MAGIC, header->magic))
729 else if (!strcmp (ASCII_ZMAGIC, header->magic))
733 sys_error (r, 0, _("This is not an SPSS system file."));
737 /* Identify integer format. */
738 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
740 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
742 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
744 || (r->integer_format != INTEGER_MSB_FIRST
745 && r->integer_format != INTEGER_LSB_FIRST))
747 sys_error (r, 64, _("This is not an SPSS system file."));
751 if (!read_int (r, &header->nominal_case_size))
754 if (header->nominal_case_size < 0
755 || header->nominal_case_size > INT_MAX / 16)
756 header->nominal_case_size = -1;
758 if (!read_int (r, &compressed))
763 r->compression = SFM_COMP_NONE;
764 else if (compressed == 1)
765 r->compression = SFM_COMP_SIMPLE;
766 else if (compressed != 0)
768 sys_error (r, 0, "System file header has invalid compression "
769 "value %d.", compressed);
776 r->compression = SFM_COMP_ZLIB;
779 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
780 "compression value %d.", compressed);
785 if (!read_int (r, &header->weight_idx))
788 if (!read_int (r, &r->case_cnt))
790 if ( r->case_cnt > INT_MAX / 2)
793 /* Identify floating-point format and obtain compression bias. */
794 if (!read_bytes (r, raw_bias, sizeof raw_bias))
796 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
798 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
800 if (memcmp (raw_bias, zero_bias, 8))
801 sys_warn (r, r->pos - 8,
802 _("Compression bias is not the usual "
803 "value of 100, or system file uses unrecognized "
804 "floating-point format."));
807 /* Some software is known to write all-zeros to this
808 field. Such software also writes floating-point
809 numbers in the format that we expect by default
810 (it seems that all software most likely does, in
811 reality), so don't warn in this case. */
814 if (r->integer_format == INTEGER_MSB_FIRST)
815 r->float_format = FLOAT_IEEE_DOUBLE_BE;
817 r->float_format = FLOAT_IEEE_DOUBLE_LE;
819 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
821 if (!read_string (r, header->creation_date, sizeof header->creation_date)
822 || !read_string (r, header->creation_time, sizeof header->creation_time)
823 || !read_string (r, header->file_label, sizeof header->file_label)
824 || !skip_bytes (r, 3))
827 info->integer_format = r->integer_format;
828 info->float_format = r->float_format;
829 info->compression = r->compression;
830 info->case_cnt = r->case_cnt;
835 /* Reads a variable (type 2) record from R into RECORD. */
837 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
839 int has_variable_label;
841 memset (record, 0, sizeof *record);
843 record->pos = r->pos;
844 if (!read_int (r, &record->width)
845 || !read_int (r, &has_variable_label)
846 || !read_int (r, &record->missing_value_code)
847 || !read_int (r, &record->print_format)
848 || !read_int (r, &record->write_format)
849 || !read_bytes (r, record->name, sizeof record->name))
852 if (has_variable_label == 1)
854 enum { MAX_LABEL_LEN = 255 };
855 unsigned int len, read_len;
857 if (!read_uint (r, &len))
860 /* Read up to MAX_LABEL_LEN bytes of label. */
861 read_len = MIN (MAX_LABEL_LEN, len);
862 record->label = pool_malloc (r->pool, read_len + 1);
863 if (!read_string (r, record->label, read_len + 1))
866 /* Skip unread label bytes. */
867 if (!skip_bytes (r, len - read_len))
870 /* Skip label padding up to multiple of 4 bytes. */
871 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
874 else if (has_variable_label != 0)
876 sys_error (r, record->pos,
877 _("Variable label indicator field is not 0 or 1."));
881 /* Set missing values. */
882 if (record->missing_value_code != 0)
884 int code = record->missing_value_code;
885 if (record->width == 0)
887 if (code < -3 || code > 3 || code == -1)
889 sys_error (r, record->pos,
890 _("Numeric missing value indicator field is not "
891 "-3, -2, 0, 1, 2, or 3."));
897 if (code < 1 || code > 3)
899 sys_error (r, record->pos,
900 _("String missing value indicator field is not "
906 if (!read_bytes (r, record->missing, 8 * abs (code)))
913 /* Reads value labels from R into RECORD. */
915 read_value_label_record (struct sfm_reader *r,
916 struct sfm_value_label_record *record,
922 /* Read type 3 record. */
923 record->pos = r->pos;
924 if (!read_uint (r, &record->n_labels))
926 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
928 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
932 record->labels = pool_nmalloc (r->pool, record->n_labels,
933 sizeof *record->labels);
934 for (i = 0; i < record->n_labels; i++)
936 struct sfm_value_label *label = &record->labels[i];
937 unsigned char label_len;
940 if (!read_bytes (r, label->value, sizeof label->value))
943 /* Read label length. */
944 if (!read_bytes (r, &label_len, sizeof label_len))
946 padded_len = ROUND_UP (label_len + 1, 8);
948 /* Read label, padding. */
949 label->label = pool_malloc (r->pool, padded_len + 1);
950 if (!read_bytes (r, label->label, padded_len - 1))
952 label->label[label_len] = '\0';
955 /* Read record type of type 4 record. */
956 if (!read_int (r, &type))
960 sys_error (r, r->pos - 4,
961 _("Variable index record (type 4) does not immediately "
962 "follow value label record (type 3) as it should."));
966 /* Read number of variables associated with value label from type 4
968 if (!read_uint (r, &record->n_vars))
970 if (record->n_vars < 1 || record->n_vars > n_vars)
972 sys_error (r, r->pos - 4,
973 _("Number of variables associated with a value label (%zu) "
974 "is not between 1 and the number of variables (%zu)."),
975 record->n_vars, n_vars);
979 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
980 for (i = 0; i < record->n_vars; i++)
981 if (!read_int (r, &record->vars[i]))
987 /* Reads a document record from R and returns it. */
988 static struct sfm_document_record *
989 read_document_record (struct sfm_reader *r)
991 struct sfm_document_record *record;
994 record = pool_malloc (r->pool, sizeof *record);
995 record->pos = r->pos;
997 if (!read_int (r, &n_lines))
999 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1001 sys_error (r, record->pos,
1002 _("Number of document lines (%d) "
1003 "must be greater than 0 and less than %d."),
1004 n_lines, INT_MAX / DOC_LINE_LENGTH);
1008 record->n_lines = n_lines;
1009 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1010 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1017 read_extension_record_header (struct sfm_reader *r, int subtype,
1018 struct sfm_extension_record *record)
1020 record->subtype = subtype;
1021 record->pos = r->pos;
1022 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1025 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1026 allows an extra byte for a null terminator, used by some
1027 extension processing routines. */
1028 if (record->size != 0
1029 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1031 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1039 /* Reads an extension record from R into RECORD. */
1041 read_extension_record (struct sfm_reader *r, int subtype,
1042 struct sfm_extension_record **recordp)
1044 struct extension_record_type
1051 static const struct extension_record_type types[] =
1053 /* Implemented record types. */
1054 { EXT_INTEGER, 4, 8 },
1055 { EXT_FLOAT, 8, 3 },
1056 { EXT_MRSETS, 1, 0 },
1057 { EXT_PRODUCT_INFO, 1, 0 },
1058 { EXT_DISPLAY, 4, 0 },
1059 { EXT_LONG_NAMES, 1, 0 },
1060 { EXT_LONG_STRINGS, 1, 0 },
1061 { EXT_NCASES, 8, 2 },
1062 { EXT_FILE_ATTRS, 1, 0 },
1063 { EXT_VAR_ATTRS, 1, 0 },
1064 { EXT_MRSETS2, 1, 0 },
1065 { EXT_ENCODING, 1, 0 },
1066 { EXT_LONG_LABELS, 1, 0 },
1067 { EXT_LONG_MISSING, 1, 0 },
1069 /* Ignored record types. */
1070 { EXT_VAR_SETS, 0, 0 },
1072 { EXT_DATA_ENTRY, 0, 0 },
1073 { EXT_DATAVIEW, 0, 0 },
1076 const struct extension_record_type *type;
1077 struct sfm_extension_record *record;
1081 record = pool_malloc (r->pool, sizeof *record);
1082 if (!read_extension_record_header (r, subtype, record))
1084 n_bytes = record->count * record->size;
1086 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1087 if (subtype == type->subtype)
1089 if (type->size > 0 && record->size != type->size)
1090 sys_warn (r, record->pos,
1091 _("Record type 7, subtype %d has bad size %zu "
1092 "(expected %d)."), subtype, record->size, type->size);
1093 else if (type->count > 0 && record->count != type->count)
1094 sys_warn (r, record->pos,
1095 _("Record type 7, subtype %d has bad count %zu "
1096 "(expected %d)."), subtype, record->count, type->count);
1097 else if (type->count == 0 && type->size == 0)
1099 /* Ignore this record. */
1103 char *data = pool_malloc (r->pool, n_bytes + 1);
1104 data[n_bytes] = '\0';
1106 record->data = data;
1107 if (!read_bytes (r, record->data, n_bytes))
1116 sys_warn (r, record->pos,
1117 _("Unrecognized record type 7, subtype %d. Please send a "
1118 "copy of this file, and the syntax which created it to %s."),
1119 subtype, PACKAGE_BUGREPORT);
1122 return skip_bytes (r, n_bytes);
1126 skip_extension_record (struct sfm_reader *r, int subtype)
1128 struct sfm_extension_record record;
1130 return (read_extension_record_header (r, subtype, &record)
1131 && skip_bytes (r, record.count * record.size));
1135 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1136 struct sfm_read_info *info, struct dictionary *dict)
1138 const char *dict_encoding = dict_get_encoding (dict);
1139 struct substring product;
1140 struct substring label;
1143 /* Convert file label to UTF-8 and put it into DICT. */
1144 label = recode_substring_pool ("UTF-8", dict_encoding,
1145 ss_cstr (header->file_label), r->pool);
1146 ss_trim (&label, ss_cstr (" "));
1147 label.string[label.length] = '\0';
1148 fixed_label = fix_line_ends (label.string);
1149 dict_set_label (dict, fixed_label);
1152 /* Put creation date and time in UTF-8 into INFO. */
1153 info->creation_date = recode_string ("UTF-8", dict_encoding,
1154 header->creation_date, -1);
1155 info->creation_time = recode_string ("UTF-8", dict_encoding,
1156 header->creation_time, -1);
1158 /* Put product name into INFO, dropping eye-catcher string if present. */
1159 product = recode_substring_pool ("UTF-8", dict_encoding,
1160 ss_cstr (header->eye_catcher), r->pool);
1161 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1162 ss_trim (&product, ss_cstr (" "));
1163 info->product = ss_xstrdup (product);
1166 /* Reads a variable (type 2) record from R and adds the
1167 corresponding variable to DICT.
1168 Also skips past additional variable records for long string
1171 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1172 struct sfm_var_record *var_recs, size_t n_var_recs)
1174 const char *dict_encoding = dict_get_encoding (dict);
1175 struct sfm_var_record *rec;
1178 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1180 struct variable *var;
1185 name = recode_string_pool ("UTF-8", dict_encoding,
1186 rec->name, 8, r->pool);
1187 name[strcspn (name, " ")] = '\0';
1189 if (!dict_id_is_valid (dict, name, false)
1190 || name[0] == '$' || name[0] == '#')
1192 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1196 if (rec->width < 0 || rec->width > 255)
1198 sys_error (r, rec->pos,
1199 _("Bad width %d for variable %s."), rec->width, name);
1203 var = rec->var = dict_create_var (dict, name, rec->width);
1206 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1207 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1210 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1214 /* Set the short name the same as the long name. */
1215 var_set_short_name (var, 0, name);
1217 /* Get variable label, if any. */
1222 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1223 rec->label, -1, r->pool);
1224 var_set_label (var, utf8_label, false);
1227 /* Set missing values. */
1228 if (rec->missing_value_code != 0)
1230 int width = var_get_width (var);
1231 struct missing_values mv;
1233 mv_init_pool (r->pool, &mv, width);
1234 if (var_is_numeric (var))
1236 bool has_range = rec->missing_value_code < 0;
1237 int n_discrete = (has_range
1238 ? rec->missing_value_code == -3
1239 : rec->missing_value_code);
1244 double low = parse_float (r, rec->missing, 0);
1245 double high = parse_float (r, rec->missing, 8);
1247 /* Deal with SPSS 21 change in representation. */
1251 mv_add_range (&mv, low, high);
1255 for (i = 0; i < n_discrete; i++)
1257 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1265 value_init_pool (r->pool, &value, width);
1266 value_set_missing (&value, width);
1267 for (i = 0; i < rec->missing_value_code; i++)
1268 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1270 var_set_missing_values (var, &mv);
1274 parse_format_spec (r, rec->pos + 12, rec->print_format,
1275 PRINT_FORMAT, var, &n_warnings);
1276 parse_format_spec (r, rec->pos + 16, rec->write_format,
1277 WRITE_FORMAT, var, &n_warnings);
1279 /* Account for values.
1280 Skip long string continuation records, if any. */
1281 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1282 for (i = 1; i < n_values; i++)
1283 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1285 sys_error (r, rec->pos, _("Missing string continuation record."));
1294 /* Translates the format spec from sysfile format to internal
1297 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1298 enum which_format which, struct variable *v,
1301 const int max_warnings = 8;
1302 uint8_t raw_type = format >> 16;
1303 uint8_t w = format >> 8;
1312 ok = (fmt_from_io (raw_type, &f.type)
1313 && fmt_check_output (&f)
1314 && fmt_check_width_compat (&f, var_get_width (v)));
1319 if (which == PRINT_FORMAT)
1320 var_set_print_format (v, &f);
1322 var_set_write_format (v, &f);
1324 else if (format == 0)
1326 /* Actually observed in the wild. No point in warning about it. */
1328 else if (++*n_warnings <= max_warnings)
1330 if (which == PRINT_FORMAT)
1331 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1333 var_get_name (v), var_get_width (v), format);
1335 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1337 var_get_name (v), var_get_width (v), format);
1339 if (*n_warnings == max_warnings)
1340 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1345 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1349 for (p = record->documents;
1350 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1351 p += DOC_LINE_LENGTH)
1353 struct substring line;
1355 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1356 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1357 ss_rtrim (&line, ss_cstr (" "));
1358 line.string[line.length] = '\0';
1360 dict_add_document_line (dict, line.string, false);
1366 /* Parses record type 7, subtype 3. */
1368 parse_machine_integer_info (struct sfm_reader *r,
1369 const struct sfm_extension_record *record,
1370 struct sfm_read_info *info)
1372 int float_representation, expected_float_format;
1373 int integer_representation, expected_integer_format;
1375 /* Save version info. */
1376 info->version_major = parse_int (r, record->data, 0);
1377 info->version_minor = parse_int (r, record->data, 4);
1378 info->version_revision = parse_int (r, record->data, 8);
1380 /* Check floating point format. */
1381 float_representation = parse_int (r, record->data, 16);
1382 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1383 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1384 expected_float_format = 1;
1385 else if (r->float_format == FLOAT_Z_LONG)
1386 expected_float_format = 2;
1387 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1388 expected_float_format = 3;
1391 if (float_representation != expected_float_format)
1393 sys_error (r, record->pos,
1394 _("Floating-point representation indicated by "
1395 "system file (%d) differs from expected (%d)."),
1396 float_representation, expected_float_format);
1400 /* Check integer format. */
1401 integer_representation = parse_int (r, record->data, 24);
1402 if (r->integer_format == INTEGER_MSB_FIRST)
1403 expected_integer_format = 1;
1404 else if (r->integer_format == INTEGER_LSB_FIRST)
1405 expected_integer_format = 2;
1408 if (integer_representation != expected_integer_format)
1409 sys_warn (r, record->pos,
1410 _("Integer format indicated by system file (%d) "
1411 "differs from expected (%d)."),
1412 integer_representation, expected_integer_format);
1418 choose_encoding (struct sfm_reader *r,
1419 const struct sfm_header_record *header,
1420 const struct sfm_extension_record *ext_integer,
1421 const struct sfm_extension_record *ext_encoding)
1423 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1426 return ext_encoding->data;
1428 /* But EXT_INTEGER is better than nothing as a fallback. */
1431 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1432 const char *encoding;
1441 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1442 respectively. However, there are known to be many files in the wild
1443 with character code 2, yet have data which are clearly not ASCII.
1444 Therefore we ignore these values. */
1451 encoding = sys_get_encoding_from_codepage (codepage);
1452 if (encoding != NULL)
1458 /* If the file magic number is EBCDIC then its character data is too. */
1459 if (!strcmp (header->magic, EBCDIC_MAGIC))
1462 return locale_charset ();
1465 /* Parses record type 7, subtype 4. */
1467 parse_machine_float_info (struct sfm_reader *r,
1468 const struct sfm_extension_record *record)
1470 double sysmis = parse_float (r, record->data, 0);
1471 double highest = parse_float (r, record->data, 8);
1472 double lowest = parse_float (r, record->data, 16);
1474 if (sysmis != SYSMIS)
1475 sys_warn (r, record->pos,
1476 _("File specifies unexpected value %g (%a) as %s, "
1477 "instead of %g (%a)."),
1478 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1480 if (highest != HIGHEST)
1481 sys_warn (r, record->pos,
1482 _("File specifies unexpected value %g (%a) as %s, "
1483 "instead of %g (%a)."),
1484 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1486 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1487 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1488 appears in a context (missing values) where SYSMIS cannot. */
1489 if (lowest != LOWEST && lowest != SYSMIS)
1490 sys_warn (r, record->pos,
1491 _("File specifies unexpected value %g (%a) as %s, "
1492 "instead of %g (%a) or %g (%a)."),
1493 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1496 /* Parses record type 7, subtype 10. */
1498 parse_extra_product_info (struct sfm_reader *r,
1499 const struct sfm_extension_record *record,
1500 struct sfm_read_info *info)
1502 struct text_record *text;
1504 text = open_text_record (r, record, true);
1505 info->product_ext = fix_line_ends (text_get_all (text));
1506 close_text_record (r, text);
1509 /* Parses record type 7, subtype 7 or 19. */
1511 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1512 struct dictionary *dict)
1514 struct text_record *text;
1515 struct mrset *mrset;
1517 text = open_text_record (r, record, false);
1520 const char *counted = NULL;
1523 struct stringi_set var_names;
1524 size_t allocated_vars;
1528 /* Skip extra line feeds if present. */
1529 while (text_match (text, '\n'))
1532 mrset = xzalloc (sizeof *mrset);
1534 name = text_get_token (text, ss_cstr ("="), NULL);
1537 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1539 if (mrset->name[0] != '$')
1541 sys_warn (r, record->pos,
1542 _("`%s' does not begin with `$' at offset %zu "
1543 "in MRSETS record."), mrset->name, text_pos (text));
1547 if (text_match (text, 'C'))
1549 mrset->type = MRSET_MC;
1550 if (!text_match (text, ' '))
1552 sys_warn (r, record->pos,
1553 _("Missing space following `%c' at offset %zu "
1554 "in MRSETS record."), 'C', text_pos (text));
1558 else if (text_match (text, 'D'))
1560 mrset->type = MRSET_MD;
1561 mrset->cat_source = MRSET_VARLABELS;
1563 else if (text_match (text, 'E'))
1567 mrset->type = MRSET_MD;
1568 mrset->cat_source = MRSET_COUNTEDVALUES;
1569 if (!text_match (text, ' '))
1571 sys_warn (r, record->pos,
1572 _("Missing space following `%c' at offset %zu "
1573 "in MRSETS record."), 'E', text_pos (text));
1577 number = text_get_token (text, ss_cstr (" "), NULL);
1578 if (!strcmp (number, "11"))
1579 mrset->label_from_var_label = true;
1580 else if (strcmp (number, "1"))
1581 sys_warn (r, record->pos,
1582 _("Unexpected label source value `%s' following `E' "
1583 "at offset %zu in MRSETS record."),
1584 number, text_pos (text));
1588 sys_warn (r, record->pos,
1589 _("Missing `C', `D', or `E' at offset %zu "
1590 "in MRSETS record."),
1595 if (mrset->type == MRSET_MD)
1597 counted = text_parse_counted_string (r, text);
1598 if (counted == NULL)
1602 label = text_parse_counted_string (r, text);
1605 if (label[0] != '\0')
1606 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1608 stringi_set_init (&var_names);
1613 const char *raw_var_name;
1614 struct variable *var;
1617 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1618 if (raw_var_name == NULL)
1620 if (delimiter != '\n')
1621 sys_warn (r, record->pos,
1622 _("Missing new-line parsing variable names "
1623 "at offset %zu in MRSETS record."),
1627 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1629 var = dict_lookup_var (dict, var_name);
1635 if (!stringi_set_insert (&var_names, var_name))
1637 sys_warn (r, record->pos,
1638 _("Duplicate variable name %s "
1639 "at offset %zu in MRSETS record."),
1640 var_name, text_pos (text));
1646 if (mrset->label == NULL && mrset->label_from_var_label
1647 && var_has_label (var))
1648 mrset->label = xstrdup (var_get_label (var));
1651 && var_get_type (var) != var_get_type (mrset->vars[0]))
1653 sys_warn (r, record->pos,
1654 _("MRSET %s contains both string and "
1655 "numeric variables."), name);
1658 width = MIN (width, var_get_width (var));
1660 if (mrset->n_vars >= allocated_vars)
1661 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1662 sizeof *mrset->vars);
1663 mrset->vars[mrset->n_vars++] = var;
1665 while (delimiter != '\n');
1667 if (mrset->n_vars < 2)
1669 sys_warn (r, record->pos,
1670 _("MRSET %s has only %zu variables."), mrset->name,
1672 mrset_destroy (mrset);
1673 stringi_set_destroy (&var_names);
1677 if (mrset->type == MRSET_MD)
1679 mrset->width = width;
1680 value_init (&mrset->counted, width);
1682 mrset->counted.f = c_strtod (counted, NULL);
1684 value_copy_str_rpad (&mrset->counted, width,
1685 (const uint8_t *) counted, ' ');
1688 dict_add_mrset (dict, mrset);
1690 stringi_set_destroy (&var_names);
1692 mrset_destroy (mrset);
1693 close_text_record (r, text);
1696 /* Read record type 7, subtype 11, which specifies how variables
1697 should be displayed in GUI environments. */
1699 parse_display_parameters (struct sfm_reader *r,
1700 const struct sfm_extension_record *record,
1701 struct dictionary *dict)
1703 bool includes_width;
1704 bool warned = false;
1709 n_vars = dict_get_var_cnt (dict);
1710 if (record->count == 3 * n_vars)
1711 includes_width = true;
1712 else if (record->count == 2 * n_vars)
1713 includes_width = false;
1716 sys_warn (r, record->pos,
1717 _("Extension 11 has bad count %zu (for %zu variables)."),
1718 record->count, n_vars);
1723 for (i = 0; i < n_vars; ++i)
1725 struct variable *v = dict_get_var (dict, i);
1726 int measure, width, align;
1728 measure = parse_int (r, record->data, ofs);
1733 width = parse_int (r, record->data, ofs);
1739 align = parse_int (r, record->data, ofs);
1742 /* SPSS sometimes seems to set variables' measure to zero. */
1746 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1749 sys_warn (r, record->pos,
1750 _("Invalid variable display parameters for variable "
1751 "%zu (%s). Default parameters substituted."),
1752 i, var_get_name (v));
1757 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1758 : measure == 2 ? MEASURE_ORDINAL
1760 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1761 : align == 1 ? ALIGN_RIGHT
1764 /* Older versions (SPSS 9.0) sometimes set the display
1765 width to zero. This causes confusion in the GUI, so
1766 only set the width if it is nonzero. */
1768 var_set_display_width (v, width);
1773 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1774 const char *new_name)
1776 size_t n_short_names;
1780 /* Renaming a variable may clear its short names, but we
1781 want to retain them, so we save them and re-set them
1783 n_short_names = var_get_short_name_cnt (var);
1784 short_names = xnmalloc (n_short_names, sizeof *short_names);
1785 for (i = 0; i < n_short_names; i++)
1787 const char *s = var_get_short_name (var, i);
1788 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1791 /* Set long name. */
1792 dict_rename_var (dict, var, new_name);
1794 /* Restore short names. */
1795 for (i = 0; i < n_short_names; i++)
1797 var_set_short_name (var, i, short_names[i]);
1798 free (short_names[i]);
1803 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1804 to each short name. Modifies variable names in DICT accordingly. */
1806 parse_long_var_name_map (struct sfm_reader *r,
1807 const struct sfm_extension_record *record,
1808 struct dictionary *dict)
1810 struct text_record *text;
1811 struct variable *var;
1816 /* There are no long variable names. Use the short variable names,
1817 converted to lowercase, as the long variable names. */
1820 for (i = 0; i < dict_get_var_cnt (dict); i++)
1822 struct variable *var = dict_get_var (dict, i);
1825 new_name = utf8_to_lower (var_get_name (var));
1826 rename_var_and_save_short_names (dict, var, new_name);
1833 /* Rename each of the variables, one by one. (In a correctly constructed
1834 system file, this cannot create any intermediate duplicate variable names,
1835 because all of the new variable names are longer than any of the old
1836 variable names and thus there cannot be any overlaps.) */
1837 text = open_text_record (r, record, true);
1838 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1840 /* Validate long name. */
1841 if (!dict_id_is_valid (dict, long_name, false))
1843 sys_warn (r, record->pos,
1844 _("Long variable mapping from %s to invalid "
1845 "variable name `%s'."),
1846 var_get_name (var), long_name);
1850 /* Identify any duplicates. */
1851 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1852 && dict_lookup_var (dict, long_name) != NULL)
1854 sys_warn (r, record->pos,
1855 _("Duplicate long variable name `%s'."), long_name);
1859 rename_var_and_save_short_names (dict, var, long_name);
1861 close_text_record (r, text);
1864 /* Reads record type 7, subtype 14, which gives the real length
1865 of each very long string. Rearranges DICT accordingly. */
1867 parse_long_string_map (struct sfm_reader *r,
1868 const struct sfm_extension_record *record,
1869 struct dictionary *dict)
1871 struct text_record *text;
1872 struct variable *var;
1875 text = open_text_record (r, record, true);
1876 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1878 size_t idx = var_get_dict_index (var);
1884 length = strtol (length_s, NULL, 10);
1885 if (length < 1 || length > MAX_STRING)
1887 sys_warn (r, record->pos,
1888 _("%s listed as string of invalid length %s "
1889 "in very long string record."),
1890 var_get_name (var), length_s);
1894 /* Check segments. */
1895 segment_cnt = sfm_width_to_segments (length);
1896 if (segment_cnt == 1)
1898 sys_warn (r, record->pos,
1899 _("%s listed in very long string record with width %s, "
1900 "which requires only one segment."),
1901 var_get_name (var), length_s);
1904 if (idx + segment_cnt > dict_get_var_cnt (dict))
1906 sys_error (r, record->pos,
1907 _("Very long string %s overflows dictionary."),
1908 var_get_name (var));
1912 /* Get the short names from the segments and check their
1914 for (i = 0; i < segment_cnt; i++)
1916 struct variable *seg = dict_get_var (dict, idx + i);
1917 int alloc_width = sfm_segment_alloc_width (length, i);
1918 int width = var_get_width (seg);
1921 var_set_short_name (var, i, var_get_short_name (seg, 0));
1922 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1924 sys_error (r, record->pos,
1925 _("Very long string with width %ld has segment %d "
1926 "of width %d (expected %d)."),
1927 length, i, width, alloc_width);
1931 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1932 var_set_width (var, length);
1934 close_text_record (r, text);
1935 dict_compact_values (dict);
1941 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1942 const struct sfm_var_record *var_recs, size_t n_var_recs,
1943 const struct sfm_value_label_record *record)
1945 struct variable **vars;
1949 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
1950 for (i = 0; i < record->n_labels; i++)
1951 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1952 record->labels[i].label, -1,
1955 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1956 for (i = 0; i < record->n_vars; i++)
1958 vars[i] = lookup_var_by_index (r, record->pos,
1959 var_recs, n_var_recs, record->vars[i]);
1960 if (vars[i] == NULL)
1964 for (i = 1; i < record->n_vars; i++)
1965 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1967 sys_error (r, record->pos,
1968 _("Variables associated with value label are not all of "
1969 "identical type. Variable %s is %s, but variable "
1971 var_get_name (vars[0]),
1972 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1973 var_get_name (vars[i]),
1974 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1978 for (i = 0; i < record->n_vars; i++)
1980 struct variable *var = vars[i];
1984 width = var_get_width (var);
1987 sys_error (r, record->pos,
1988 _("Value labels may not be added to long string "
1989 "variables (e.g. %s) using records types 3 and 4."),
1990 var_get_name (var));
1994 for (j = 0; j < record->n_labels; j++)
1996 struct sfm_value_label *label = &record->labels[j];
1999 value_init (&value, width);
2001 value.f = parse_float (r, label->value, 0);
2003 memcpy (value_str_rw (&value, width), label->value, width);
2005 if (!var_add_value_label (var, &value, utf8_labels[j]))
2007 if (var_is_numeric (var))
2008 sys_warn (r, record->pos,
2009 _("Duplicate value label for %g on %s."),
2010 value.f, var_get_name (var));
2012 sys_warn (r, record->pos,
2013 _("Duplicate value label for `%.*s' on %s."),
2014 width, value_str (&value, width),
2015 var_get_name (var));
2018 value_destroy (&value, width);
2022 pool_free (r->pool, vars);
2023 for (i = 0; i < record->n_labels; i++)
2024 pool_free (r->pool, utf8_labels[i]);
2025 pool_free (r->pool, utf8_labels);
2030 static struct variable *
2031 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2032 const struct sfm_var_record *var_recs, size_t n_var_recs,
2035 const struct sfm_var_record *rec;
2037 if (idx < 1 || idx > n_var_recs)
2039 sys_error (r, offset,
2040 _("Variable index %d not in valid range 1...%zu."),
2045 rec = &var_recs[idx - 1];
2046 if (rec->var == NULL)
2048 sys_error (r, offset,
2049 _("Variable index %d refers to long string continuation."),
2057 /* Parses a set of custom attributes from TEXT into ATTRS.
2058 ATTRS may be a null pointer, in which case the attributes are
2059 read but discarded. */
2061 parse_attributes (struct sfm_reader *r, struct text_record *text,
2062 struct attrset *attrs)
2066 struct attribute *attr;
2070 /* Parse the key. */
2071 key = text_get_token (text, ss_cstr ("("), NULL);
2075 attr = attribute_create (key);
2076 for (index = 1; ; index++)
2078 /* Parse the value. */
2082 value = text_get_token (text, ss_cstr ("\n"), NULL);
2085 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2090 length = strlen (value);
2091 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
2093 value[length - 1] = '\0';
2094 attribute_add_value (attr, value + 1);
2099 _("Attribute value %s[%d] is not quoted: %s."),
2101 attribute_add_value (attr, value);
2104 /* Was this the last value for this attribute? */
2105 if (text_match (text, ')'))
2109 attrset_add (attrs, attr);
2111 attribute_destroy (attr);
2113 while (!text_match (text, '/'));
2116 /* Reads record type 7, subtype 17, which lists custom
2117 attributes on the data file. */
2119 parse_data_file_attributes (struct sfm_reader *r,
2120 const struct sfm_extension_record *record,
2121 struct dictionary *dict)
2123 struct text_record *text = open_text_record (r, record, true);
2124 parse_attributes (r, text, dict_get_attributes (dict));
2125 close_text_record (r, text);
2128 /* Parses record type 7, subtype 18, which lists custom
2129 attributes on individual variables. */
2131 parse_variable_attributes (struct sfm_reader *r,
2132 const struct sfm_extension_record *record,
2133 struct dictionary *dict)
2135 struct text_record *text;
2136 struct variable *var;
2138 text = open_text_record (r, record, true);
2139 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2140 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2141 close_text_record (r, text);
2145 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2147 size_t n_warnings = 0;
2150 for (i = 0; i < dict_get_var_cnt (dict); i++)
2152 struct variable *var = dict_get_var (dict, i);
2153 struct attrset *attrs = var_get_attributes (var);
2154 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2157 int value = atoi (attribute_get_value (attr, 0));
2179 role = ROLE_PARTITION;
2188 if (n_warnings++ == 0)
2189 sys_warn (r, -1, _("Invalid role for variable %s."),
2190 var_get_name (var));
2193 var_set_role (var, role);
2198 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2203 check_overflow (struct sfm_reader *r,
2204 const struct sfm_extension_record *record,
2205 size_t ofs, size_t length)
2207 size_t end = record->size * record->count;
2208 if (length >= end || ofs + length > end)
2210 sys_error (r, record->pos + end,
2211 _("Extension record subtype %d ends unexpectedly."),
2219 parse_long_string_value_labels (struct sfm_reader *r,
2220 const struct sfm_extension_record *record,
2221 struct dictionary *dict)
2223 const char *dict_encoding = dict_get_encoding (dict);
2224 size_t end = record->size * record->count;
2231 struct variable *var;
2236 /* Parse variable name length. */
2237 if (!check_overflow (r, record, ofs, 4))
2239 var_name_len = parse_int (r, record->data, ofs);
2242 /* Parse variable name, width, and number of labels. */
2243 if (!check_overflow (r, record, ofs, var_name_len + 8))
2245 var_name = recode_string_pool ("UTF-8", dict_encoding,
2246 (const char *) record->data + ofs,
2247 var_name_len, r->pool);
2248 width = parse_int (r, record->data, ofs + var_name_len);
2249 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2250 ofs += var_name_len + 8;
2252 /* Look up 'var' and validate. */
2253 var = dict_lookup_var (dict, var_name);
2255 sys_warn (r, record->pos + ofs,
2256 _("Ignoring long string value label record for "
2257 "unknown variable %s."), var_name);
2258 else if (var_is_numeric (var))
2260 sys_warn (r, record->pos + ofs,
2261 _("Ignoring long string value label record for "
2262 "numeric variable %s."), var_name);
2265 else if (width != var_get_width (var))
2267 sys_warn (r, record->pos + ofs,
2268 _("Ignoring long string value label record for variable "
2269 "%s because the record's width (%d) does not match the "
2270 "variable's width (%d)."),
2271 var_name, width, var_get_width (var));
2276 value_init_pool (r->pool, &value, width);
2277 for (i = 0; i < n_labels; i++)
2279 size_t value_length, label_length;
2280 bool skip = var == NULL;
2282 /* Parse value length. */
2283 if (!check_overflow (r, record, ofs, 4))
2285 value_length = parse_int (r, record->data, ofs);
2289 if (!check_overflow (r, record, ofs, value_length))
2293 if (value_length == width)
2294 memcpy (value_str_rw (&value, width),
2295 (const uint8_t *) record->data + ofs, width);
2298 sys_warn (r, record->pos + ofs,
2299 _("Ignoring long string value label %zu for "
2300 "variable %s, with width %d, that has bad value "
2302 i, var_get_name (var), width, value_length);
2306 ofs += value_length;
2308 /* Parse label length. */
2309 if (!check_overflow (r, record, ofs, 4))
2311 label_length = parse_int (r, record->data, ofs);
2315 if (!check_overflow (r, record, ofs, label_length))
2321 label = recode_string_pool ("UTF-8", dict_encoding,
2322 (const char *) record->data + ofs,
2323 label_length, r->pool);
2324 if (!var_add_value_label (var, &value, label))
2325 sys_warn (r, record->pos + ofs,
2326 _("Duplicate value label for `%.*s' on %s."),
2327 width, value_str (&value, width),
2328 var_get_name (var));
2329 pool_free (r->pool, label);
2331 ofs += label_length;
2339 parse_long_string_missing_values (struct sfm_reader *r,
2340 const struct sfm_extension_record *record,
2341 struct dictionary *dict)
2343 const char *dict_encoding = dict_get_encoding (dict);
2344 size_t end = record->size * record->count;
2349 struct missing_values mv;
2351 struct variable *var;
2352 int n_missing_values;
2356 /* Parse variable name length. */
2357 if (!check_overflow (r, record, ofs, 4))
2359 var_name_len = parse_int (r, record->data, ofs);
2362 /* Parse variable name. */
2363 if (!check_overflow (r, record, ofs, var_name_len + 1))
2365 var_name = recode_string_pool ("UTF-8", dict_encoding,
2366 (const char *) record->data + ofs,
2367 var_name_len, r->pool);
2368 ofs += var_name_len;
2370 /* Parse number of missing values. */
2371 n_missing_values = ((const uint8_t *) record->data)[ofs];
2372 if (n_missing_values < 1 || n_missing_values > 3)
2373 sys_warn (r, record->pos + ofs,
2374 _("Long string missing values record says variable %s "
2375 "has %d missing values, but only 1 to 3 missing values "
2377 var_name, n_missing_values);
2380 /* Look up 'var' and validate. */
2381 var = dict_lookup_var (dict, var_name);
2383 sys_warn (r, record->pos + ofs,
2384 _("Ignoring long string missing value record for "
2385 "unknown variable %s."), var_name);
2386 else if (var_is_numeric (var))
2388 sys_warn (r, record->pos + ofs,
2389 _("Ignoring long string missing value record for "
2390 "numeric variable %s."), var_name);
2395 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2396 for (i = 0; i < n_missing_values; i++)
2398 size_t value_length;
2400 /* Parse value length. */
2401 if (!check_overflow (r, record, ofs, 4))
2403 value_length = parse_int (r, record->data, ofs);
2407 if (!check_overflow (r, record, ofs, value_length))
2411 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2413 sys_warn (r, record->pos + ofs,
2414 _("Ignoring long string missing value %zu for variable "
2415 "%s, with width %d, that has bad value width %zu."),
2416 i, var_get_name (var), var_get_width (var),
2418 ofs += value_length;
2421 var_set_missing_values (var, &mv);
2429 static void partial_record (struct sfm_reader *);
2431 static void read_error (struct casereader *, const struct sfm_reader *);
2433 static bool read_case_number (struct sfm_reader *, double *);
2434 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2435 static int read_opcode (struct sfm_reader *);
2436 static bool read_compressed_number (struct sfm_reader *, double *);
2437 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2438 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2439 static bool skip_whole_strings (struct sfm_reader *, size_t);
2441 /* Reads and returns one case from READER's file. Returns a null
2442 pointer if not successful. */
2443 static struct ccase *
2444 sys_file_casereader_read (struct casereader *reader, void *r_)
2446 struct sfm_reader *r = r_;
2454 c = case_create (r->proto);
2456 for (i = 0; i < r->sfm_var_cnt; i++)
2458 struct sfm_var *sv = &r->sfm_vars[i];
2459 union value *v = case_data_rw_idx (c, sv->case_index);
2461 if (sv->var_width == 0)
2462 retval = read_case_number (r, &v->f);
2465 uint8_t *s = value_str_rw (v, sv->var_width);
2466 retval = read_case_string (r, s + sv->offset, sv->segment_width);
2469 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2471 sys_error (r, r->pos, _("File ends in partial string value."));
2483 if (r->case_cnt != -1)
2484 read_error (reader, r);
2489 /* Issues an error that R ends in a partial record. */
2491 partial_record (struct sfm_reader *r)
2493 sys_error (r, r->pos, _("File ends in partial case."));
2496 /* Issues an error that an unspecified error occurred SFM, and
2499 read_error (struct casereader *r, const struct sfm_reader *sfm)
2501 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2502 casereader_force_error (r);
2505 /* Reads a number from R and stores its value in *D.
2506 If R is compressed, reads a compressed number;
2507 otherwise, reads a number in the regular way.
2508 Returns true if successful, false if end of file is
2509 reached immediately. */
2511 read_case_number (struct sfm_reader *r, double *d)
2513 if (r->compression == SFM_COMP_NONE)
2516 if (!try_read_bytes (r, number, sizeof number))
2518 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2522 return read_compressed_number (r, d);
2525 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2526 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2527 discarded without being written to S. Reads compressed strings if S is
2528 compressed. Returns 1 if successful, 0 if end of file is reached
2529 immediately, or -1 for some kind of error. */
2531 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2533 size_t whole = ROUND_DOWN (length, 8);
2534 size_t partial = length % 8;
2538 int retval = read_whole_strings (r, s, whole);
2546 int retval = read_whole_strings (r, bounce, sizeof bounce);
2558 memcpy (s + whole, bounce, partial);
2564 /* Reads and returns the next compression opcode from R. */
2566 read_opcode (struct sfm_reader *r)
2568 assert (r->compression != SFM_COMP_NONE);
2572 if (r->opcode_idx >= sizeof r->opcodes)
2575 int retval = try_read_compressed_bytes (r, r->opcodes,
2581 opcode = r->opcodes[r->opcode_idx++];
2588 /* Reads a compressed number from R and stores its value in D.
2589 Returns true if successful, false if end of file is
2590 reached immediately. */
2592 read_compressed_number (struct sfm_reader *r, double *d)
2594 int opcode = read_opcode (r);
2602 return read_compressed_float (r, d);
2605 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2606 if (!r->corruption_warning)
2608 r->corruption_warning = true;
2609 sys_warn (r, r->pos,
2610 _("Possible compressed data corruption: "
2611 "compressed spaces appear in numeric field."));
2620 *d = opcode - r->bias;
2627 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2629 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2634 opcode = read_opcode (r);
2642 retval = read_compressed_bytes (r, dst, 8);
2643 return retval == 1 ? 1 : -1;
2646 memset (dst, ' ', 8);
2651 double value = opcode - r->bias;
2652 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2655 /* This has actually been seen "in the wild". The submitter of the
2656 file that showed that the contents decoded as spaces, but they
2657 were at the end of the field so it's possible that the null
2658 bytes just acted as null terminators. */
2660 else if (!r->corruption_warning)
2662 r->corruption_warning = true;
2663 sys_warn (r, r->pos,
2664 _("Possible compressed data corruption: "
2665 "string contains compressed integer (opcode %d)."),
2673 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2674 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2675 end of file is reached immediately, or -1 for some kind of error. */
2677 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2679 assert (length % 8 == 0);
2680 if (r->compression == SFM_COMP_NONE)
2681 return try_read_bytes (r, s, length);
2686 for (ofs = 0; ofs < length; ofs += 8)
2688 int retval = read_compressed_string (r, s + ofs);
2703 /* Skips LENGTH string bytes from R.
2704 LENGTH must be a multiple of 8.
2705 (LENGTH is also limited to 1024, but that's only because the
2706 current caller never needs more than that many bytes.)
2707 Returns true if successful, false if end of file is
2708 reached immediately. */
2710 skip_whole_strings (struct sfm_reader *r, size_t length)
2712 uint8_t buffer[1024];
2713 assert (length < sizeof buffer);
2714 return read_whole_strings (r, buffer, length);
2717 /* Helpers for reading records that contain structured text
2720 /* Maximum number of warnings to issue for a single text
2722 #define MAX_TEXT_WARNINGS 5
2727 struct substring buffer; /* Record contents. */
2728 off_t start; /* Starting offset in file. */
2729 size_t pos; /* Current position in buffer. */
2730 int n_warnings; /* Number of warnings issued or suppressed. */
2731 bool recoded; /* Recoded into UTF-8? */
2734 static struct text_record *
2735 open_text_record (struct sfm_reader *r,
2736 const struct sfm_extension_record *record,
2737 bool recode_to_utf8)
2739 struct text_record *text;
2740 struct substring raw;
2742 text = pool_alloc (r->pool, sizeof *text);
2743 raw = ss_buffer (record->data, record->size * record->count);
2744 text->start = record->pos;
2745 text->buffer = (recode_to_utf8
2746 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2749 text->n_warnings = 0;
2750 text->recoded = recode_to_utf8;
2755 /* Closes TEXT, frees its storage, and issues a final warning
2756 about suppressed warnings if necesary. */
2758 close_text_record (struct sfm_reader *r, struct text_record *text)
2760 if (text->n_warnings > MAX_TEXT_WARNINGS)
2761 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2762 text->n_warnings - MAX_TEXT_WARNINGS);
2764 pool_free (r->pool, ss_data (text->buffer));
2767 /* Reads a variable=value pair from TEXT.
2768 Looks up the variable in DICT and stores it into *VAR.
2769 Stores a null-terminated value into *VALUE. */
2771 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2772 struct text_record *text,
2773 struct variable **var, char **value)
2777 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2780 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2784 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2785 ss_buffer ("\t\0", 2));
2793 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2794 struct text_record *text, struct substring delimiters,
2795 struct variable **var)
2799 name = text_get_token (text, delimiters, NULL);
2803 *var = dict_lookup_var (dict, name);
2807 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2814 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2815 struct text_record *text, struct substring delimiters,
2816 struct variable **var)
2818 char *short_name = text_get_token (text, delimiters, NULL);
2819 if (short_name == NULL)
2822 *var = dict_lookup_var (dict, short_name);
2824 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2829 /* Displays a warning for the current file position, limiting the
2830 number to MAX_TEXT_WARNINGS for TEXT. */
2832 text_warn (struct sfm_reader *r, struct text_record *text,
2833 const char *format, ...)
2835 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2839 va_start (args, format);
2840 sys_msg (r, text->start + text->pos, MW, format, args);
2846 text_get_token (struct text_record *text, struct substring delimiters,
2849 struct substring token;
2852 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2855 end = &ss_data (token)[ss_length (token)];
2856 if (delimiter != NULL)
2859 return ss_data (token);
2862 /* Reads a integer value expressed in decimal, then a space, then a string that
2863 consists of exactly as many bytes as specified by the integer, then a space,
2864 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2865 buffer (so the caller should not free the string). */
2867 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2875 while (text->pos < text->buffer.length)
2877 int c = text->buffer.string[text->pos];
2878 if (c < '0' || c > '9')
2880 n = (n * 10) + (c - '0');
2883 if (text->pos >= text->buffer.length || start == text->pos)
2885 sys_warn (r, text->start,
2886 _("Expecting digit at offset %zu in MRSETS record."),
2891 if (!text_match (text, ' '))
2893 sys_warn (r, text->start,
2894 _("Expecting space at offset %zu in MRSETS record."),
2899 if (text->pos + n > text->buffer.length)
2901 sys_warn (r, text->start,
2902 _("%zu-byte string starting at offset %zu "
2903 "exceeds record length %zu."),
2904 n, text->pos, text->buffer.length);
2908 s = &text->buffer.string[text->pos];
2911 sys_warn (r, text->start,
2912 _("Expecting space at offset %zu following %zu-byte string."),
2922 text_match (struct text_record *text, char c)
2924 if (text->buffer.string[text->pos] == c)
2933 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2934 inside the TEXT's string. */
2936 text_pos (const struct text_record *text)
2942 text_get_all (const struct text_record *text)
2944 return text->buffer.string;
2949 /* Displays a corruption message. */
2951 sys_msg (struct sfm_reader *r, off_t offset,
2952 int class, const char *format, va_list args)
2957 ds_init_empty (&text);
2959 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2960 fh_get_file_name (r->fh), (long long int) offset);
2962 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2963 ds_put_vformat (&text, format, args);
2965 m.category = msg_class_to_category (class);
2966 m.severity = msg_class_to_severity (class);
2972 m.text = ds_cstr (&text);
2977 /* Displays a warning for offset OFFSET in the file. */
2979 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2983 va_start (args, format);
2984 sys_msg (r, offset, MW, format, args);
2988 /* Displays an error for the current file position,
2989 marks it as in an error state,
2990 and aborts reading it using longjmp. */
2992 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2996 va_start (args, format);
2997 sys_msg (r, offset, ME, format, args);
3003 /* Reads BYTE_CNT bytes into BUF.
3004 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3005 Returns -1 if an I/O error or a partial read occurs.
3006 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3009 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3010 void *buf, size_t byte_cnt)
3012 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3013 r->pos += bytes_read;
3014 if (bytes_read == byte_cnt)
3016 else if (ferror (r->file))
3018 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3021 else if (!eof_is_ok || bytes_read != 0)
3023 sys_error (r, r->pos, _("Unexpected end of file."));
3030 /* Reads BYTE_CNT into BUF.
3031 Returns true if successful.
3032 Returns false upon I/O error or if end-of-file is encountered. */
3034 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3036 return read_bytes_internal (r, false, buf, byte_cnt) == 1;
3039 /* Reads BYTE_CNT bytes into BUF.
3040 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3041 Returns 0 if an immediate end-of-file is encountered.
3042 Returns -1 if an I/O error or a partial read occurs. */
3044 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3046 return read_bytes_internal (r, true, buf, byte_cnt);
3049 /* Reads a 32-bit signed integer from R and stores its value in host format in
3050 *X. Returns true if successful, otherwise false. */
3052 read_int (struct sfm_reader *r, int *x)
3055 if (read_bytes (r, integer, sizeof integer) != 1)
3057 *x = integer_get (r->integer_format, integer, sizeof integer);
3062 read_uint (struct sfm_reader *r, unsigned int *x)
3067 ok = read_int (r, &y);
3072 /* Reads a 64-bit signed integer from R and returns its value in
3075 read_int64 (struct sfm_reader *r, long long int *x)
3078 if (read_bytes (r, integer, sizeof integer) != 1)
3080 *x = integer_get (r->integer_format, integer, sizeof integer);
3084 /* Reads a 64-bit signed integer from R and returns its value in
3087 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3092 ok = read_int64 (r, &y);
3098 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
3100 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3104 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
3106 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3109 /* Reads exactly SIZE - 1 bytes into BUFFER
3110 and stores a null byte into BUFFER[SIZE - 1]. */
3112 read_string (struct sfm_reader *r, char *buffer, size_t size)
3117 ok = read_bytes (r, buffer, size - 1);
3119 buffer[size - 1] = '\0';
3123 /* Skips BYTES bytes forward in R. */
3125 skip_bytes (struct sfm_reader *r, size_t bytes)
3130 size_t chunk = MIN (sizeof buffer, bytes);
3131 if (!read_bytes (r, buffer, chunk))
3139 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3140 been replaced by LFs.
3142 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3143 files that use CR-only line ends in the file label and extra product
3146 fix_line_ends (const char *s)
3150 d = dst = xmalloc (strlen (s) + 1);
3169 read_ztrailer (struct sfm_reader *r,
3170 long long int zheader_ofs,
3171 long long int ztrailer_len);
3174 zalloc (voidpf pool_, uInt items, uInt size)
3176 struct pool *pool = pool_;
3178 return (!size || xalloc_oversized (items, size)
3180 : pool_malloc (pool, items * size));
3184 zfree (voidpf pool_, voidpf address)
3186 struct pool *pool = pool_;
3188 pool_free (pool, address);
3192 read_zheader (struct sfm_reader *r)
3195 long long int zheader_ofs;
3196 long long int ztrailer_ofs;
3197 long long int ztrailer_len;
3199 if (!read_int64 (r, &zheader_ofs)
3200 || !read_int64 (r, &ztrailer_ofs)
3201 || !read_int64 (r, &ztrailer_len))
3204 if (zheader_ofs != pos)
3206 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3207 "(expected %#llx)."),
3208 zheader_ofs, (long long int) pos);
3212 if (ztrailer_ofs < r->pos)
3214 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3219 if (ztrailer_len < 24 || ztrailer_len % 24)
3221 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3225 r->ztrailer_ofs = ztrailer_ofs;
3226 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3229 if (r->zin_buf == NULL)
3231 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3232 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3233 r->zstream.next_in = NULL;
3234 r->zstream.avail_in = 0;
3237 r->zstream.zalloc = zalloc;
3238 r->zstream.zfree = zfree;
3239 r->zstream.opaque = r->pool;
3241 return open_zstream (r);
3245 seek (struct sfm_reader *r, off_t offset)
3247 if (fseeko (r->file, offset, SEEK_SET))
3248 sys_error (r, 0, _("%s: seek failed (%s)."),
3249 fh_get_file_name (r->fh), strerror (errno));
3253 /* Performs some additional consistency checks on the ZLIB compressed data
3256 read_ztrailer (struct sfm_reader *r,
3257 long long int zheader_ofs,
3258 long long int ztrailer_len)
3260 long long int expected_uncmp_ofs;
3261 long long int expected_cmp_ofs;
3264 unsigned int block_size;
3265 unsigned int n_blocks;
3269 if (fstat (fileno (r->file), &s))
3271 sys_error (ME, 0, _("%s: stat failed (%s)."),
3272 fh_get_file_name (r->fh), strerror (errno));
3276 if (!S_ISREG (s.st_mode))
3278 /* We can't seek to the trailer and then back to the data in this file,
3279 so skip doing extra checks. */
3283 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3284 sys_warn (r, r->pos,
3285 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3286 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3288 seek (r, r->ztrailer_ofs);
3290 /* Read fixed header from ZLIB data trailer. */
3291 if (!read_int64 (r, &bias))
3293 if (-bias != r->bias)
3295 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3296 "file header bias (%.2f)."),
3301 if (!read_int64 (r, &zero))
3304 sys_warn (r, r->pos,
3305 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3307 if (!read_uint (r, &block_size))
3309 if (block_size != ZBLOCK_SIZE)
3310 sys_warn (r, r->pos,
3311 _("ZLIB trailer specifies unexpected %u-byte block size."),
3314 if (!read_uint (r, &n_blocks))
3316 if (n_blocks != (ztrailer_len - 24) / 24)
3318 sys_error (r, r->pos,
3319 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3321 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3325 expected_uncmp_ofs = zheader_ofs;
3326 expected_cmp_ofs = zheader_ofs + 24;
3327 for (i = 0; i < n_blocks; i++)
3329 off_t desc_ofs = r->pos;
3330 unsigned long long int uncompressed_ofs;
3331 unsigned long long int compressed_ofs;
3332 unsigned int uncompressed_size;
3333 unsigned int compressed_size;
3335 if (!read_uint64 (r, &uncompressed_ofs)
3336 || !read_uint64 (r, &compressed_ofs)
3337 || !read_uint (r, &uncompressed_size)
3338 || !read_uint (r, &compressed_size))
3341 if (uncompressed_ofs != expected_uncmp_ofs)
3343 sys_error (r, desc_ofs,
3344 _("ZLIB block descriptor %u reported uncompressed data "
3345 "offset %#llx, when %#llx was expected."),
3346 i, uncompressed_ofs, expected_uncmp_ofs);
3350 if (compressed_ofs != expected_cmp_ofs)
3352 sys_error (r, desc_ofs,
3353 _("ZLIB block descriptor %u reported compressed data "
3354 "offset %#llx, when %#llx was expected."),
3355 i, compressed_ofs, expected_cmp_ofs);
3359 if (i < n_blocks - 1)
3361 if (uncompressed_size != block_size)
3362 sys_warn (r, desc_ofs,
3363 _("ZLIB block descriptor %u reported block size %#x, "
3364 "when %#x was expected."),
3365 i, uncompressed_size, block_size);
3369 if (uncompressed_size > block_size)
3370 sys_warn (r, desc_ofs,
3371 _("ZLIB block descriptor %u reported block size %#x, "
3372 "when at most %#x was expected."),
3373 i, uncompressed_size, block_size);
3376 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3377 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3378 This code checks for an expansion of more than 14.3% plus 11
3380 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3382 sys_error (r, desc_ofs,
3383 _("ZLIB block descriptor %u reports compressed size %u "
3384 "and uncompressed size %u."),
3385 i, compressed_size, uncompressed_size);
3389 expected_uncmp_ofs += uncompressed_size;
3390 expected_cmp_ofs += compressed_size;
3393 if (expected_cmp_ofs != r->ztrailer_ofs)
3395 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3396 "would be expected from block descriptors."),
3397 r->ztrailer_ofs, expected_cmp_ofs);
3401 seek (r, zheader_ofs + 24);
3406 open_zstream (struct sfm_reader *r)
3410 r->zout_pos = r->zout_end = 0;
3411 error = inflateInit (&r->zstream);
3414 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3422 close_zstream (struct sfm_reader *r)
3426 error = inflateEnd (&r->zstream);
3429 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3437 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3439 uint8_t *buf = buf_;
3448 /* Use already inflated data if there is any. */
3449 if (r->zout_pos < r->zout_end)
3451 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3452 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3461 /* We need to inflate some more data.
3462 Get some more input data if we don't have any. */
3463 if (r->zstream.avail_in == 0)
3465 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3470 int retval = try_read_bytes (r, r->zin_buf, n);
3473 r->zstream.avail_in = n;
3474 r->zstream.next_in = r->zin_buf;
3478 /* Inflate the (remaining) input data. */
3479 r->zstream.avail_out = ZOUT_BUF_SIZE;
3480 r->zstream.next_out = r->zout_buf;
3481 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3483 r->zout_end = r->zstream.next_out - r->zout_buf;
3484 if (r->zout_end == 0)
3486 if (error != Z_STREAM_END)
3488 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3492 else if (!close_zstream (r) || !open_zstream (r))
3497 /* Process the output data and ignore 'error' for now. ZLIB will
3498 present it to us again on the next inflate() call. */
3504 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3506 if (r->compression == SFM_COMP_SIMPLE)
3507 return read_bytes (r, buf, byte_cnt);
3510 int retval = read_bytes_zlib (r, buf, byte_cnt);
3512 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3518 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3520 if (r->compression == SFM_COMP_SIMPLE)
3521 return try_read_bytes (r, buf, byte_cnt);
3523 return read_bytes_zlib (r, buf, byte_cnt);
3526 /* Reads a 64-bit floating-point number from R and returns its
3527 value in host format. */
3529 read_compressed_float (struct sfm_reader *r, double *d)
3533 if (!read_compressed_bytes (r, number, sizeof number))
3536 *d = float_get_double (r->float_format, number);
3540 static const struct casereader_class sys_file_casereader_class =
3542 sys_file_casereader_read,
3543 sys_file_casereader_destroy,