1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/message.h"
49 #include "libpspp/misc.h"
50 #include "libpspp/pool.h"
51 #include "libpspp/str.h"
52 #include "libpspp/stringi-set.h"
54 #include "gl/c-strtod.h"
55 #include "gl/c-ctype.h"
56 #include "gl/inttostr.h"
57 #include "gl/localcharset.h"
58 #include "gl/minmax.h"
59 #include "gl/unlocked-io.h"
60 #include "gl/xalloc.h"
61 #include "gl/xalloc-oversized.h"
65 #define _(msgid) gettext (msgid)
66 #define N_(msgid) (msgid)
70 /* subtypes 0-2 unknown */
71 EXT_INTEGER = 3, /* Machine integer info. */
72 EXT_FLOAT = 4, /* Machine floating-point info. */
73 EXT_VAR_SETS = 5, /* Variable sets. */
74 EXT_DATE = 6, /* DATE. */
75 EXT_MRSETS = 7, /* Multiple response sets. */
76 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
77 /* subtype 9 unknown */
78 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
79 EXT_DISPLAY = 11, /* Variable display parameters. */
80 /* subtype 12 unknown */
81 EXT_LONG_NAMES = 13, /* Long variable names. */
82 EXT_LONG_STRINGS = 14, /* Long strings. */
83 /* subtype 15 unknown */
84 EXT_NCASES = 16, /* Extended number of cases. */
85 EXT_FILE_ATTRS = 17, /* Data file attributes. */
86 EXT_VAR_ATTRS = 18, /* Variable attributes. */
87 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
88 EXT_ENCODING = 20, /* Character encoding. */
89 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
90 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
91 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
94 /* Fields from the top-level header record. */
95 struct sfm_header_record
97 char magic[5]; /* First 4 bytes of file, then null. */
98 int weight_idx; /* 0 if unweighted, otherwise a var index. */
99 int nominal_case_size; /* Number of var positions. */
101 /* These correspond to the members of struct sfm_file_info or a dictionary
102 but in the system file's encoding rather than ASCII. */
103 char creation_date[10]; /* "dd mmm yy". */
104 char creation_time[9]; /* "hh:mm:ss". */
105 char eye_catcher[61]; /* Eye-catcher string, then product name. */
106 char file_label[65]; /* File label. */
109 struct sfm_var_record
116 int missing_value_code;
119 struct variable *var;
122 struct sfm_value_label
128 struct sfm_value_label_record
131 struct sfm_value_label *labels;
132 unsigned int n_labels;
138 struct sfm_document_record
145 struct sfm_extension_record
147 int subtype; /* Record subtype. */
148 off_t pos; /* Starting offset in file. */
149 size_t size; /* Size of data elements. */
150 size_t count; /* Number of data elements. */
151 void *data; /* Contents. */
154 /* System file reader. */
157 /* Resource tracking. */
158 struct pool *pool; /* All system file state. */
161 struct sfm_read_info info;
162 struct sfm_header_record header;
163 struct sfm_var_record *vars;
165 struct sfm_value_label_record *labels;
167 struct sfm_document_record *document;
168 struct sfm_extension_record *extensions[32];
171 struct file_handle *fh; /* File handle. */
172 struct fh_lock *lock; /* Mutual exclusion for file handle. */
173 FILE *file; /* File stream. */
174 off_t pos; /* Position in file. */
175 bool error; /* I/O or corruption error? */
176 struct caseproto *proto; /* Format of output cases. */
179 enum integer_format integer_format; /* On-disk integer format. */
180 enum float_format float_format; /* On-disk floating point format. */
181 struct sfm_var *sfm_vars; /* Variables. */
182 size_t sfm_var_cnt; /* Number of variables. */
183 int case_cnt; /* Number of cases */
184 const char *encoding; /* String encoding. */
187 enum sfm_compression compression;
188 double bias; /* Compression bias, usually 100.0. */
189 uint8_t opcodes[8]; /* Current block of opcodes. */
190 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
191 bool corruption_warning; /* Warned about possible corruption? */
193 /* ZLIB decompression. */
194 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
195 #define ZIN_BUF_SIZE 4096
196 uint8_t *zin_buf; /* Inflation input buffer. */
197 #define ZOUT_BUF_SIZE 16384
198 uint8_t *zout_buf; /* Inflation output buffer. */
199 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
200 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
201 z_stream zstream; /* ZLIB inflater. */
204 static const struct casereader_class sys_file_casereader_class;
206 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
207 const struct sfm_var_record *,
210 static void sys_msg (struct sfm_reader *r, off_t, int class,
211 const char *format, va_list args)
212 PRINTF_FORMAT (4, 0);
213 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
214 PRINTF_FORMAT (3, 4);
215 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
216 PRINTF_FORMAT (3, 4);
218 static bool read_bytes (struct sfm_reader *, void *, size_t)
220 static int try_read_bytes (struct sfm_reader *, void *, size_t)
222 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
223 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
224 static bool read_int64 (struct sfm_reader *, long long int *)
226 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
228 static bool read_string (struct sfm_reader *, char *, size_t)
230 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
232 /* ZLIB compressed data handling. */
233 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
234 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
235 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
236 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
238 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
240 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
242 static bool read_compressed_float (struct sfm_reader *, double *)
245 static char *fix_line_ends (const char *);
247 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
248 static double parse_float (const struct sfm_reader *,
249 const void *data, size_t ofs);
251 static bool read_variable_record (struct sfm_reader *,
252 struct sfm_var_record *);
253 static bool read_value_label_record (struct sfm_reader *,
254 struct sfm_value_label_record *);
255 static struct sfm_document_record *read_document_record (struct sfm_reader *);
256 static bool read_extension_record (struct sfm_reader *, int subtype,
257 struct sfm_extension_record **);
258 static bool skip_extension_record (struct sfm_reader *, int subtype);
260 static struct text_record *open_text_record (
261 struct sfm_reader *, const struct sfm_extension_record *,
262 bool recode_to_utf8);
263 static void close_text_record (struct sfm_reader *,
264 struct text_record *);
265 static bool read_variable_to_value_pair (struct sfm_reader *,
267 struct text_record *,
268 struct variable **var, char **value);
269 static void text_warn (struct sfm_reader *r, struct text_record *text,
270 const char *format, ...)
271 PRINTF_FORMAT (3, 4);
272 static char *text_get_token (struct text_record *,
273 struct substring delimiters, char *delimiter);
274 static bool text_match (struct text_record *, char c);
275 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
276 struct text_record *,
277 struct substring delimiters,
279 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
280 struct text_record *,
281 struct substring delimiters,
283 static const char *text_parse_counted_string (struct sfm_reader *,
284 struct text_record *);
285 static size_t text_pos (const struct text_record *);
286 static const char *text_get_all (const struct text_record *);
288 /* Dictionary reader. */
296 static bool read_dictionary (struct sfm_reader *);
297 static bool read_record (struct sfm_reader *, int type,
298 size_t *allocated_vars, size_t *allocated_labels);
299 static bool read_header (struct sfm_reader *, struct sfm_read_info *,
300 struct sfm_header_record *);
301 static void parse_header (struct sfm_reader *,
302 const struct sfm_header_record *,
303 struct sfm_read_info *, struct dictionary *);
304 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
305 struct sfm_var_record *, size_t n);
306 static void parse_format_spec (struct sfm_reader *, off_t pos,
307 unsigned int format, enum which_format,
308 struct variable *, int *format_warning_cnt);
309 static void parse_document (struct dictionary *, struct sfm_document_record *);
310 static void parse_display_parameters (struct sfm_reader *,
311 const struct sfm_extension_record *,
312 struct dictionary *);
313 static bool parse_machine_integer_info (struct sfm_reader *,
314 const struct sfm_extension_record *,
315 struct sfm_read_info *);
316 static void parse_machine_float_info (struct sfm_reader *,
317 const struct sfm_extension_record *);
318 static void parse_extra_product_info (struct sfm_reader *,
319 const struct sfm_extension_record *,
320 struct sfm_read_info *);
321 static void parse_mrsets (struct sfm_reader *,
322 const struct sfm_extension_record *,
323 struct dictionary *);
324 static void parse_long_var_name_map (struct sfm_reader *,
325 const struct sfm_extension_record *,
326 struct dictionary *);
327 static bool parse_long_string_map (struct sfm_reader *,
328 const struct sfm_extension_record *,
329 struct dictionary *);
330 static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
331 const struct sfm_var_record *,
333 const struct sfm_value_label_record *);
334 static void parse_data_file_attributes (struct sfm_reader *,
335 const struct sfm_extension_record *,
336 struct dictionary *);
337 static void parse_variable_attributes (struct sfm_reader *,
338 const struct sfm_extension_record *,
339 struct dictionary *);
340 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
341 static bool parse_long_string_value_labels (struct sfm_reader *,
342 const struct sfm_extension_record *,
343 struct dictionary *);
344 static bool parse_long_string_missing_values (
345 struct sfm_reader *, const struct sfm_extension_record *,
346 struct dictionary *);
348 /* Frees the strings inside INFO. */
350 sfm_read_info_destroy (struct sfm_read_info *info)
354 free (info->creation_date);
355 free (info->creation_time);
356 free (info->product);
357 free (info->product_ext);
361 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
362 successful, otherwise NULL. */
364 sfm_open (struct file_handle *fh)
366 struct sfm_reader *r;
368 /* Create and initialize reader. */
369 r = xzalloc (sizeof *r);
370 r->pool = pool_create ();
371 pool_register (r->pool, free, r);
373 r->opcode_idx = sizeof r->opcodes;
375 /* TRANSLATORS: this fragment will be interpolated into
376 messages in fh_lock() that identify types of files. */
377 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
381 r->file = fn_open (fh_get_file_name (fh), "rb");
384 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
385 fh_get_file_name (r->fh), strerror (errno));
389 if (!read_dictionary (r))
399 read_dictionary (struct sfm_reader *r)
401 size_t allocated_vars;
402 size_t allocated_labels;
404 if (!read_header (r, &r->info, &r->header))
408 allocated_labels = 0;
413 if (!read_int (r, &type))
417 if (!read_record (r, type, &allocated_vars, &allocated_labels))
421 if (!skip_bytes (r, 4))
424 if (r->compression == SFM_COMP_ZLIB && !read_zheader (r))
431 read_record (struct sfm_reader *r, int type,
432 size_t *allocated_vars, size_t *allocated_labels)
439 if (r->n_vars >= *allocated_vars)
440 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
442 return read_variable_record (r, &r->vars[r->n_vars++]);
445 if (r->n_labels >= *allocated_labels)
446 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
448 return read_value_label_record (r, &r->labels[r->n_labels++]);
451 /* A Type 4 record is always immediately after a type 3 record,
452 so the code for type 3 records reads the type 4 record too. */
453 sys_error (r, r->pos, _("Misplaced type 4 record."));
457 if (r->document != NULL)
459 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
462 r->document = read_document_record (r);
463 return r->document != NULL;
466 if (!read_int (r, &subtype))
469 || subtype >= sizeof r->extensions / sizeof *r->extensions)
472 _("Unrecognized record type 7, subtype %d. Please "
473 "send a copy of this file, and the syntax which "
474 "created it to %s."),
475 subtype, PACKAGE_BUGREPORT);
476 return skip_extension_record (r, subtype);
478 else if (r->extensions[subtype] != NULL)
481 _("Record type 7, subtype %d found here has the same "
482 "type as the record found near offset 0x%llx. "
483 "Please send a copy of this file, and the syntax "
484 "which created it to %s."),
485 subtype, (long long int) r->extensions[subtype]->pos,
487 return skip_extension_record (r, subtype);
490 return read_extension_record (r, subtype, &r->extensions[subtype]);
493 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
500 /* Returns the character encoding obtained from R, or a null pointer if R
501 doesn't have an indication of its character encoding. */
503 sfm_get_encoding (const struct sfm_reader *r)
505 /* The EXT_ENCODING record is the best way to determine dictionary
507 if (r->extensions[EXT_ENCODING])
508 return r->extensions[EXT_ENCODING]->data;
510 /* But EXT_INTEGER is better than nothing as a fallback. */
511 if (r->extensions[EXT_INTEGER])
513 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
514 const char *encoding;
523 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
524 respectively. However, many files have character code 2 but data
525 which are clearly not ASCII. Therefore, ignore these values. */
532 encoding = sys_get_encoding_from_codepage (codepage);
533 if (encoding != NULL)
539 /* If the file magic number is EBCDIC then its character data is too. */
540 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
546 /* Decodes the dictionary read from R, saving it into into *DICT. Character
547 strings in R are decoded using ENCODING, or an encoding obtained from R if
548 ENCODING is null, or the locale encoding if R specifies no encoding.
550 If INFOP is non-null, then it receives additional info about the system
551 file, which the caller must eventually free with sfm_read_info_destroy()
552 when it is no longer needed.
554 This function consumes R. The caller must use it again later, even to
555 destroy it with sfm_close(). */
557 sfm_decode (struct sfm_reader *r, const char *encoding,
558 struct dictionary **dictp, struct sfm_read_info *infop)
560 struct dictionary *dict;
563 if (encoding == NULL)
565 encoding = sfm_get_encoding (r);
566 if (encoding == NULL)
567 encoding = locale_charset ();
570 dict = dict_create (encoding);
571 r->encoding = dict_get_encoding (dict);
573 /* These records don't use variables at all. */
574 if (r->document != NULL)
575 parse_document (dict, r->document);
577 if (r->extensions[EXT_INTEGER] != NULL
578 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
581 if (r->extensions[EXT_FLOAT] != NULL)
582 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
584 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
585 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
587 if (r->extensions[EXT_FILE_ATTRS] != NULL)
588 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
590 parse_header (r, &r->header, &r->info, dict);
592 /* Parse the variable records, the basis of almost everything else. */
593 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
596 /* Parse value labels and the weight variable immediately after the variable
597 records. These records use indexes into var_recs[], so we must parse them
598 before those indexes become invalidated by very long string variables. */
599 for (i = 0; i < r->n_labels; i++)
600 if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
602 if (r->header.weight_idx != 0)
604 struct variable *weight_var;
606 weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
607 r->header.weight_idx);
608 if (weight_var != NULL)
610 if (var_is_numeric (weight_var))
611 dict_set_weight (dict, weight_var);
613 sys_warn (r, -1, _("Ignoring string variable `%s' set "
614 "as weighting variable."),
615 var_get_name (weight_var));
619 if (r->extensions[EXT_DISPLAY] != NULL)
620 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
622 /* The following records use short names, so they need to be parsed before
623 parse_long_var_name_map() changes short names to long names. */
624 if (r->extensions[EXT_MRSETS] != NULL)
625 parse_mrsets (r, r->extensions[EXT_MRSETS], dict);
627 if (r->extensions[EXT_MRSETS2] != NULL)
628 parse_mrsets (r, r->extensions[EXT_MRSETS2], dict);
630 if (r->extensions[EXT_LONG_STRINGS] != NULL
631 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
634 /* Now rename variables to their long names. */
635 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
637 /* The following records use long names, so they need to follow renaming. */
638 if (r->extensions[EXT_VAR_ATTRS] != NULL)
640 parse_variable_attributes (r, r->extensions[EXT_VAR_ATTRS], dict);
642 /* Roles use the $@Role attribute. */
643 assign_variable_roles (r, dict);
646 if (r->extensions[EXT_LONG_LABELS] != NULL
647 && !parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS],
650 if (r->extensions[EXT_LONG_MISSING] != NULL
651 && !parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
655 /* Warn if the actual amount of data per case differs from the
656 amount that the header claims. SPSS version 13 gets this
657 wrong when very long strings are involved, so don't warn in
659 if (r->header.nominal_case_size != -1
660 && r->header.nominal_case_size != r->n_vars
661 && r->info.version_major != 13)
662 sys_warn (r, -1, _("File header claims %d variable positions but "
663 "%zu were read from file."),
664 r->header.nominal_case_size, r->n_vars);
666 /* Create an index of dictionary variable widths for
667 sfm_read_case to use. We cannot use the `struct variable's
668 from the dictionary we created, because the caller owns the
669 dictionary and may destroy or modify its variables. */
670 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
671 pool_register (r->pool, free, r->sfm_vars);
672 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
678 memset (&r->info, 0, sizeof r->info);
681 return casereader_create_sequential
683 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
684 &sys_file_casereader_class, r);
693 /* Closes R, which should have been returned by sfm_open() but not already
694 closed with sfm_decode() or this function.
695 Returns true if an I/O error has occurred on READER, false
698 sfm_close (struct sfm_reader *r)
707 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
709 msg (ME, _("Error closing system file `%s': %s."),
710 fh_get_file_name (r->fh), strerror (errno));
716 sfm_read_info_destroy (&r->info);
721 pool_destroy (r->pool);
726 /* Destroys READER. */
728 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
730 struct sfm_reader *r = r_;
734 /* Returns true if FILE is an SPSS system file,
737 sfm_detect (FILE *file)
741 if (fread (magic, 4, 1, file) != 1)
745 return (!strcmp (ASCII_MAGIC, magic)
746 || !strcmp (ASCII_ZMAGIC, magic)
747 || !strcmp (EBCDIC_MAGIC, magic));
750 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
751 except for the string fields in *INFO, which parse_header() will initialize
752 later once the file's encoding is known. */
754 read_header (struct sfm_reader *r, struct sfm_read_info *info,
755 struct sfm_header_record *header)
757 uint8_t raw_layout_code[4];
762 if (!read_string (r, header->magic, sizeof header->magic)
763 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
766 if (!strcmp (ASCII_MAGIC, header->magic)
767 || !strcmp (EBCDIC_MAGIC, header->magic))
769 else if (!strcmp (ASCII_ZMAGIC, header->magic))
773 sys_error (r, 0, _("This is not an SPSS system file."));
777 /* Identify integer format. */
778 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
780 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
782 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
784 || (r->integer_format != INTEGER_MSB_FIRST
785 && r->integer_format != INTEGER_LSB_FIRST))
787 sys_error (r, 64, _("This is not an SPSS system file."));
791 if (!read_int (r, &header->nominal_case_size))
794 if (header->nominal_case_size < 0
795 || header->nominal_case_size > INT_MAX / 16)
796 header->nominal_case_size = -1;
798 if (!read_int (r, &compressed))
803 r->compression = SFM_COMP_NONE;
804 else if (compressed == 1)
805 r->compression = SFM_COMP_SIMPLE;
806 else if (compressed != 0)
808 sys_error (r, 0, "System file header has invalid compression "
809 "value %d.", compressed);
816 r->compression = SFM_COMP_ZLIB;
819 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
820 "compression value %d.", compressed);
825 if (!read_int (r, &header->weight_idx))
828 if (!read_int (r, &r->case_cnt))
830 if ( r->case_cnt > INT_MAX / 2)
833 /* Identify floating-point format and obtain compression bias. */
834 if (!read_bytes (r, raw_bias, sizeof raw_bias))
836 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
838 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
840 if (memcmp (raw_bias, zero_bias, 8))
841 sys_warn (r, r->pos - 8,
842 _("Compression bias is not the usual "
843 "value of 100, or system file uses unrecognized "
844 "floating-point format."));
847 /* Some software is known to write all-zeros to this
848 field. Such software also writes floating-point
849 numbers in the format that we expect by default
850 (it seems that all software most likely does, in
851 reality), so don't warn in this case. */
854 if (r->integer_format == INTEGER_MSB_FIRST)
855 r->float_format = FLOAT_IEEE_DOUBLE_BE;
857 r->float_format = FLOAT_IEEE_DOUBLE_LE;
859 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
861 if (!read_string (r, header->creation_date, sizeof header->creation_date)
862 || !read_string (r, header->creation_time, sizeof header->creation_time)
863 || !read_string (r, header->file_label, sizeof header->file_label)
864 || !skip_bytes (r, 3))
867 info->integer_format = r->integer_format;
868 info->float_format = r->float_format;
869 info->compression = r->compression;
870 info->case_cnt = r->case_cnt;
875 /* Reads a variable (type 2) record from R into RECORD. */
877 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
879 int has_variable_label;
881 memset (record, 0, sizeof *record);
883 record->pos = r->pos;
884 if (!read_int (r, &record->width)
885 || !read_int (r, &has_variable_label)
886 || !read_int (r, &record->missing_value_code)
887 || !read_int (r, &record->print_format)
888 || !read_int (r, &record->write_format)
889 || !read_bytes (r, record->name, sizeof record->name))
892 if (has_variable_label == 1)
894 enum { MAX_LABEL_LEN = 255 };
895 unsigned int len, read_len;
897 if (!read_uint (r, &len))
900 /* Read up to MAX_LABEL_LEN bytes of label. */
901 read_len = MIN (MAX_LABEL_LEN, len);
902 record->label = pool_malloc (r->pool, read_len + 1);
903 if (!read_string (r, record->label, read_len + 1))
906 /* Skip unread label bytes. */
907 if (!skip_bytes (r, len - read_len))
910 /* Skip label padding up to multiple of 4 bytes. */
911 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
914 else if (has_variable_label != 0)
916 sys_error (r, record->pos,
917 _("Variable label indicator field is not 0 or 1."));
921 /* Set missing values. */
922 if (record->missing_value_code != 0)
924 int code = record->missing_value_code;
925 if (record->width == 0)
927 if (code < -3 || code > 3 || code == -1)
929 sys_error (r, record->pos,
930 _("Numeric missing value indicator field is not "
931 "-3, -2, 0, 1, 2, or 3."));
937 if (code < 1 || code > 3)
939 sys_error (r, record->pos,
940 _("String missing value indicator field is not "
946 if (!read_bytes (r, record->missing, 8 * abs (code)))
953 /* Reads value labels from R into RECORD. */
955 read_value_label_record (struct sfm_reader *r,
956 struct sfm_value_label_record *record)
961 /* Read type 3 record. */
962 record->pos = r->pos;
963 if (!read_uint (r, &record->n_labels))
965 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
967 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
971 record->labels = pool_nmalloc (r->pool, record->n_labels,
972 sizeof *record->labels);
973 for (i = 0; i < record->n_labels; i++)
975 struct sfm_value_label *label = &record->labels[i];
976 unsigned char label_len;
979 if (!read_bytes (r, label->value, sizeof label->value))
982 /* Read label length. */
983 if (!read_bytes (r, &label_len, sizeof label_len))
985 padded_len = ROUND_UP (label_len + 1, 8);
987 /* Read label, padding. */
988 label->label = pool_malloc (r->pool, padded_len + 1);
989 if (!read_bytes (r, label->label, padded_len - 1))
991 label->label[label_len] = '\0';
994 /* Read record type of type 4 record. */
995 if (!read_int (r, &type))
999 sys_error (r, r->pos - 4,
1000 _("Variable index record (type 4) does not immediately "
1001 "follow value label record (type 3) as it should."));
1005 /* Read number of variables associated with value label from type 4
1007 if (!read_uint (r, &record->n_vars))
1009 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1011 sys_error (r, r->pos - 4,
1012 _("Number of variables associated with a value label (%zu) "
1013 "is not between 1 and the number of variables (%zu)."),
1014 record->n_vars, r->n_vars);
1018 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1019 for (i = 0; i < record->n_vars; i++)
1020 if (!read_int (r, &record->vars[i]))
1026 /* Reads a document record from R and returns it. */
1027 static struct sfm_document_record *
1028 read_document_record (struct sfm_reader *r)
1030 struct sfm_document_record *record;
1033 record = pool_malloc (r->pool, sizeof *record);
1034 record->pos = r->pos;
1036 if (!read_int (r, &n_lines))
1038 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1040 sys_error (r, record->pos,
1041 _("Number of document lines (%d) "
1042 "must be greater than 0 and less than %d."),
1043 n_lines, INT_MAX / DOC_LINE_LENGTH);
1047 record->n_lines = n_lines;
1048 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1049 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1056 read_extension_record_header (struct sfm_reader *r, int subtype,
1057 struct sfm_extension_record *record)
1059 record->subtype = subtype;
1060 record->pos = r->pos;
1061 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1064 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1065 allows an extra byte for a null terminator, used by some
1066 extension processing routines. */
1067 if (record->size != 0
1068 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1070 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1078 /* Reads an extension record from R into RECORD. */
1080 read_extension_record (struct sfm_reader *r, int subtype,
1081 struct sfm_extension_record **recordp)
1083 struct extension_record_type
1090 static const struct extension_record_type types[] =
1092 /* Implemented record types. */
1093 { EXT_INTEGER, 4, 8 },
1094 { EXT_FLOAT, 8, 3 },
1095 { EXT_MRSETS, 1, 0 },
1096 { EXT_PRODUCT_INFO, 1, 0 },
1097 { EXT_DISPLAY, 4, 0 },
1098 { EXT_LONG_NAMES, 1, 0 },
1099 { EXT_LONG_STRINGS, 1, 0 },
1100 { EXT_NCASES, 8, 2 },
1101 { EXT_FILE_ATTRS, 1, 0 },
1102 { EXT_VAR_ATTRS, 1, 0 },
1103 { EXT_MRSETS2, 1, 0 },
1104 { EXT_ENCODING, 1, 0 },
1105 { EXT_LONG_LABELS, 1, 0 },
1106 { EXT_LONG_MISSING, 1, 0 },
1108 /* Ignored record types. */
1109 { EXT_VAR_SETS, 0, 0 },
1111 { EXT_DATA_ENTRY, 0, 0 },
1112 { EXT_DATAVIEW, 0, 0 },
1115 const struct extension_record_type *type;
1116 struct sfm_extension_record *record;
1120 record = pool_malloc (r->pool, sizeof *record);
1121 if (!read_extension_record_header (r, subtype, record))
1123 n_bytes = record->count * record->size;
1125 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1126 if (subtype == type->subtype)
1128 if (type->size > 0 && record->size != type->size)
1129 sys_warn (r, record->pos,
1130 _("Record type 7, subtype %d has bad size %zu "
1131 "(expected %d)."), subtype, record->size, type->size);
1132 else if (type->count > 0 && record->count != type->count)
1133 sys_warn (r, record->pos,
1134 _("Record type 7, subtype %d has bad count %zu "
1135 "(expected %d)."), subtype, record->count, type->count);
1136 else if (type->count == 0 && type->size == 0)
1138 /* Ignore this record. */
1142 char *data = pool_malloc (r->pool, n_bytes + 1);
1143 data[n_bytes] = '\0';
1145 record->data = data;
1146 if (!read_bytes (r, record->data, n_bytes))
1155 sys_warn (r, record->pos,
1156 _("Unrecognized record type 7, subtype %d. Please send a "
1157 "copy of this file, and the syntax which created it to %s."),
1158 subtype, PACKAGE_BUGREPORT);
1161 return skip_bytes (r, n_bytes);
1165 skip_extension_record (struct sfm_reader *r, int subtype)
1167 struct sfm_extension_record record;
1169 return (read_extension_record_header (r, subtype, &record)
1170 && skip_bytes (r, record.count * record.size));
1174 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1175 struct sfm_read_info *info, struct dictionary *dict)
1177 const char *dict_encoding = dict_get_encoding (dict);
1178 struct substring product;
1179 struct substring label;
1182 /* Convert file label to UTF-8 and put it into DICT. */
1183 label = recode_substring_pool ("UTF-8", dict_encoding,
1184 ss_cstr (header->file_label), r->pool);
1185 ss_trim (&label, ss_cstr (" "));
1186 label.string[label.length] = '\0';
1187 fixed_label = fix_line_ends (label.string);
1188 dict_set_label (dict, fixed_label);
1191 /* Put creation date and time in UTF-8 into INFO. */
1192 info->creation_date = recode_string ("UTF-8", dict_encoding,
1193 header->creation_date, -1);
1194 info->creation_time = recode_string ("UTF-8", dict_encoding,
1195 header->creation_time, -1);
1197 /* Put product name into INFO, dropping eye-catcher string if present. */
1198 product = recode_substring_pool ("UTF-8", dict_encoding,
1199 ss_cstr (header->eye_catcher), r->pool);
1200 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1201 ss_trim (&product, ss_cstr (" "));
1202 info->product = ss_xstrdup (product);
1205 /* Reads a variable (type 2) record from R and adds the
1206 corresponding variable to DICT.
1207 Also skips past additional variable records for long string
1210 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1211 struct sfm_var_record *var_recs, size_t n_var_recs)
1213 const char *dict_encoding = dict_get_encoding (dict);
1214 struct sfm_var_record *rec;
1217 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1219 struct variable *var;
1224 name = recode_string_pool ("UTF-8", dict_encoding,
1225 rec->name, 8, r->pool);
1226 name[strcspn (name, " ")] = '\0';
1228 if (!dict_id_is_valid (dict, name, false)
1229 || name[0] == '$' || name[0] == '#')
1231 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1235 if (rec->width < 0 || rec->width > 255)
1237 sys_error (r, rec->pos,
1238 _("Bad width %d for variable %s."), rec->width, name);
1242 var = rec->var = dict_create_var (dict, name, rec->width);
1245 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1246 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1249 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1253 /* Set the short name the same as the long name. */
1254 var_set_short_name (var, 0, name);
1256 /* Get variable label, if any. */
1261 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1262 rec->label, -1, r->pool);
1263 var_set_label (var, utf8_label, false);
1266 /* Set missing values. */
1267 if (rec->missing_value_code != 0)
1269 int width = var_get_width (var);
1270 struct missing_values mv;
1272 mv_init_pool (r->pool, &mv, width);
1273 if (var_is_numeric (var))
1275 bool has_range = rec->missing_value_code < 0;
1276 int n_discrete = (has_range
1277 ? rec->missing_value_code == -3
1278 : rec->missing_value_code);
1283 double low = parse_float (r, rec->missing, 0);
1284 double high = parse_float (r, rec->missing, 8);
1286 /* Deal with SPSS 21 change in representation. */
1290 mv_add_range (&mv, low, high);
1294 for (i = 0; i < n_discrete; i++)
1296 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1304 value_init_pool (r->pool, &value, width);
1305 value_set_missing (&value, width);
1306 for (i = 0; i < rec->missing_value_code; i++)
1307 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1309 var_set_missing_values (var, &mv);
1313 parse_format_spec (r, rec->pos + 12, rec->print_format,
1314 PRINT_FORMAT, var, &n_warnings);
1315 parse_format_spec (r, rec->pos + 16, rec->write_format,
1316 WRITE_FORMAT, var, &n_warnings);
1318 /* Account for values.
1319 Skip long string continuation records, if any. */
1320 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1321 for (i = 1; i < n_values; i++)
1322 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1324 sys_error (r, rec->pos, _("Missing string continuation record."));
1333 /* Translates the format spec from sysfile format to internal
1336 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1337 enum which_format which, struct variable *v,
1340 const int max_warnings = 8;
1341 uint8_t raw_type = format >> 16;
1342 uint8_t w = format >> 8;
1351 ok = (fmt_from_io (raw_type, &f.type)
1352 && fmt_check_output (&f)
1353 && fmt_check_width_compat (&f, var_get_width (v)));
1358 if (which == PRINT_FORMAT)
1359 var_set_print_format (v, &f);
1361 var_set_write_format (v, &f);
1363 else if (format == 0)
1365 /* Actually observed in the wild. No point in warning about it. */
1367 else if (++*n_warnings <= max_warnings)
1369 if (which == PRINT_FORMAT)
1370 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1372 var_get_name (v), var_get_width (v), format);
1374 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1376 var_get_name (v), var_get_width (v), format);
1378 if (*n_warnings == max_warnings)
1379 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1384 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1388 for (p = record->documents;
1389 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1390 p += DOC_LINE_LENGTH)
1392 struct substring line;
1394 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1395 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1396 ss_rtrim (&line, ss_cstr (" "));
1397 line.string[line.length] = '\0';
1399 dict_add_document_line (dict, line.string, false);
1405 /* Parses record type 7, subtype 3. */
1407 parse_machine_integer_info (struct sfm_reader *r,
1408 const struct sfm_extension_record *record,
1409 struct sfm_read_info *info)
1411 int float_representation, expected_float_format;
1412 int integer_representation, expected_integer_format;
1414 /* Save version info. */
1415 info->version_major = parse_int (r, record->data, 0);
1416 info->version_minor = parse_int (r, record->data, 4);
1417 info->version_revision = parse_int (r, record->data, 8);
1419 /* Check floating point format. */
1420 float_representation = parse_int (r, record->data, 16);
1421 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1422 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1423 expected_float_format = 1;
1424 else if (r->float_format == FLOAT_Z_LONG)
1425 expected_float_format = 2;
1426 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1427 expected_float_format = 3;
1430 if (float_representation != expected_float_format)
1432 sys_error (r, record->pos,
1433 _("Floating-point representation indicated by "
1434 "system file (%d) differs from expected (%d)."),
1435 float_representation, expected_float_format);
1439 /* Check integer format. */
1440 integer_representation = parse_int (r, record->data, 24);
1441 if (r->integer_format == INTEGER_MSB_FIRST)
1442 expected_integer_format = 1;
1443 else if (r->integer_format == INTEGER_LSB_FIRST)
1444 expected_integer_format = 2;
1447 if (integer_representation != expected_integer_format)
1448 sys_warn (r, record->pos,
1449 _("Integer format indicated by system file (%d) "
1450 "differs from expected (%d)."),
1451 integer_representation, expected_integer_format);
1456 /* Parses record type 7, subtype 4. */
1458 parse_machine_float_info (struct sfm_reader *r,
1459 const struct sfm_extension_record *record)
1461 double sysmis = parse_float (r, record->data, 0);
1462 double highest = parse_float (r, record->data, 8);
1463 double lowest = parse_float (r, record->data, 16);
1465 if (sysmis != SYSMIS)
1466 sys_warn (r, record->pos,
1467 _("File specifies unexpected value %g (%a) as %s, "
1468 "instead of %g (%a)."),
1469 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1471 if (highest != HIGHEST)
1472 sys_warn (r, record->pos,
1473 _("File specifies unexpected value %g (%a) as %s, "
1474 "instead of %g (%a)."),
1475 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1477 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1478 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1479 appears in a context (missing values) where SYSMIS cannot. */
1480 if (lowest != LOWEST && lowest != SYSMIS)
1481 sys_warn (r, record->pos,
1482 _("File specifies unexpected value %g (%a) as %s, "
1483 "instead of %g (%a) or %g (%a)."),
1484 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1487 /* Parses record type 7, subtype 10. */
1489 parse_extra_product_info (struct sfm_reader *r,
1490 const struct sfm_extension_record *record,
1491 struct sfm_read_info *info)
1493 struct text_record *text;
1495 text = open_text_record (r, record, true);
1496 info->product_ext = fix_line_ends (text_get_all (text));
1497 close_text_record (r, text);
1500 /* Parses record type 7, subtype 7 or 19. */
1502 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1503 struct dictionary *dict)
1505 struct text_record *text;
1506 struct mrset *mrset;
1508 text = open_text_record (r, record, false);
1511 const char *counted = NULL;
1514 struct stringi_set var_names;
1515 size_t allocated_vars;
1519 /* Skip extra line feeds if present. */
1520 while (text_match (text, '\n'))
1523 mrset = xzalloc (sizeof *mrset);
1525 name = text_get_token (text, ss_cstr ("="), NULL);
1528 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1530 if (mrset->name[0] != '$')
1532 sys_warn (r, record->pos,
1533 _("`%s' does not begin with `$' at offset %zu "
1534 "in MRSETS record."), mrset->name, text_pos (text));
1538 if (text_match (text, 'C'))
1540 mrset->type = MRSET_MC;
1541 if (!text_match (text, ' '))
1543 sys_warn (r, record->pos,
1544 _("Missing space following `%c' at offset %zu "
1545 "in MRSETS record."), 'C', text_pos (text));
1549 else if (text_match (text, 'D'))
1551 mrset->type = MRSET_MD;
1552 mrset->cat_source = MRSET_VARLABELS;
1554 else if (text_match (text, 'E'))
1558 mrset->type = MRSET_MD;
1559 mrset->cat_source = MRSET_COUNTEDVALUES;
1560 if (!text_match (text, ' '))
1562 sys_warn (r, record->pos,
1563 _("Missing space following `%c' at offset %zu "
1564 "in MRSETS record."), 'E', text_pos (text));
1568 number = text_get_token (text, ss_cstr (" "), NULL);
1569 if (!strcmp (number, "11"))
1570 mrset->label_from_var_label = true;
1571 else if (strcmp (number, "1"))
1572 sys_warn (r, record->pos,
1573 _("Unexpected label source value `%s' following `E' "
1574 "at offset %zu in MRSETS record."),
1575 number, text_pos (text));
1579 sys_warn (r, record->pos,
1580 _("Missing `C', `D', or `E' at offset %zu "
1581 "in MRSETS record."),
1586 if (mrset->type == MRSET_MD)
1588 counted = text_parse_counted_string (r, text);
1589 if (counted == NULL)
1593 label = text_parse_counted_string (r, text);
1596 if (label[0] != '\0')
1597 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1599 stringi_set_init (&var_names);
1604 const char *raw_var_name;
1605 struct variable *var;
1608 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1609 if (raw_var_name == NULL)
1611 if (delimiter != '\n')
1612 sys_warn (r, record->pos,
1613 _("Missing new-line parsing variable names "
1614 "at offset %zu in MRSETS record."),
1618 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1620 var = dict_lookup_var (dict, var_name);
1626 if (!stringi_set_insert (&var_names, var_name))
1628 sys_warn (r, record->pos,
1629 _("Duplicate variable name %s "
1630 "at offset %zu in MRSETS record."),
1631 var_name, text_pos (text));
1637 if (mrset->label == NULL && mrset->label_from_var_label
1638 && var_has_label (var))
1639 mrset->label = xstrdup (var_get_label (var));
1642 && var_get_type (var) != var_get_type (mrset->vars[0]))
1644 sys_warn (r, record->pos,
1645 _("MRSET %s contains both string and "
1646 "numeric variables."), name);
1649 width = MIN (width, var_get_width (var));
1651 if (mrset->n_vars >= allocated_vars)
1652 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1653 sizeof *mrset->vars);
1654 mrset->vars[mrset->n_vars++] = var;
1656 while (delimiter != '\n');
1658 if (mrset->n_vars < 2)
1660 sys_warn (r, record->pos,
1661 _("MRSET %s has only %zu variables."), mrset->name,
1663 mrset_destroy (mrset);
1664 stringi_set_destroy (&var_names);
1668 if (mrset->type == MRSET_MD)
1670 mrset->width = width;
1671 value_init (&mrset->counted, width);
1673 mrset->counted.f = c_strtod (counted, NULL);
1675 value_copy_str_rpad (&mrset->counted, width,
1676 (const uint8_t *) counted, ' ');
1679 dict_add_mrset (dict, mrset);
1681 stringi_set_destroy (&var_names);
1683 mrset_destroy (mrset);
1684 close_text_record (r, text);
1687 /* Read record type 7, subtype 11, which specifies how variables
1688 should be displayed in GUI environments. */
1690 parse_display_parameters (struct sfm_reader *r,
1691 const struct sfm_extension_record *record,
1692 struct dictionary *dict)
1694 bool includes_width;
1695 bool warned = false;
1700 n_vars = dict_get_var_cnt (dict);
1701 if (record->count == 3 * n_vars)
1702 includes_width = true;
1703 else if (record->count == 2 * n_vars)
1704 includes_width = false;
1707 sys_warn (r, record->pos,
1708 _("Extension 11 has bad count %zu (for %zu variables)."),
1709 record->count, n_vars);
1714 for (i = 0; i < n_vars; ++i)
1716 struct variable *v = dict_get_var (dict, i);
1717 int measure, width, align;
1719 measure = parse_int (r, record->data, ofs);
1724 width = parse_int (r, record->data, ofs);
1730 align = parse_int (r, record->data, ofs);
1733 /* SPSS sometimes seems to set variables' measure to zero. */
1737 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1740 sys_warn (r, record->pos,
1741 _("Invalid variable display parameters for variable "
1742 "%zu (%s). Default parameters substituted."),
1743 i, var_get_name (v));
1748 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1749 : measure == 2 ? MEASURE_ORDINAL
1751 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1752 : align == 1 ? ALIGN_RIGHT
1755 /* Older versions (SPSS 9.0) sometimes set the display
1756 width to zero. This causes confusion in the GUI, so
1757 only set the width if it is nonzero. */
1759 var_set_display_width (v, width);
1764 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1765 const char *new_name)
1767 size_t n_short_names;
1771 /* Renaming a variable may clear its short names, but we
1772 want to retain them, so we save them and re-set them
1774 n_short_names = var_get_short_name_cnt (var);
1775 short_names = xnmalloc (n_short_names, sizeof *short_names);
1776 for (i = 0; i < n_short_names; i++)
1778 const char *s = var_get_short_name (var, i);
1779 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1782 /* Set long name. */
1783 dict_rename_var (dict, var, new_name);
1785 /* Restore short names. */
1786 for (i = 0; i < n_short_names; i++)
1788 var_set_short_name (var, i, short_names[i]);
1789 free (short_names[i]);
1794 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1795 to each short name. Modifies variable names in DICT accordingly. */
1797 parse_long_var_name_map (struct sfm_reader *r,
1798 const struct sfm_extension_record *record,
1799 struct dictionary *dict)
1801 struct text_record *text;
1802 struct variable *var;
1807 /* There are no long variable names. Use the short variable names,
1808 converted to lowercase, as the long variable names. */
1811 for (i = 0; i < dict_get_var_cnt (dict); i++)
1813 struct variable *var = dict_get_var (dict, i);
1816 new_name = utf8_to_lower (var_get_name (var));
1817 rename_var_and_save_short_names (dict, var, new_name);
1824 /* Rename each of the variables, one by one. (In a correctly constructed
1825 system file, this cannot create any intermediate duplicate variable names,
1826 because all of the new variable names are longer than any of the old
1827 variable names and thus there cannot be any overlaps.) */
1828 text = open_text_record (r, record, true);
1829 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1831 /* Validate long name. */
1832 if (!dict_id_is_valid (dict, long_name, false))
1834 sys_warn (r, record->pos,
1835 _("Long variable mapping from %s to invalid "
1836 "variable name `%s'."),
1837 var_get_name (var), long_name);
1841 /* Identify any duplicates. */
1842 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1843 && dict_lookup_var (dict, long_name) != NULL)
1845 sys_warn (r, record->pos,
1846 _("Duplicate long variable name `%s'."), long_name);
1850 rename_var_and_save_short_names (dict, var, long_name);
1852 close_text_record (r, text);
1855 /* Reads record type 7, subtype 14, which gives the real length
1856 of each very long string. Rearranges DICT accordingly. */
1858 parse_long_string_map (struct sfm_reader *r,
1859 const struct sfm_extension_record *record,
1860 struct dictionary *dict)
1862 struct text_record *text;
1863 struct variable *var;
1866 text = open_text_record (r, record, true);
1867 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1869 size_t idx = var_get_dict_index (var);
1875 length = strtol (length_s, NULL, 10);
1876 if (length < 1 || length > MAX_STRING)
1878 sys_warn (r, record->pos,
1879 _("%s listed as string of invalid length %s "
1880 "in very long string record."),
1881 var_get_name (var), length_s);
1885 /* Check segments. */
1886 segment_cnt = sfm_width_to_segments (length);
1887 if (segment_cnt == 1)
1889 sys_warn (r, record->pos,
1890 _("%s listed in very long string record with width %s, "
1891 "which requires only one segment."),
1892 var_get_name (var), length_s);
1895 if (idx + segment_cnt > dict_get_var_cnt (dict))
1897 sys_error (r, record->pos,
1898 _("Very long string %s overflows dictionary."),
1899 var_get_name (var));
1903 /* Get the short names from the segments and check their
1905 for (i = 0; i < segment_cnt; i++)
1907 struct variable *seg = dict_get_var (dict, idx + i);
1908 int alloc_width = sfm_segment_alloc_width (length, i);
1909 int width = var_get_width (seg);
1912 var_set_short_name (var, i, var_get_short_name (seg, 0));
1913 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1915 sys_error (r, record->pos,
1916 _("Very long string with width %ld has segment %d "
1917 "of width %d (expected %d)."),
1918 length, i, width, alloc_width);
1922 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1923 var_set_width (var, length);
1925 close_text_record (r, text);
1926 dict_compact_values (dict);
1932 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1933 const struct sfm_var_record *var_recs, size_t n_var_recs,
1934 const struct sfm_value_label_record *record)
1936 struct variable **vars;
1940 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
1941 for (i = 0; i < record->n_labels; i++)
1942 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1943 record->labels[i].label, -1,
1946 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1947 for (i = 0; i < record->n_vars; i++)
1949 vars[i] = lookup_var_by_index (r, record->pos,
1950 var_recs, n_var_recs, record->vars[i]);
1951 if (vars[i] == NULL)
1955 for (i = 1; i < record->n_vars; i++)
1956 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1958 sys_error (r, record->pos,
1959 _("Variables associated with value label are not all of "
1960 "identical type. Variable %s is %s, but variable "
1962 var_get_name (vars[0]),
1963 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1964 var_get_name (vars[i]),
1965 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1969 for (i = 0; i < record->n_vars; i++)
1971 struct variable *var = vars[i];
1975 width = var_get_width (var);
1978 sys_error (r, record->pos,
1979 _("Value labels may not be added to long string "
1980 "variables (e.g. %s) using records types 3 and 4."),
1981 var_get_name (var));
1985 for (j = 0; j < record->n_labels; j++)
1987 struct sfm_value_label *label = &record->labels[j];
1990 value_init (&value, width);
1992 value.f = parse_float (r, label->value, 0);
1994 memcpy (value_str_rw (&value, width), label->value, width);
1996 if (!var_add_value_label (var, &value, utf8_labels[j]))
1998 if (var_is_numeric (var))
1999 sys_warn (r, record->pos,
2000 _("Duplicate value label for %g on %s."),
2001 value.f, var_get_name (var));
2003 sys_warn (r, record->pos,
2004 _("Duplicate value label for `%.*s' on %s."),
2005 width, value_str (&value, width),
2006 var_get_name (var));
2009 value_destroy (&value, width);
2013 pool_free (r->pool, vars);
2014 for (i = 0; i < record->n_labels; i++)
2015 pool_free (r->pool, utf8_labels[i]);
2016 pool_free (r->pool, utf8_labels);
2021 static struct variable *
2022 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2023 const struct sfm_var_record *var_recs, size_t n_var_recs,
2026 const struct sfm_var_record *rec;
2028 if (idx < 1 || idx > n_var_recs)
2030 sys_error (r, offset,
2031 _("Variable index %d not in valid range 1...%zu."),
2036 rec = &var_recs[idx - 1];
2037 if (rec->var == NULL)
2039 sys_error (r, offset,
2040 _("Variable index %d refers to long string continuation."),
2048 /* Parses a set of custom attributes from TEXT into ATTRS.
2049 ATTRS may be a null pointer, in which case the attributes are
2050 read but discarded. */
2052 parse_attributes (struct sfm_reader *r, struct text_record *text,
2053 struct attrset *attrs)
2057 struct attribute *attr;
2061 /* Parse the key. */
2062 key = text_get_token (text, ss_cstr ("("), NULL);
2066 attr = attribute_create (key);
2067 for (index = 1; ; index++)
2069 /* Parse the value. */
2073 value = text_get_token (text, ss_cstr ("\n"), NULL);
2076 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2081 length = strlen (value);
2082 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
2084 value[length - 1] = '\0';
2085 attribute_add_value (attr, value + 1);
2090 _("Attribute value %s[%d] is not quoted: %s."),
2092 attribute_add_value (attr, value);
2095 /* Was this the last value for this attribute? */
2096 if (text_match (text, ')'))
2100 attrset_add (attrs, attr);
2102 attribute_destroy (attr);
2104 while (!text_match (text, '/'));
2107 /* Reads record type 7, subtype 17, which lists custom
2108 attributes on the data file. */
2110 parse_data_file_attributes (struct sfm_reader *r,
2111 const struct sfm_extension_record *record,
2112 struct dictionary *dict)
2114 struct text_record *text = open_text_record (r, record, true);
2115 parse_attributes (r, text, dict_get_attributes (dict));
2116 close_text_record (r, text);
2119 /* Parses record type 7, subtype 18, which lists custom
2120 attributes on individual variables. */
2122 parse_variable_attributes (struct sfm_reader *r,
2123 const struct sfm_extension_record *record,
2124 struct dictionary *dict)
2126 struct text_record *text;
2127 struct variable *var;
2129 text = open_text_record (r, record, true);
2130 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2131 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2132 close_text_record (r, text);
2136 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2138 size_t n_warnings = 0;
2141 for (i = 0; i < dict_get_var_cnt (dict); i++)
2143 struct variable *var = dict_get_var (dict, i);
2144 struct attrset *attrs = var_get_attributes (var);
2145 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2148 int value = atoi (attribute_get_value (attr, 0));
2170 role = ROLE_PARTITION;
2179 if (n_warnings++ == 0)
2180 sys_warn (r, -1, _("Invalid role for variable %s."),
2181 var_get_name (var));
2184 var_set_role (var, role);
2189 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2194 check_overflow (struct sfm_reader *r,
2195 const struct sfm_extension_record *record,
2196 size_t ofs, size_t length)
2198 size_t end = record->size * record->count;
2199 if (length >= end || ofs + length > end)
2201 sys_error (r, record->pos + end,
2202 _("Extension record subtype %d ends unexpectedly."),
2210 parse_long_string_value_labels (struct sfm_reader *r,
2211 const struct sfm_extension_record *record,
2212 struct dictionary *dict)
2214 const char *dict_encoding = dict_get_encoding (dict);
2215 size_t end = record->size * record->count;
2222 struct variable *var;
2227 /* Parse variable name length. */
2228 if (!check_overflow (r, record, ofs, 4))
2230 var_name_len = parse_int (r, record->data, ofs);
2233 /* Parse variable name, width, and number of labels. */
2234 if (!check_overflow (r, record, ofs, var_name_len + 8))
2236 var_name = recode_string_pool ("UTF-8", dict_encoding,
2237 (const char *) record->data + ofs,
2238 var_name_len, r->pool);
2239 width = parse_int (r, record->data, ofs + var_name_len);
2240 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2241 ofs += var_name_len + 8;
2243 /* Look up 'var' and validate. */
2244 var = dict_lookup_var (dict, var_name);
2246 sys_warn (r, record->pos + ofs,
2247 _("Ignoring long string value label record for "
2248 "unknown variable %s."), var_name);
2249 else if (var_is_numeric (var))
2251 sys_warn (r, record->pos + ofs,
2252 _("Ignoring long string value label record for "
2253 "numeric variable %s."), var_name);
2256 else if (width != var_get_width (var))
2258 sys_warn (r, record->pos + ofs,
2259 _("Ignoring long string value label record for variable "
2260 "%s because the record's width (%d) does not match the "
2261 "variable's width (%d)."),
2262 var_name, width, var_get_width (var));
2267 value_init_pool (r->pool, &value, width);
2268 for (i = 0; i < n_labels; i++)
2270 size_t value_length, label_length;
2271 bool skip = var == NULL;
2273 /* Parse value length. */
2274 if (!check_overflow (r, record, ofs, 4))
2276 value_length = parse_int (r, record->data, ofs);
2280 if (!check_overflow (r, record, ofs, value_length))
2284 if (value_length == width)
2285 memcpy (value_str_rw (&value, width),
2286 (const uint8_t *) record->data + ofs, width);
2289 sys_warn (r, record->pos + ofs,
2290 _("Ignoring long string value label %zu for "
2291 "variable %s, with width %d, that has bad value "
2293 i, var_get_name (var), width, value_length);
2297 ofs += value_length;
2299 /* Parse label length. */
2300 if (!check_overflow (r, record, ofs, 4))
2302 label_length = parse_int (r, record->data, ofs);
2306 if (!check_overflow (r, record, ofs, label_length))
2312 label = recode_string_pool ("UTF-8", dict_encoding,
2313 (const char *) record->data + ofs,
2314 label_length, r->pool);
2315 if (!var_add_value_label (var, &value, label))
2316 sys_warn (r, record->pos + ofs,
2317 _("Duplicate value label for `%.*s' on %s."),
2318 width, value_str (&value, width),
2319 var_get_name (var));
2320 pool_free (r->pool, label);
2322 ofs += label_length;
2330 parse_long_string_missing_values (struct sfm_reader *r,
2331 const struct sfm_extension_record *record,
2332 struct dictionary *dict)
2334 const char *dict_encoding = dict_get_encoding (dict);
2335 size_t end = record->size * record->count;
2340 struct missing_values mv;
2342 struct variable *var;
2343 int n_missing_values;
2347 /* Parse variable name length. */
2348 if (!check_overflow (r, record, ofs, 4))
2350 var_name_len = parse_int (r, record->data, ofs);
2353 /* Parse variable name. */
2354 if (!check_overflow (r, record, ofs, var_name_len + 1))
2356 var_name = recode_string_pool ("UTF-8", dict_encoding,
2357 (const char *) record->data + ofs,
2358 var_name_len, r->pool);
2359 ofs += var_name_len;
2361 /* Parse number of missing values. */
2362 n_missing_values = ((const uint8_t *) record->data)[ofs];
2363 if (n_missing_values < 1 || n_missing_values > 3)
2364 sys_warn (r, record->pos + ofs,
2365 _("Long string missing values record says variable %s "
2366 "has %d missing values, but only 1 to 3 missing values "
2368 var_name, n_missing_values);
2371 /* Look up 'var' and validate. */
2372 var = dict_lookup_var (dict, var_name);
2374 sys_warn (r, record->pos + ofs,
2375 _("Ignoring long string missing value record for "
2376 "unknown variable %s."), var_name);
2377 else if (var_is_numeric (var))
2379 sys_warn (r, record->pos + ofs,
2380 _("Ignoring long string missing value record for "
2381 "numeric variable %s."), var_name);
2386 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2387 for (i = 0; i < n_missing_values; i++)
2389 size_t value_length;
2391 /* Parse value length. */
2392 if (!check_overflow (r, record, ofs, 4))
2394 value_length = parse_int (r, record->data, ofs);
2398 if (!check_overflow (r, record, ofs, value_length))
2402 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2404 sys_warn (r, record->pos + ofs,
2405 _("Ignoring long string missing value %zu for variable "
2406 "%s, with width %d, that has bad value width %zu."),
2407 i, var_get_name (var), var_get_width (var),
2409 ofs += value_length;
2412 var_set_missing_values (var, &mv);
2420 static void partial_record (struct sfm_reader *);
2422 static void read_error (struct casereader *, const struct sfm_reader *);
2424 static bool read_case_number (struct sfm_reader *, double *);
2425 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2426 static int read_opcode (struct sfm_reader *);
2427 static bool read_compressed_number (struct sfm_reader *, double *);
2428 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2429 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2430 static bool skip_whole_strings (struct sfm_reader *, size_t);
2432 /* Reads and returns one case from READER's file. Returns a null
2433 pointer if not successful. */
2434 static struct ccase *
2435 sys_file_casereader_read (struct casereader *reader, void *r_)
2437 struct sfm_reader *r = r_;
2445 c = case_create (r->proto);
2447 for (i = 0; i < r->sfm_var_cnt; i++)
2449 struct sfm_var *sv = &r->sfm_vars[i];
2450 union value *v = case_data_rw_idx (c, sv->case_index);
2452 if (sv->var_width == 0)
2453 retval = read_case_number (r, &v->f);
2456 uint8_t *s = value_str_rw (v, sv->var_width);
2457 retval = read_case_string (r, s + sv->offset, sv->segment_width);
2460 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2462 sys_error (r, r->pos, _("File ends in partial string value."));
2474 if (r->case_cnt != -1)
2475 read_error (reader, r);
2480 /* Issues an error that R ends in a partial record. */
2482 partial_record (struct sfm_reader *r)
2484 sys_error (r, r->pos, _("File ends in partial case."));
2487 /* Issues an error that an unspecified error occurred SFM, and
2490 read_error (struct casereader *r, const struct sfm_reader *sfm)
2492 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2493 casereader_force_error (r);
2496 /* Reads a number from R and stores its value in *D.
2497 If R is compressed, reads a compressed number;
2498 otherwise, reads a number in the regular way.
2499 Returns true if successful, false if end of file is
2500 reached immediately. */
2502 read_case_number (struct sfm_reader *r, double *d)
2504 if (r->compression == SFM_COMP_NONE)
2507 if (!try_read_bytes (r, number, sizeof number))
2509 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2513 return read_compressed_number (r, d);
2516 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2517 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2518 discarded without being written to S. Reads compressed strings if S is
2519 compressed. Returns 1 if successful, 0 if end of file is reached
2520 immediately, or -1 for some kind of error. */
2522 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2524 size_t whole = ROUND_DOWN (length, 8);
2525 size_t partial = length % 8;
2529 int retval = read_whole_strings (r, s, whole);
2537 int retval = read_whole_strings (r, bounce, sizeof bounce);
2549 memcpy (s + whole, bounce, partial);
2555 /* Reads and returns the next compression opcode from R. */
2557 read_opcode (struct sfm_reader *r)
2559 assert (r->compression != SFM_COMP_NONE);
2563 if (r->opcode_idx >= sizeof r->opcodes)
2566 int retval = try_read_compressed_bytes (r, r->opcodes,
2572 opcode = r->opcodes[r->opcode_idx++];
2579 /* Reads a compressed number from R and stores its value in D.
2580 Returns true if successful, false if end of file is
2581 reached immediately. */
2583 read_compressed_number (struct sfm_reader *r, double *d)
2585 int opcode = read_opcode (r);
2593 return read_compressed_float (r, d);
2596 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2597 if (!r->corruption_warning)
2599 r->corruption_warning = true;
2600 sys_warn (r, r->pos,
2601 _("Possible compressed data corruption: "
2602 "compressed spaces appear in numeric field."));
2611 *d = opcode - r->bias;
2618 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2620 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2625 opcode = read_opcode (r);
2633 retval = read_compressed_bytes (r, dst, 8);
2634 return retval == 1 ? 1 : -1;
2637 memset (dst, ' ', 8);
2642 double value = opcode - r->bias;
2643 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2646 /* This has actually been seen "in the wild". The submitter of the
2647 file that showed that the contents decoded as spaces, but they
2648 were at the end of the field so it's possible that the null
2649 bytes just acted as null terminators. */
2651 else if (!r->corruption_warning)
2653 r->corruption_warning = true;
2654 sys_warn (r, r->pos,
2655 _("Possible compressed data corruption: "
2656 "string contains compressed integer (opcode %d)."),
2664 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2665 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2666 end of file is reached immediately, or -1 for some kind of error. */
2668 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2670 assert (length % 8 == 0);
2671 if (r->compression == SFM_COMP_NONE)
2672 return try_read_bytes (r, s, length);
2677 for (ofs = 0; ofs < length; ofs += 8)
2679 int retval = read_compressed_string (r, s + ofs);
2694 /* Skips LENGTH string bytes from R.
2695 LENGTH must be a multiple of 8.
2696 (LENGTH is also limited to 1024, but that's only because the
2697 current caller never needs more than that many bytes.)
2698 Returns true if successful, false if end of file is
2699 reached immediately. */
2701 skip_whole_strings (struct sfm_reader *r, size_t length)
2703 uint8_t buffer[1024];
2704 assert (length < sizeof buffer);
2705 return read_whole_strings (r, buffer, length);
2708 /* Helpers for reading records that contain structured text
2711 /* Maximum number of warnings to issue for a single text
2713 #define MAX_TEXT_WARNINGS 5
2718 struct substring buffer; /* Record contents. */
2719 off_t start; /* Starting offset in file. */
2720 size_t pos; /* Current position in buffer. */
2721 int n_warnings; /* Number of warnings issued or suppressed. */
2722 bool recoded; /* Recoded into UTF-8? */
2725 static struct text_record *
2726 open_text_record (struct sfm_reader *r,
2727 const struct sfm_extension_record *record,
2728 bool recode_to_utf8)
2730 struct text_record *text;
2731 struct substring raw;
2733 text = pool_alloc (r->pool, sizeof *text);
2734 raw = ss_buffer (record->data, record->size * record->count);
2735 text->start = record->pos;
2736 text->buffer = (recode_to_utf8
2737 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2740 text->n_warnings = 0;
2741 text->recoded = recode_to_utf8;
2746 /* Closes TEXT, frees its storage, and issues a final warning
2747 about suppressed warnings if necesary. */
2749 close_text_record (struct sfm_reader *r, struct text_record *text)
2751 if (text->n_warnings > MAX_TEXT_WARNINGS)
2752 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2753 text->n_warnings - MAX_TEXT_WARNINGS);
2755 pool_free (r->pool, ss_data (text->buffer));
2758 /* Reads a variable=value pair from TEXT.
2759 Looks up the variable in DICT and stores it into *VAR.
2760 Stores a null-terminated value into *VALUE. */
2762 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2763 struct text_record *text,
2764 struct variable **var, char **value)
2768 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2771 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2775 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2776 ss_buffer ("\t\0", 2));
2784 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2785 struct text_record *text, struct substring delimiters,
2786 struct variable **var)
2790 name = text_get_token (text, delimiters, NULL);
2794 *var = dict_lookup_var (dict, name);
2798 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2805 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2806 struct text_record *text, struct substring delimiters,
2807 struct variable **var)
2809 char *short_name = text_get_token (text, delimiters, NULL);
2810 if (short_name == NULL)
2813 *var = dict_lookup_var (dict, short_name);
2815 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2820 /* Displays a warning for the current file position, limiting the
2821 number to MAX_TEXT_WARNINGS for TEXT. */
2823 text_warn (struct sfm_reader *r, struct text_record *text,
2824 const char *format, ...)
2826 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2830 va_start (args, format);
2831 sys_msg (r, text->start + text->pos, MW, format, args);
2837 text_get_token (struct text_record *text, struct substring delimiters,
2840 struct substring token;
2843 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2846 end = &ss_data (token)[ss_length (token)];
2847 if (delimiter != NULL)
2850 return ss_data (token);
2853 /* Reads a integer value expressed in decimal, then a space, then a string that
2854 consists of exactly as many bytes as specified by the integer, then a space,
2855 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2856 buffer (so the caller should not free the string). */
2858 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2866 while (text->pos < text->buffer.length)
2868 int c = text->buffer.string[text->pos];
2869 if (c < '0' || c > '9')
2871 n = (n * 10) + (c - '0');
2874 if (text->pos >= text->buffer.length || start == text->pos)
2876 sys_warn (r, text->start,
2877 _("Expecting digit at offset %zu in MRSETS record."),
2882 if (!text_match (text, ' '))
2884 sys_warn (r, text->start,
2885 _("Expecting space at offset %zu in MRSETS record."),
2890 if (text->pos + n > text->buffer.length)
2892 sys_warn (r, text->start,
2893 _("%zu-byte string starting at offset %zu "
2894 "exceeds record length %zu."),
2895 n, text->pos, text->buffer.length);
2899 s = &text->buffer.string[text->pos];
2902 sys_warn (r, text->start,
2903 _("Expecting space at offset %zu following %zu-byte string."),
2913 text_match (struct text_record *text, char c)
2915 if (text->buffer.string[text->pos] == c)
2924 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2925 inside the TEXT's string. */
2927 text_pos (const struct text_record *text)
2933 text_get_all (const struct text_record *text)
2935 return text->buffer.string;
2940 /* Displays a corruption message. */
2942 sys_msg (struct sfm_reader *r, off_t offset,
2943 int class, const char *format, va_list args)
2948 ds_init_empty (&text);
2950 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2951 fh_get_file_name (r->fh), (long long int) offset);
2953 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2954 ds_put_vformat (&text, format, args);
2956 m.category = msg_class_to_category (class);
2957 m.severity = msg_class_to_severity (class);
2963 m.text = ds_cstr (&text);
2968 /* Displays a warning for offset OFFSET in the file. */
2970 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2974 va_start (args, format);
2975 sys_msg (r, offset, MW, format, args);
2979 /* Displays an error for the current file position,
2980 marks it as in an error state,
2981 and aborts reading it using longjmp. */
2983 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2987 va_start (args, format);
2988 sys_msg (r, offset, ME, format, args);
2994 /* Reads BYTE_CNT bytes into BUF.
2995 Returns 1 if exactly BYTE_CNT bytes are successfully read.
2996 Returns -1 if an I/O error or a partial read occurs.
2997 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3000 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3001 void *buf, size_t byte_cnt)
3003 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3004 r->pos += bytes_read;
3005 if (bytes_read == byte_cnt)
3007 else if (ferror (r->file))
3009 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3012 else if (!eof_is_ok || bytes_read != 0)
3014 sys_error (r, r->pos, _("Unexpected end of file."));
3021 /* Reads BYTE_CNT into BUF.
3022 Returns true if successful.
3023 Returns false upon I/O error or if end-of-file is encountered. */
3025 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3027 return read_bytes_internal (r, false, buf, byte_cnt) == 1;
3030 /* Reads BYTE_CNT bytes into BUF.
3031 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3032 Returns 0 if an immediate end-of-file is encountered.
3033 Returns -1 if an I/O error or a partial read occurs. */
3035 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3037 return read_bytes_internal (r, true, buf, byte_cnt);
3040 /* Reads a 32-bit signed integer from R and stores its value in host format in
3041 *X. Returns true if successful, otherwise false. */
3043 read_int (struct sfm_reader *r, int *x)
3046 if (read_bytes (r, integer, sizeof integer) != 1)
3048 *x = integer_get (r->integer_format, integer, sizeof integer);
3053 read_uint (struct sfm_reader *r, unsigned int *x)
3058 ok = read_int (r, &y);
3063 /* Reads a 64-bit signed integer from R and returns its value in
3066 read_int64 (struct sfm_reader *r, long long int *x)
3069 if (read_bytes (r, integer, sizeof integer) != 1)
3071 *x = integer_get (r->integer_format, integer, sizeof integer);
3075 /* Reads a 64-bit signed integer from R and returns its value in
3078 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3083 ok = read_int64 (r, &y);
3089 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3091 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3095 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3097 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3100 /* Reads exactly SIZE - 1 bytes into BUFFER
3101 and stores a null byte into BUFFER[SIZE - 1]. */
3103 read_string (struct sfm_reader *r, char *buffer, size_t size)
3108 ok = read_bytes (r, buffer, size - 1);
3110 buffer[size - 1] = '\0';
3114 /* Skips BYTES bytes forward in R. */
3116 skip_bytes (struct sfm_reader *r, size_t bytes)
3121 size_t chunk = MIN (sizeof buffer, bytes);
3122 if (!read_bytes (r, buffer, chunk))
3130 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3131 been replaced by LFs.
3133 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3134 files that use CR-only line ends in the file label and extra product
3137 fix_line_ends (const char *s)
3141 d = dst = xmalloc (strlen (s) + 1);
3160 read_ztrailer (struct sfm_reader *r,
3161 long long int zheader_ofs,
3162 long long int ztrailer_len);
3165 zalloc (voidpf pool_, uInt items, uInt size)
3167 struct pool *pool = pool_;
3169 return (!size || xalloc_oversized (items, size)
3171 : pool_malloc (pool, items * size));
3175 zfree (voidpf pool_, voidpf address)
3177 struct pool *pool = pool_;
3179 pool_free (pool, address);
3183 read_zheader (struct sfm_reader *r)
3186 long long int zheader_ofs;
3187 long long int ztrailer_ofs;
3188 long long int ztrailer_len;
3190 if (!read_int64 (r, &zheader_ofs)
3191 || !read_int64 (r, &ztrailer_ofs)
3192 || !read_int64 (r, &ztrailer_len))
3195 if (zheader_ofs != pos)
3197 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3198 "(expected %#llx)."),
3199 zheader_ofs, (long long int) pos);
3203 if (ztrailer_ofs < r->pos)
3205 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3210 if (ztrailer_len < 24 || ztrailer_len % 24)
3212 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3216 r->ztrailer_ofs = ztrailer_ofs;
3217 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3220 if (r->zin_buf == NULL)
3222 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3223 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3224 r->zstream.next_in = NULL;
3225 r->zstream.avail_in = 0;
3228 r->zstream.zalloc = zalloc;
3229 r->zstream.zfree = zfree;
3230 r->zstream.opaque = r->pool;
3232 return open_zstream (r);
3236 seek (struct sfm_reader *r, off_t offset)
3238 if (fseeko (r->file, offset, SEEK_SET))
3239 sys_error (r, 0, _("%s: seek failed (%s)."),
3240 fh_get_file_name (r->fh), strerror (errno));
3244 /* Performs some additional consistency checks on the ZLIB compressed data
3247 read_ztrailer (struct sfm_reader *r,
3248 long long int zheader_ofs,
3249 long long int ztrailer_len)
3251 long long int expected_uncmp_ofs;
3252 long long int expected_cmp_ofs;
3255 unsigned int block_size;
3256 unsigned int n_blocks;
3260 if (fstat (fileno (r->file), &s))
3262 sys_error (ME, 0, _("%s: stat failed (%s)."),
3263 fh_get_file_name (r->fh), strerror (errno));
3267 if (!S_ISREG (s.st_mode))
3269 /* We can't seek to the trailer and then back to the data in this file,
3270 so skip doing extra checks. */
3274 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3275 sys_warn (r, r->pos,
3276 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3277 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3279 seek (r, r->ztrailer_ofs);
3281 /* Read fixed header from ZLIB data trailer. */
3282 if (!read_int64 (r, &bias))
3284 if (-bias != r->bias)
3286 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3287 "file header bias (%.2f)."),
3292 if (!read_int64 (r, &zero))
3295 sys_warn (r, r->pos,
3296 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3298 if (!read_uint (r, &block_size))
3300 if (block_size != ZBLOCK_SIZE)
3301 sys_warn (r, r->pos,
3302 _("ZLIB trailer specifies unexpected %u-byte block size."),
3305 if (!read_uint (r, &n_blocks))
3307 if (n_blocks != (ztrailer_len - 24) / 24)
3309 sys_error (r, r->pos,
3310 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3312 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3316 expected_uncmp_ofs = zheader_ofs;
3317 expected_cmp_ofs = zheader_ofs + 24;
3318 for (i = 0; i < n_blocks; i++)
3320 off_t desc_ofs = r->pos;
3321 unsigned long long int uncompressed_ofs;
3322 unsigned long long int compressed_ofs;
3323 unsigned int uncompressed_size;
3324 unsigned int compressed_size;
3326 if (!read_uint64 (r, &uncompressed_ofs)
3327 || !read_uint64 (r, &compressed_ofs)
3328 || !read_uint (r, &uncompressed_size)
3329 || !read_uint (r, &compressed_size))
3332 if (uncompressed_ofs != expected_uncmp_ofs)
3334 sys_error (r, desc_ofs,
3335 _("ZLIB block descriptor %u reported uncompressed data "
3336 "offset %#llx, when %#llx was expected."),
3337 i, uncompressed_ofs, expected_uncmp_ofs);
3341 if (compressed_ofs != expected_cmp_ofs)
3343 sys_error (r, desc_ofs,
3344 _("ZLIB block descriptor %u reported compressed data "
3345 "offset %#llx, when %#llx was expected."),
3346 i, compressed_ofs, expected_cmp_ofs);
3350 if (i < n_blocks - 1)
3352 if (uncompressed_size != block_size)
3353 sys_warn (r, desc_ofs,
3354 _("ZLIB block descriptor %u reported block size %#x, "
3355 "when %#x was expected."),
3356 i, uncompressed_size, block_size);
3360 if (uncompressed_size > block_size)
3361 sys_warn (r, desc_ofs,
3362 _("ZLIB block descriptor %u reported block size %#x, "
3363 "when at most %#x was expected."),
3364 i, uncompressed_size, block_size);
3367 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3368 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3369 This code checks for an expansion of more than 14.3% plus 11
3371 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3373 sys_error (r, desc_ofs,
3374 _("ZLIB block descriptor %u reports compressed size %u "
3375 "and uncompressed size %u."),
3376 i, compressed_size, uncompressed_size);
3380 expected_uncmp_ofs += uncompressed_size;
3381 expected_cmp_ofs += compressed_size;
3384 if (expected_cmp_ofs != r->ztrailer_ofs)
3386 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3387 "would be expected from block descriptors."),
3388 r->ztrailer_ofs, expected_cmp_ofs);
3392 seek (r, zheader_ofs + 24);
3397 open_zstream (struct sfm_reader *r)
3401 r->zout_pos = r->zout_end = 0;
3402 error = inflateInit (&r->zstream);
3405 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3413 close_zstream (struct sfm_reader *r)
3417 error = inflateEnd (&r->zstream);
3420 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3428 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3430 uint8_t *buf = buf_;
3439 /* Use already inflated data if there is any. */
3440 if (r->zout_pos < r->zout_end)
3442 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3443 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3452 /* We need to inflate some more data.
3453 Get some more input data if we don't have any. */
3454 if (r->zstream.avail_in == 0)
3456 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3461 int retval = try_read_bytes (r, r->zin_buf, n);
3464 r->zstream.avail_in = n;
3465 r->zstream.next_in = r->zin_buf;
3469 /* Inflate the (remaining) input data. */
3470 r->zstream.avail_out = ZOUT_BUF_SIZE;
3471 r->zstream.next_out = r->zout_buf;
3472 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3474 r->zout_end = r->zstream.next_out - r->zout_buf;
3475 if (r->zout_end == 0)
3477 if (error != Z_STREAM_END)
3479 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3483 else if (!close_zstream (r) || !open_zstream (r))
3488 /* Process the output data and ignore 'error' for now. ZLIB will
3489 present it to us again on the next inflate() call. */
3495 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3497 if (r->compression == SFM_COMP_SIMPLE)
3498 return read_bytes (r, buf, byte_cnt);
3501 int retval = read_bytes_zlib (r, buf, byte_cnt);
3503 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3509 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3511 if (r->compression == SFM_COMP_SIMPLE)
3512 return try_read_bytes (r, buf, byte_cnt);
3514 return read_bytes_zlib (r, buf, byte_cnt);
3517 /* Reads a 64-bit floating-point number from R and returns its
3518 value in host format. */
3520 read_compressed_float (struct sfm_reader *r, double *d)
3524 if (!read_compressed_bytes (r, number, sizeof number))
3527 *d = float_get_double (r->float_format, number);
3531 static const struct casereader_class sys_file_casereader_class =
3533 sys_file_casereader_read,
3534 sys_file_casereader_destroy,