1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/message.h"
49 #include "libpspp/misc.h"
50 #include "libpspp/pool.h"
51 #include "libpspp/str.h"
52 #include "libpspp/stringi-set.h"
54 #include "gl/c-strtod.h"
55 #include "gl/c-ctype.h"
56 #include "gl/inttostr.h"
57 #include "gl/localcharset.h"
58 #include "gl/minmax.h"
59 #include "gl/unlocked-io.h"
60 #include "gl/xalloc.h"
61 #include "gl/xalloc-oversized.h"
65 #define _(msgid) gettext (msgid)
66 #define N_(msgid) (msgid)
70 /* subtypes 0-2 unknown */
71 EXT_INTEGER = 3, /* Machine integer info. */
72 EXT_FLOAT = 4, /* Machine floating-point info. */
73 EXT_VAR_SETS = 5, /* Variable sets. */
74 EXT_DATE = 6, /* DATE. */
75 EXT_MRSETS = 7, /* Multiple response sets. */
76 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
77 /* subtype 9 unknown */
78 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
79 EXT_DISPLAY = 11, /* Variable display parameters. */
80 /* subtype 12 unknown */
81 EXT_LONG_NAMES = 13, /* Long variable names. */
82 EXT_LONG_STRINGS = 14, /* Long strings. */
83 /* subtype 15 unknown */
84 EXT_NCASES = 16, /* Extended number of cases. */
85 EXT_FILE_ATTRS = 17, /* Data file attributes. */
86 EXT_VAR_ATTRS = 18, /* Variable attributes. */
87 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
88 EXT_ENCODING = 20, /* Character encoding. */
89 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
90 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
91 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
94 /* Fields from the top-level header record. */
95 struct sfm_header_record
97 char magic[5]; /* First 4 bytes of file, then null. */
98 int weight_idx; /* 0 if unweighted, otherwise a var index. */
99 int nominal_case_size; /* Number of var positions. */
101 /* These correspond to the members of struct sfm_file_info or a dictionary
102 but in the system file's encoding rather than ASCII. */
103 char creation_date[10]; /* "dd mmm yy". */
104 char creation_time[9]; /* "hh:mm:ss". */
105 char eye_catcher[61]; /* Eye-catcher string, then product name. */
106 char file_label[65]; /* File label. */
109 struct sfm_var_record
116 int missing_value_code;
119 struct variable *var;
122 struct sfm_value_label
128 struct sfm_value_label_record
131 struct sfm_value_label *labels;
132 unsigned int n_labels;
138 struct sfm_document_record
147 const char *name; /* Name. */
148 const char *label; /* Human-readable label for group. */
149 enum mrset_type type; /* Group type. */
150 const char **vars; /* Constituent variables' names. */
151 size_t n_vars; /* Number of constituent variables. */
154 enum mrset_md_cat_source cat_source; /* Source of category labels. */
155 bool label_from_var_label; /* 'label' taken from variable label? */
156 const char *counted; /* Counted value, as string. */
159 struct sfm_extension_record
161 int subtype; /* Record subtype. */
162 off_t pos; /* Starting offset in file. */
163 size_t size; /* Size of data elements. */
164 size_t count; /* Number of data elements. */
165 void *data; /* Contents. */
168 /* System file reader. */
171 /* Resource tracking. */
172 struct pool *pool; /* All system file state. */
175 struct sfm_read_info info;
176 struct sfm_header_record header;
177 struct sfm_var_record *vars;
179 struct sfm_value_label_record *labels;
181 struct sfm_document_record *document;
182 struct sfm_mrset *mrsets;
184 struct sfm_extension_record *extensions[32];
187 struct file_handle *fh; /* File handle. */
188 struct fh_lock *lock; /* Mutual exclusion for file handle. */
189 FILE *file; /* File stream. */
190 off_t pos; /* Position in file. */
191 bool error; /* I/O or corruption error? */
192 struct caseproto *proto; /* Format of output cases. */
195 enum integer_format integer_format; /* On-disk integer format. */
196 enum float_format float_format; /* On-disk floating point format. */
197 struct sfm_var *sfm_vars; /* Variables. */
198 size_t sfm_var_cnt; /* Number of variables. */
199 int case_cnt; /* Number of cases */
200 const char *encoding; /* String encoding. */
203 enum sfm_compression compression;
204 double bias; /* Compression bias, usually 100.0. */
205 uint8_t opcodes[8]; /* Current block of opcodes. */
206 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
207 bool corruption_warning; /* Warned about possible corruption? */
209 /* ZLIB decompression. */
210 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
211 #define ZIN_BUF_SIZE 4096
212 uint8_t *zin_buf; /* Inflation input buffer. */
213 #define ZOUT_BUF_SIZE 16384
214 uint8_t *zout_buf; /* Inflation output buffer. */
215 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
216 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
217 z_stream zstream; /* ZLIB inflater. */
220 static const struct casereader_class sys_file_casereader_class;
222 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
223 const struct sfm_var_record *,
226 static void sys_msg (struct sfm_reader *r, off_t, int class,
227 const char *format, va_list args)
228 PRINTF_FORMAT (4, 0);
229 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
230 PRINTF_FORMAT (3, 4);
231 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
232 PRINTF_FORMAT (3, 4);
234 static bool read_bytes (struct sfm_reader *, void *, size_t)
236 static int try_read_bytes (struct sfm_reader *, void *, size_t)
238 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
239 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
240 static bool read_int64 (struct sfm_reader *, long long int *)
242 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
244 static bool read_string (struct sfm_reader *, char *, size_t)
246 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
248 /* ZLIB compressed data handling. */
249 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
250 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
251 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
252 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
254 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
256 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
258 static bool read_compressed_float (struct sfm_reader *, double *)
261 static char *fix_line_ends (const char *);
263 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
264 static double parse_float (const struct sfm_reader *,
265 const void *data, size_t ofs);
267 static bool read_variable_record (struct sfm_reader *,
268 struct sfm_var_record *);
269 static bool read_value_label_record (struct sfm_reader *,
270 struct sfm_value_label_record *);
271 static struct sfm_document_record *read_document_record (struct sfm_reader *);
272 static bool read_extension_record (struct sfm_reader *, int subtype,
273 struct sfm_extension_record **);
274 static bool skip_extension_record (struct sfm_reader *, int subtype);
276 static struct text_record *open_text_record (
277 struct sfm_reader *, const struct sfm_extension_record *,
278 bool recode_to_utf8);
279 static void close_text_record (struct sfm_reader *,
280 struct text_record *);
281 static bool read_variable_to_value_pair (struct sfm_reader *,
283 struct text_record *,
284 struct variable **var, char **value);
285 static void text_warn (struct sfm_reader *r, struct text_record *text,
286 const char *format, ...)
287 PRINTF_FORMAT (3, 4);
288 static char *text_get_token (struct text_record *,
289 struct substring delimiters, char *delimiter);
290 static bool text_match (struct text_record *, char c);
291 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
292 struct text_record *,
293 struct substring delimiters,
295 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
296 struct text_record *,
297 struct substring delimiters,
299 static const char *text_parse_counted_string (struct sfm_reader *,
300 struct text_record *);
301 static size_t text_pos (const struct text_record *);
302 static const char *text_get_all (const struct text_record *);
304 /* Dictionary reader. */
312 static bool read_dictionary (struct sfm_reader *);
313 static bool read_record (struct sfm_reader *, int type,
314 size_t *allocated_vars, size_t *allocated_labels);
315 static bool read_header (struct sfm_reader *, struct sfm_read_info *,
316 struct sfm_header_record *);
317 static void parse_header (struct sfm_reader *,
318 const struct sfm_header_record *,
319 struct sfm_read_info *, struct dictionary *);
320 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
321 struct sfm_var_record *, size_t n);
322 static void parse_format_spec (struct sfm_reader *, off_t pos,
323 unsigned int format, enum which_format,
324 struct variable *, int *format_warning_cnt);
325 static void parse_document (struct dictionary *, struct sfm_document_record *);
326 static void parse_display_parameters (struct sfm_reader *,
327 const struct sfm_extension_record *,
328 struct dictionary *);
329 static bool parse_machine_integer_info (struct sfm_reader *,
330 const struct sfm_extension_record *,
331 struct sfm_read_info *);
332 static void parse_machine_float_info (struct sfm_reader *,
333 const struct sfm_extension_record *);
334 static void parse_extra_product_info (struct sfm_reader *,
335 const struct sfm_extension_record *,
336 struct sfm_read_info *);
337 static void parse_mrsets (struct sfm_reader *,
338 const struct sfm_extension_record *,
339 size_t *allocated_mrsets);
340 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
341 static void parse_long_var_name_map (struct sfm_reader *,
342 const struct sfm_extension_record *,
343 struct dictionary *);
344 static bool parse_long_string_map (struct sfm_reader *,
345 const struct sfm_extension_record *,
346 struct dictionary *);
347 static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
348 const struct sfm_var_record *,
350 const struct sfm_value_label_record *);
351 static void parse_data_file_attributes (struct sfm_reader *,
352 const struct sfm_extension_record *,
353 struct dictionary *);
354 static void parse_variable_attributes (struct sfm_reader *,
355 const struct sfm_extension_record *,
356 struct dictionary *);
357 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
358 static bool parse_long_string_value_labels (struct sfm_reader *,
359 const struct sfm_extension_record *,
360 struct dictionary *);
361 static bool parse_long_string_missing_values (
362 struct sfm_reader *, const struct sfm_extension_record *,
363 struct dictionary *);
365 /* Frees the strings inside INFO. */
367 sfm_read_info_destroy (struct sfm_read_info *info)
371 free (info->creation_date);
372 free (info->creation_time);
373 free (info->product);
374 free (info->product_ext);
378 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
379 successful, otherwise NULL. */
381 sfm_open (struct file_handle *fh)
383 size_t allocated_mrsets = 0;
384 struct sfm_reader *r;
386 /* Create and initialize reader. */
387 r = xzalloc (sizeof *r);
388 r->pool = pool_create ();
389 pool_register (r->pool, free, r);
391 r->opcode_idx = sizeof r->opcodes;
393 /* TRANSLATORS: this fragment will be interpolated into
394 messages in fh_lock() that identify types of files. */
395 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
399 r->file = fn_open (fh_get_file_name (fh), "rb");
402 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
403 fh_get_file_name (r->fh), strerror (errno));
407 if (!read_dictionary (r))
410 if (r->extensions[EXT_MRSETS] != NULL)
411 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
413 if (r->extensions[EXT_MRSETS2] != NULL)
414 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
423 read_dictionary (struct sfm_reader *r)
425 size_t allocated_vars;
426 size_t allocated_labels;
428 if (!read_header (r, &r->info, &r->header))
432 allocated_labels = 0;
437 if (!read_int (r, &type))
441 if (!read_record (r, type, &allocated_vars, &allocated_labels))
445 if (!skip_bytes (r, 4))
448 if (r->compression == SFM_COMP_ZLIB && !read_zheader (r))
455 read_record (struct sfm_reader *r, int type,
456 size_t *allocated_vars, size_t *allocated_labels)
463 if (r->n_vars >= *allocated_vars)
464 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
466 return read_variable_record (r, &r->vars[r->n_vars++]);
469 if (r->n_labels >= *allocated_labels)
470 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
472 return read_value_label_record (r, &r->labels[r->n_labels++]);
475 /* A Type 4 record is always immediately after a type 3 record,
476 so the code for type 3 records reads the type 4 record too. */
477 sys_error (r, r->pos, _("Misplaced type 4 record."));
481 if (r->document != NULL)
483 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
486 r->document = read_document_record (r);
487 return r->document != NULL;
490 if (!read_int (r, &subtype))
493 || subtype >= sizeof r->extensions / sizeof *r->extensions)
496 _("Unrecognized record type 7, subtype %d. Please "
497 "send a copy of this file, and the syntax which "
498 "created it to %s."),
499 subtype, PACKAGE_BUGREPORT);
500 return skip_extension_record (r, subtype);
502 else if (r->extensions[subtype] != NULL)
505 _("Record type 7, subtype %d found here has the same "
506 "type as the record found near offset 0x%llx. "
507 "Please send a copy of this file, and the syntax "
508 "which created it to %s."),
509 subtype, (long long int) r->extensions[subtype]->pos,
511 return skip_extension_record (r, subtype);
514 return read_extension_record (r, subtype, &r->extensions[subtype]);
517 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
524 /* Returns the character encoding obtained from R, or a null pointer if R
525 doesn't have an indication of its character encoding. */
527 sfm_get_encoding (const struct sfm_reader *r)
529 /* The EXT_ENCODING record is the best way to determine dictionary
531 if (r->extensions[EXT_ENCODING])
532 return r->extensions[EXT_ENCODING]->data;
534 /* But EXT_INTEGER is better than nothing as a fallback. */
535 if (r->extensions[EXT_INTEGER])
537 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
538 const char *encoding;
547 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
548 respectively. However, many files have character code 2 but data
549 which are clearly not ASCII. Therefore, ignore these values. */
556 encoding = sys_get_encoding_from_codepage (codepage);
557 if (encoding != NULL)
563 /* If the file magic number is EBCDIC then its character data is too. */
564 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
570 /* Decodes the dictionary read from R, saving it into into *DICT. Character
571 strings in R are decoded using ENCODING, or an encoding obtained from R if
572 ENCODING is null, or the locale encoding if R specifies no encoding.
574 If INFOP is non-null, then it receives additional info about the system
575 file, which the caller must eventually free with sfm_read_info_destroy()
576 when it is no longer needed.
578 This function consumes R. The caller must use it again later, even to
579 destroy it with sfm_close(). */
581 sfm_decode (struct sfm_reader *r, const char *encoding,
582 struct dictionary **dictp, struct sfm_read_info *infop)
584 struct dictionary *dict;
587 if (encoding == NULL)
589 encoding = sfm_get_encoding (r);
590 if (encoding == NULL)
591 encoding = locale_charset ();
594 dict = dict_create (encoding);
595 r->encoding = dict_get_encoding (dict);
597 /* These records don't use variables at all. */
598 if (r->document != NULL)
599 parse_document (dict, r->document);
601 if (r->extensions[EXT_INTEGER] != NULL
602 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
605 if (r->extensions[EXT_FLOAT] != NULL)
606 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
608 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
609 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
611 if (r->extensions[EXT_FILE_ATTRS] != NULL)
612 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
614 parse_header (r, &r->header, &r->info, dict);
616 /* Parse the variable records, the basis of almost everything else. */
617 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
620 /* Parse value labels and the weight variable immediately after the variable
621 records. These records use indexes into var_recs[], so we must parse them
622 before those indexes become invalidated by very long string variables. */
623 for (i = 0; i < r->n_labels; i++)
624 if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
626 if (r->header.weight_idx != 0)
628 struct variable *weight_var;
630 weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
631 r->header.weight_idx);
632 if (weight_var != NULL)
634 if (var_is_numeric (weight_var))
635 dict_set_weight (dict, weight_var);
637 sys_warn (r, -1, _("Ignoring string variable `%s' set "
638 "as weighting variable."),
639 var_get_name (weight_var));
643 if (r->extensions[EXT_DISPLAY] != NULL)
644 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
646 /* The following records use short names, so they need to be parsed before
647 parse_long_var_name_map() changes short names to long names. */
648 decode_mrsets (r, dict);
650 if (r->extensions[EXT_LONG_STRINGS] != NULL
651 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
654 /* Now rename variables to their long names. */
655 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
657 /* The following records use long names, so they need to follow renaming. */
658 if (r->extensions[EXT_VAR_ATTRS] != NULL)
660 parse_variable_attributes (r, r->extensions[EXT_VAR_ATTRS], dict);
662 /* Roles use the $@Role attribute. */
663 assign_variable_roles (r, dict);
666 if (r->extensions[EXT_LONG_LABELS] != NULL
667 && !parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS],
670 if (r->extensions[EXT_LONG_MISSING] != NULL
671 && !parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
675 /* Warn if the actual amount of data per case differs from the
676 amount that the header claims. SPSS version 13 gets this
677 wrong when very long strings are involved, so don't warn in
679 if (r->header.nominal_case_size != -1
680 && r->header.nominal_case_size != r->n_vars
681 && r->info.version_major != 13)
682 sys_warn (r, -1, _("File header claims %d variable positions but "
683 "%zu were read from file."),
684 r->header.nominal_case_size, r->n_vars);
686 /* Create an index of dictionary variable widths for
687 sfm_read_case to use. We cannot use the `struct variable's
688 from the dictionary we created, because the caller owns the
689 dictionary and may destroy or modify its variables. */
690 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
691 pool_register (r->pool, free, r->sfm_vars);
692 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
698 memset (&r->info, 0, sizeof r->info);
701 return casereader_create_sequential
703 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
704 &sys_file_casereader_class, r);
713 /* Closes R, which should have been returned by sfm_open() but not already
714 closed with sfm_decode() or this function.
715 Returns true if an I/O error has occurred on READER, false
718 sfm_close (struct sfm_reader *r)
727 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
729 msg (ME, _("Error closing system file `%s': %s."),
730 fh_get_file_name (r->fh), strerror (errno));
736 sfm_read_info_destroy (&r->info);
741 pool_destroy (r->pool);
746 /* Destroys READER. */
748 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
750 struct sfm_reader *r = r_;
754 /* Returns true if FILE is an SPSS system file,
757 sfm_detect (FILE *file)
761 if (fread (magic, 4, 1, file) != 1)
765 return (!strcmp (ASCII_MAGIC, magic)
766 || !strcmp (ASCII_ZMAGIC, magic)
767 || !strcmp (EBCDIC_MAGIC, magic));
770 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
771 except for the string fields in *INFO, which parse_header() will initialize
772 later once the file's encoding is known. */
774 read_header (struct sfm_reader *r, struct sfm_read_info *info,
775 struct sfm_header_record *header)
777 uint8_t raw_layout_code[4];
782 if (!read_string (r, header->magic, sizeof header->magic)
783 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
786 if (!strcmp (ASCII_MAGIC, header->magic)
787 || !strcmp (EBCDIC_MAGIC, header->magic))
789 else if (!strcmp (ASCII_ZMAGIC, header->magic))
793 sys_error (r, 0, _("This is not an SPSS system file."));
797 /* Identify integer format. */
798 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
800 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
802 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
804 || (r->integer_format != INTEGER_MSB_FIRST
805 && r->integer_format != INTEGER_LSB_FIRST))
807 sys_error (r, 64, _("This is not an SPSS system file."));
811 if (!read_int (r, &header->nominal_case_size))
814 if (header->nominal_case_size < 0
815 || header->nominal_case_size > INT_MAX / 16)
816 header->nominal_case_size = -1;
818 if (!read_int (r, &compressed))
823 r->compression = SFM_COMP_NONE;
824 else if (compressed == 1)
825 r->compression = SFM_COMP_SIMPLE;
826 else if (compressed != 0)
828 sys_error (r, 0, "System file header has invalid compression "
829 "value %d.", compressed);
836 r->compression = SFM_COMP_ZLIB;
839 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
840 "compression value %d.", compressed);
845 if (!read_int (r, &header->weight_idx))
848 if (!read_int (r, &r->case_cnt))
850 if ( r->case_cnt > INT_MAX / 2)
853 /* Identify floating-point format and obtain compression bias. */
854 if (!read_bytes (r, raw_bias, sizeof raw_bias))
856 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
858 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
860 if (memcmp (raw_bias, zero_bias, 8))
861 sys_warn (r, r->pos - 8,
862 _("Compression bias is not the usual "
863 "value of 100, or system file uses unrecognized "
864 "floating-point format."));
867 /* Some software is known to write all-zeros to this
868 field. Such software also writes floating-point
869 numbers in the format that we expect by default
870 (it seems that all software most likely does, in
871 reality), so don't warn in this case. */
874 if (r->integer_format == INTEGER_MSB_FIRST)
875 r->float_format = FLOAT_IEEE_DOUBLE_BE;
877 r->float_format = FLOAT_IEEE_DOUBLE_LE;
879 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
881 if (!read_string (r, header->creation_date, sizeof header->creation_date)
882 || !read_string (r, header->creation_time, sizeof header->creation_time)
883 || !read_string (r, header->file_label, sizeof header->file_label)
884 || !skip_bytes (r, 3))
887 info->integer_format = r->integer_format;
888 info->float_format = r->float_format;
889 info->compression = r->compression;
890 info->case_cnt = r->case_cnt;
895 /* Reads a variable (type 2) record from R into RECORD. */
897 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
899 int has_variable_label;
901 memset (record, 0, sizeof *record);
903 record->pos = r->pos;
904 if (!read_int (r, &record->width)
905 || !read_int (r, &has_variable_label)
906 || !read_int (r, &record->missing_value_code)
907 || !read_int (r, &record->print_format)
908 || !read_int (r, &record->write_format)
909 || !read_bytes (r, record->name, sizeof record->name))
912 if (has_variable_label == 1)
914 enum { MAX_LABEL_LEN = 255 };
915 unsigned int len, read_len;
917 if (!read_uint (r, &len))
920 /* Read up to MAX_LABEL_LEN bytes of label. */
921 read_len = MIN (MAX_LABEL_LEN, len);
922 record->label = pool_malloc (r->pool, read_len + 1);
923 if (!read_string (r, record->label, read_len + 1))
926 /* Skip unread label bytes. */
927 if (!skip_bytes (r, len - read_len))
930 /* Skip label padding up to multiple of 4 bytes. */
931 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
934 else if (has_variable_label != 0)
936 sys_error (r, record->pos,
937 _("Variable label indicator field is not 0 or 1."));
941 /* Set missing values. */
942 if (record->missing_value_code != 0)
944 int code = record->missing_value_code;
945 if (record->width == 0)
947 if (code < -3 || code > 3 || code == -1)
949 sys_error (r, record->pos,
950 _("Numeric missing value indicator field is not "
951 "-3, -2, 0, 1, 2, or 3."));
957 if (code < 1 || code > 3)
959 sys_error (r, record->pos,
960 _("String missing value indicator field is not "
966 if (!read_bytes (r, record->missing, 8 * abs (code)))
973 /* Reads value labels from R into RECORD. */
975 read_value_label_record (struct sfm_reader *r,
976 struct sfm_value_label_record *record)
981 /* Read type 3 record. */
982 record->pos = r->pos;
983 if (!read_uint (r, &record->n_labels))
985 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
987 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
991 record->labels = pool_nmalloc (r->pool, record->n_labels,
992 sizeof *record->labels);
993 for (i = 0; i < record->n_labels; i++)
995 struct sfm_value_label *label = &record->labels[i];
996 unsigned char label_len;
999 if (!read_bytes (r, label->value, sizeof label->value))
1002 /* Read label length. */
1003 if (!read_bytes (r, &label_len, sizeof label_len))
1005 padded_len = ROUND_UP (label_len + 1, 8);
1007 /* Read label, padding. */
1008 label->label = pool_malloc (r->pool, padded_len + 1);
1009 if (!read_bytes (r, label->label, padded_len - 1))
1011 label->label[label_len] = '\0';
1014 /* Read record type of type 4 record. */
1015 if (!read_int (r, &type))
1019 sys_error (r, r->pos - 4,
1020 _("Variable index record (type 4) does not immediately "
1021 "follow value label record (type 3) as it should."));
1025 /* Read number of variables associated with value label from type 4
1027 if (!read_uint (r, &record->n_vars))
1029 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1031 sys_error (r, r->pos - 4,
1032 _("Number of variables associated with a value label (%zu) "
1033 "is not between 1 and the number of variables (%zu)."),
1034 record->n_vars, r->n_vars);
1038 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1039 for (i = 0; i < record->n_vars; i++)
1040 if (!read_int (r, &record->vars[i]))
1046 /* Reads a document record from R and returns it. */
1047 static struct sfm_document_record *
1048 read_document_record (struct sfm_reader *r)
1050 struct sfm_document_record *record;
1053 record = pool_malloc (r->pool, sizeof *record);
1054 record->pos = r->pos;
1056 if (!read_int (r, &n_lines))
1058 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1060 sys_error (r, record->pos,
1061 _("Number of document lines (%d) "
1062 "must be greater than 0 and less than %d."),
1063 n_lines, INT_MAX / DOC_LINE_LENGTH);
1067 record->n_lines = n_lines;
1068 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1069 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1076 read_extension_record_header (struct sfm_reader *r, int subtype,
1077 struct sfm_extension_record *record)
1079 record->subtype = subtype;
1080 record->pos = r->pos;
1081 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1084 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1085 allows an extra byte for a null terminator, used by some
1086 extension processing routines. */
1087 if (record->size != 0
1088 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1090 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1098 /* Reads an extension record from R into RECORD. */
1100 read_extension_record (struct sfm_reader *r, int subtype,
1101 struct sfm_extension_record **recordp)
1103 struct extension_record_type
1110 static const struct extension_record_type types[] =
1112 /* Implemented record types. */
1113 { EXT_INTEGER, 4, 8 },
1114 { EXT_FLOAT, 8, 3 },
1115 { EXT_MRSETS, 1, 0 },
1116 { EXT_PRODUCT_INFO, 1, 0 },
1117 { EXT_DISPLAY, 4, 0 },
1118 { EXT_LONG_NAMES, 1, 0 },
1119 { EXT_LONG_STRINGS, 1, 0 },
1120 { EXT_NCASES, 8, 2 },
1121 { EXT_FILE_ATTRS, 1, 0 },
1122 { EXT_VAR_ATTRS, 1, 0 },
1123 { EXT_MRSETS2, 1, 0 },
1124 { EXT_ENCODING, 1, 0 },
1125 { EXT_LONG_LABELS, 1, 0 },
1126 { EXT_LONG_MISSING, 1, 0 },
1128 /* Ignored record types. */
1129 { EXT_VAR_SETS, 0, 0 },
1131 { EXT_DATA_ENTRY, 0, 0 },
1132 { EXT_DATAVIEW, 0, 0 },
1135 const struct extension_record_type *type;
1136 struct sfm_extension_record *record;
1140 record = pool_malloc (r->pool, sizeof *record);
1141 if (!read_extension_record_header (r, subtype, record))
1143 n_bytes = record->count * record->size;
1145 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1146 if (subtype == type->subtype)
1148 if (type->size > 0 && record->size != type->size)
1149 sys_warn (r, record->pos,
1150 _("Record type 7, subtype %d has bad size %zu "
1151 "(expected %d)."), subtype, record->size, type->size);
1152 else if (type->count > 0 && record->count != type->count)
1153 sys_warn (r, record->pos,
1154 _("Record type 7, subtype %d has bad count %zu "
1155 "(expected %d)."), subtype, record->count, type->count);
1156 else if (type->count == 0 && type->size == 0)
1158 /* Ignore this record. */
1162 char *data = pool_malloc (r->pool, n_bytes + 1);
1163 data[n_bytes] = '\0';
1165 record->data = data;
1166 if (!read_bytes (r, record->data, n_bytes))
1175 sys_warn (r, record->pos,
1176 _("Unrecognized record type 7, subtype %d. Please send a "
1177 "copy of this file, and the syntax which created it to %s."),
1178 subtype, PACKAGE_BUGREPORT);
1181 return skip_bytes (r, n_bytes);
1185 skip_extension_record (struct sfm_reader *r, int subtype)
1187 struct sfm_extension_record record;
1189 return (read_extension_record_header (r, subtype, &record)
1190 && skip_bytes (r, record.count * record.size));
1194 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1195 struct sfm_read_info *info, struct dictionary *dict)
1197 const char *dict_encoding = dict_get_encoding (dict);
1198 struct substring product;
1199 struct substring label;
1202 /* Convert file label to UTF-8 and put it into DICT. */
1203 label = recode_substring_pool ("UTF-8", dict_encoding,
1204 ss_cstr (header->file_label), r->pool);
1205 ss_trim (&label, ss_cstr (" "));
1206 label.string[label.length] = '\0';
1207 fixed_label = fix_line_ends (label.string);
1208 dict_set_label (dict, fixed_label);
1211 /* Put creation date and time in UTF-8 into INFO. */
1212 info->creation_date = recode_string ("UTF-8", dict_encoding,
1213 header->creation_date, -1);
1214 info->creation_time = recode_string ("UTF-8", dict_encoding,
1215 header->creation_time, -1);
1217 /* Put product name into INFO, dropping eye-catcher string if present. */
1218 product = recode_substring_pool ("UTF-8", dict_encoding,
1219 ss_cstr (header->eye_catcher), r->pool);
1220 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1221 ss_trim (&product, ss_cstr (" "));
1222 info->product = ss_xstrdup (product);
1225 /* Reads a variable (type 2) record from R and adds the
1226 corresponding variable to DICT.
1227 Also skips past additional variable records for long string
1230 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1231 struct sfm_var_record *var_recs, size_t n_var_recs)
1233 const char *dict_encoding = dict_get_encoding (dict);
1234 struct sfm_var_record *rec;
1237 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1239 struct variable *var;
1244 name = recode_string_pool ("UTF-8", dict_encoding,
1245 rec->name, 8, r->pool);
1246 name[strcspn (name, " ")] = '\0';
1248 if (!dict_id_is_valid (dict, name, false)
1249 || name[0] == '$' || name[0] == '#')
1251 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1255 if (rec->width < 0 || rec->width > 255)
1257 sys_error (r, rec->pos,
1258 _("Bad width %d for variable %s."), rec->width, name);
1262 var = rec->var = dict_create_var (dict, name, rec->width);
1265 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1266 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1269 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1273 /* Set the short name the same as the long name. */
1274 var_set_short_name (var, 0, name);
1276 /* Get variable label, if any. */
1281 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1282 rec->label, -1, r->pool);
1283 var_set_label (var, utf8_label, false);
1286 /* Set missing values. */
1287 if (rec->missing_value_code != 0)
1289 int width = var_get_width (var);
1290 struct missing_values mv;
1292 mv_init_pool (r->pool, &mv, width);
1293 if (var_is_numeric (var))
1295 bool has_range = rec->missing_value_code < 0;
1296 int n_discrete = (has_range
1297 ? rec->missing_value_code == -3
1298 : rec->missing_value_code);
1303 double low = parse_float (r, rec->missing, 0);
1304 double high = parse_float (r, rec->missing, 8);
1306 /* Deal with SPSS 21 change in representation. */
1310 mv_add_range (&mv, low, high);
1314 for (i = 0; i < n_discrete; i++)
1316 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1324 value_init_pool (r->pool, &value, width);
1325 value_set_missing (&value, width);
1326 for (i = 0; i < rec->missing_value_code; i++)
1327 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1329 var_set_missing_values (var, &mv);
1333 parse_format_spec (r, rec->pos + 12, rec->print_format,
1334 PRINT_FORMAT, var, &n_warnings);
1335 parse_format_spec (r, rec->pos + 16, rec->write_format,
1336 WRITE_FORMAT, var, &n_warnings);
1338 /* Account for values.
1339 Skip long string continuation records, if any. */
1340 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1341 for (i = 1; i < n_values; i++)
1342 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1344 sys_error (r, rec->pos, _("Missing string continuation record."));
1353 /* Translates the format spec from sysfile format to internal
1356 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1357 enum which_format which, struct variable *v,
1360 const int max_warnings = 8;
1361 uint8_t raw_type = format >> 16;
1362 uint8_t w = format >> 8;
1371 ok = (fmt_from_io (raw_type, &f.type)
1372 && fmt_check_output (&f)
1373 && fmt_check_width_compat (&f, var_get_width (v)));
1378 if (which == PRINT_FORMAT)
1379 var_set_print_format (v, &f);
1381 var_set_write_format (v, &f);
1383 else if (format == 0)
1385 /* Actually observed in the wild. No point in warning about it. */
1387 else if (++*n_warnings <= max_warnings)
1389 if (which == PRINT_FORMAT)
1390 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1392 var_get_name (v), var_get_width (v), format);
1394 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1396 var_get_name (v), var_get_width (v), format);
1398 if (*n_warnings == max_warnings)
1399 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1404 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1408 for (p = record->documents;
1409 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1410 p += DOC_LINE_LENGTH)
1412 struct substring line;
1414 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1415 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1416 ss_rtrim (&line, ss_cstr (" "));
1417 line.string[line.length] = '\0';
1419 dict_add_document_line (dict, line.string, false);
1425 /* Parses record type 7, subtype 3. */
1427 parse_machine_integer_info (struct sfm_reader *r,
1428 const struct sfm_extension_record *record,
1429 struct sfm_read_info *info)
1431 int float_representation, expected_float_format;
1432 int integer_representation, expected_integer_format;
1434 /* Save version info. */
1435 info->version_major = parse_int (r, record->data, 0);
1436 info->version_minor = parse_int (r, record->data, 4);
1437 info->version_revision = parse_int (r, record->data, 8);
1439 /* Check floating point format. */
1440 float_representation = parse_int (r, record->data, 16);
1441 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1442 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1443 expected_float_format = 1;
1444 else if (r->float_format == FLOAT_Z_LONG)
1445 expected_float_format = 2;
1446 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1447 expected_float_format = 3;
1450 if (float_representation != expected_float_format)
1452 sys_error (r, record->pos,
1453 _("Floating-point representation indicated by "
1454 "system file (%d) differs from expected (%d)."),
1455 float_representation, expected_float_format);
1459 /* Check integer format. */
1460 integer_representation = parse_int (r, record->data, 24);
1461 if (r->integer_format == INTEGER_MSB_FIRST)
1462 expected_integer_format = 1;
1463 else if (r->integer_format == INTEGER_LSB_FIRST)
1464 expected_integer_format = 2;
1467 if (integer_representation != expected_integer_format)
1468 sys_warn (r, record->pos,
1469 _("Integer format indicated by system file (%d) "
1470 "differs from expected (%d)."),
1471 integer_representation, expected_integer_format);
1476 /* Parses record type 7, subtype 4. */
1478 parse_machine_float_info (struct sfm_reader *r,
1479 const struct sfm_extension_record *record)
1481 double sysmis = parse_float (r, record->data, 0);
1482 double highest = parse_float (r, record->data, 8);
1483 double lowest = parse_float (r, record->data, 16);
1485 if (sysmis != SYSMIS)
1486 sys_warn (r, record->pos,
1487 _("File specifies unexpected value %g (%a) as %s, "
1488 "instead of %g (%a)."),
1489 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1491 if (highest != HIGHEST)
1492 sys_warn (r, record->pos,
1493 _("File specifies unexpected value %g (%a) as %s, "
1494 "instead of %g (%a)."),
1495 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1497 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1498 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1499 appears in a context (missing values) where SYSMIS cannot. */
1500 if (lowest != LOWEST && lowest != SYSMIS)
1501 sys_warn (r, record->pos,
1502 _("File specifies unexpected value %g (%a) as %s, "
1503 "instead of %g (%a) or %g (%a)."),
1504 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1507 /* Parses record type 7, subtype 10. */
1509 parse_extra_product_info (struct sfm_reader *r,
1510 const struct sfm_extension_record *record,
1511 struct sfm_read_info *info)
1513 struct text_record *text;
1515 text = open_text_record (r, record, true);
1516 info->product_ext = fix_line_ends (text_get_all (text));
1517 close_text_record (r, text);
1520 /* Parses record type 7, subtype 7 or 19. */
1522 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1523 size_t *allocated_mrsets)
1525 struct text_record *text;
1527 text = open_text_record (r, record, false);
1530 struct sfm_mrset *mrset;
1531 size_t allocated_vars;
1534 /* Skip extra line feeds if present. */
1535 while (text_match (text, '\n'))
1538 if (r->n_mrsets >= *allocated_mrsets)
1539 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1541 mrset = &r->mrsets[r->n_mrsets];
1542 memset(mrset, 0, sizeof *mrset);
1544 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1545 if (mrset->name == NULL)
1548 if (text_match (text, 'C'))
1550 mrset->type = MRSET_MC;
1551 if (!text_match (text, ' '))
1553 sys_warn (r, record->pos,
1554 _("Missing space following `%c' at offset %zu "
1555 "in MRSETS record."), 'C', text_pos (text));
1559 else if (text_match (text, 'D'))
1561 mrset->type = MRSET_MD;
1562 mrset->cat_source = MRSET_VARLABELS;
1564 else if (text_match (text, 'E'))
1568 mrset->type = MRSET_MD;
1569 mrset->cat_source = MRSET_COUNTEDVALUES;
1570 if (!text_match (text, ' '))
1572 sys_warn (r, record->pos,
1573 _("Missing space following `%c' at offset %zu "
1574 "in MRSETS record."), 'E', text_pos (text));
1578 number = text_get_token (text, ss_cstr (" "), NULL);
1579 if (!strcmp (number, "11"))
1580 mrset->label_from_var_label = true;
1581 else if (strcmp (number, "1"))
1582 sys_warn (r, record->pos,
1583 _("Unexpected label source value following `E' "
1584 "at offset %zu in MRSETS record."),
1589 sys_warn (r, record->pos,
1590 _("Missing `C', `D', or `E' at offset %zu "
1591 "in MRSETS record."),
1596 if (mrset->type == MRSET_MD)
1598 mrset->counted = text_parse_counted_string (r, text);
1599 if (mrset->counted == NULL)
1603 mrset->label = text_parse_counted_string (r, text);
1604 if (mrset->label == NULL)
1612 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1615 if (delimiter != '\n')
1616 sys_warn (r, record->pos,
1617 _("Missing new-line parsing variable names "
1618 "at offset %zu in MRSETS record."),
1623 if (mrset->n_vars >= allocated_vars)
1624 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1626 sizeof *mrset->vars);
1627 mrset->vars[mrset->n_vars++] = var;
1629 while (delimiter != '\n');
1633 close_text_record (r, text);
1637 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1639 const struct sfm_mrset *s;
1641 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1643 struct stringi_set var_names;
1644 struct mrset *mrset;
1649 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1652 sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
1659 mrset = xzalloc (sizeof *mrset);
1661 mrset->type = s->type;
1662 mrset->cat_source = s->cat_source;
1663 mrset->label_from_var_label = s->label_from_var_label;
1664 if (s->label[0] != '\0')
1665 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
1667 stringi_set_init (&var_names);
1668 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1670 for (i = 0; i < s->n_vars; i++)
1672 struct variable *var;
1675 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1677 var = dict_lookup_var (dict, var_name);
1683 if (!stringi_set_insert (&var_names, var_name))
1686 _("MRSET %s contains duplicate variable name %s."),
1687 mrset->name, var_name);
1693 if (mrset->label == NULL && mrset->label_from_var_label
1694 && var_has_label (var))
1695 mrset->label = xstrdup (var_get_label (var));
1698 && var_get_type (var) != var_get_type (mrset->vars[0]))
1701 _("MRSET %s contains both string and "
1702 "numeric variables."), mrset->name);
1705 width = MIN (width, var_get_width (var));
1707 mrset->vars[mrset->n_vars++] = var;
1710 if (mrset->n_vars < 2)
1712 if (mrset->n_vars == 0)
1713 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1715 sys_warn (r, -1, _("MRSET %s has only one variable."),
1717 mrset_destroy (mrset);
1718 stringi_set_destroy (&var_names);
1722 if (mrset->type == MRSET_MD)
1724 mrset->width = width;
1725 value_init (&mrset->counted, width);
1727 mrset->counted.f = c_strtod (s->counted, NULL);
1729 value_copy_str_rpad (&mrset->counted, width,
1730 (const uint8_t *) s->counted, ' ');
1733 dict_add_mrset (dict, mrset);
1734 stringi_set_destroy (&var_names);
1738 /* Read record type 7, subtype 11, which specifies how variables
1739 should be displayed in GUI environments. */
1741 parse_display_parameters (struct sfm_reader *r,
1742 const struct sfm_extension_record *record,
1743 struct dictionary *dict)
1745 bool includes_width;
1746 bool warned = false;
1751 n_vars = dict_get_var_cnt (dict);
1752 if (record->count == 3 * n_vars)
1753 includes_width = true;
1754 else if (record->count == 2 * n_vars)
1755 includes_width = false;
1758 sys_warn (r, record->pos,
1759 _("Extension 11 has bad count %zu (for %zu variables)."),
1760 record->count, n_vars);
1765 for (i = 0; i < n_vars; ++i)
1767 struct variable *v = dict_get_var (dict, i);
1768 int measure, width, align;
1770 measure = parse_int (r, record->data, ofs);
1775 width = parse_int (r, record->data, ofs);
1781 align = parse_int (r, record->data, ofs);
1784 /* SPSS sometimes seems to set variables' measure to zero. */
1788 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1791 sys_warn (r, record->pos,
1792 _("Invalid variable display parameters for variable "
1793 "%zu (%s). Default parameters substituted."),
1794 i, var_get_name (v));
1799 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1800 : measure == 2 ? MEASURE_ORDINAL
1802 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1803 : align == 1 ? ALIGN_RIGHT
1806 /* Older versions (SPSS 9.0) sometimes set the display
1807 width to zero. This causes confusion in the GUI, so
1808 only set the width if it is nonzero. */
1810 var_set_display_width (v, width);
1815 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1816 const char *new_name)
1818 size_t n_short_names;
1822 /* Renaming a variable may clear its short names, but we
1823 want to retain them, so we save them and re-set them
1825 n_short_names = var_get_short_name_cnt (var);
1826 short_names = xnmalloc (n_short_names, sizeof *short_names);
1827 for (i = 0; i < n_short_names; i++)
1829 const char *s = var_get_short_name (var, i);
1830 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1833 /* Set long name. */
1834 dict_rename_var (dict, var, new_name);
1836 /* Restore short names. */
1837 for (i = 0; i < n_short_names; i++)
1839 var_set_short_name (var, i, short_names[i]);
1840 free (short_names[i]);
1845 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1846 to each short name. Modifies variable names in DICT accordingly. */
1848 parse_long_var_name_map (struct sfm_reader *r,
1849 const struct sfm_extension_record *record,
1850 struct dictionary *dict)
1852 struct text_record *text;
1853 struct variable *var;
1858 /* There are no long variable names. Use the short variable names,
1859 converted to lowercase, as the long variable names. */
1862 for (i = 0; i < dict_get_var_cnt (dict); i++)
1864 struct variable *var = dict_get_var (dict, i);
1867 new_name = utf8_to_lower (var_get_name (var));
1868 rename_var_and_save_short_names (dict, var, new_name);
1875 /* Rename each of the variables, one by one. (In a correctly constructed
1876 system file, this cannot create any intermediate duplicate variable names,
1877 because all of the new variable names are longer than any of the old
1878 variable names and thus there cannot be any overlaps.) */
1879 text = open_text_record (r, record, true);
1880 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1882 /* Validate long name. */
1883 if (!dict_id_is_valid (dict, long_name, false))
1885 sys_warn (r, record->pos,
1886 _("Long variable mapping from %s to invalid "
1887 "variable name `%s'."),
1888 var_get_name (var), long_name);
1892 /* Identify any duplicates. */
1893 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1894 && dict_lookup_var (dict, long_name) != NULL)
1896 sys_warn (r, record->pos,
1897 _("Duplicate long variable name `%s'."), long_name);
1901 rename_var_and_save_short_names (dict, var, long_name);
1903 close_text_record (r, text);
1906 /* Reads record type 7, subtype 14, which gives the real length
1907 of each very long string. Rearranges DICT accordingly. */
1909 parse_long_string_map (struct sfm_reader *r,
1910 const struct sfm_extension_record *record,
1911 struct dictionary *dict)
1913 struct text_record *text;
1914 struct variable *var;
1917 text = open_text_record (r, record, true);
1918 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1920 size_t idx = var_get_dict_index (var);
1926 length = strtol (length_s, NULL, 10);
1927 if (length < 1 || length > MAX_STRING)
1929 sys_warn (r, record->pos,
1930 _("%s listed as string of invalid length %s "
1931 "in very long string record."),
1932 var_get_name (var), length_s);
1936 /* Check segments. */
1937 segment_cnt = sfm_width_to_segments (length);
1938 if (segment_cnt == 1)
1940 sys_warn (r, record->pos,
1941 _("%s listed in very long string record with width %s, "
1942 "which requires only one segment."),
1943 var_get_name (var), length_s);
1946 if (idx + segment_cnt > dict_get_var_cnt (dict))
1948 sys_error (r, record->pos,
1949 _("Very long string %s overflows dictionary."),
1950 var_get_name (var));
1954 /* Get the short names from the segments and check their
1956 for (i = 0; i < segment_cnt; i++)
1958 struct variable *seg = dict_get_var (dict, idx + i);
1959 int alloc_width = sfm_segment_alloc_width (length, i);
1960 int width = var_get_width (seg);
1963 var_set_short_name (var, i, var_get_short_name (seg, 0));
1964 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1966 sys_error (r, record->pos,
1967 _("Very long string with width %ld has segment %d "
1968 "of width %d (expected %d)."),
1969 length, i, width, alloc_width);
1973 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1974 var_set_width (var, length);
1976 close_text_record (r, text);
1977 dict_compact_values (dict);
1983 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1984 const struct sfm_var_record *var_recs, size_t n_var_recs,
1985 const struct sfm_value_label_record *record)
1987 struct variable **vars;
1991 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
1992 for (i = 0; i < record->n_labels; i++)
1993 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1994 record->labels[i].label, -1,
1997 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1998 for (i = 0; i < record->n_vars; i++)
2000 vars[i] = lookup_var_by_index (r, record->pos,
2001 var_recs, n_var_recs, record->vars[i]);
2002 if (vars[i] == NULL)
2006 for (i = 1; i < record->n_vars; i++)
2007 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2009 sys_error (r, record->pos,
2010 _("Variables associated with value label are not all of "
2011 "identical type. Variable %s is %s, but variable "
2013 var_get_name (vars[0]),
2014 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2015 var_get_name (vars[i]),
2016 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2020 for (i = 0; i < record->n_vars; i++)
2022 struct variable *var = vars[i];
2026 width = var_get_width (var);
2029 sys_error (r, record->pos,
2030 _("Value labels may not be added to long string "
2031 "variables (e.g. %s) using records types 3 and 4."),
2032 var_get_name (var));
2036 for (j = 0; j < record->n_labels; j++)
2038 struct sfm_value_label *label = &record->labels[j];
2041 value_init (&value, width);
2043 value.f = parse_float (r, label->value, 0);
2045 memcpy (value_str_rw (&value, width), label->value, width);
2047 if (!var_add_value_label (var, &value, utf8_labels[j]))
2049 if (var_is_numeric (var))
2050 sys_warn (r, record->pos,
2051 _("Duplicate value label for %g on %s."),
2052 value.f, var_get_name (var));
2054 sys_warn (r, record->pos,
2055 _("Duplicate value label for `%.*s' on %s."),
2056 width, value_str (&value, width),
2057 var_get_name (var));
2060 value_destroy (&value, width);
2064 pool_free (r->pool, vars);
2065 for (i = 0; i < record->n_labels; i++)
2066 pool_free (r->pool, utf8_labels[i]);
2067 pool_free (r->pool, utf8_labels);
2072 static struct variable *
2073 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2074 const struct sfm_var_record *var_recs, size_t n_var_recs,
2077 const struct sfm_var_record *rec;
2079 if (idx < 1 || idx > n_var_recs)
2081 sys_error (r, offset,
2082 _("Variable index %d not in valid range 1...%zu."),
2087 rec = &var_recs[idx - 1];
2088 if (rec->var == NULL)
2090 sys_error (r, offset,
2091 _("Variable index %d refers to long string continuation."),
2099 /* Parses a set of custom attributes from TEXT into ATTRS.
2100 ATTRS may be a null pointer, in which case the attributes are
2101 read but discarded. */
2103 parse_attributes (struct sfm_reader *r, struct text_record *text,
2104 struct attrset *attrs)
2108 struct attribute *attr;
2112 /* Parse the key. */
2113 key = text_get_token (text, ss_cstr ("("), NULL);
2117 attr = attribute_create (key);
2118 for (index = 1; ; index++)
2120 /* Parse the value. */
2124 value = text_get_token (text, ss_cstr ("\n"), NULL);
2127 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2132 length = strlen (value);
2133 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
2135 value[length - 1] = '\0';
2136 attribute_add_value (attr, value + 1);
2141 _("Attribute value %s[%d] is not quoted: %s."),
2143 attribute_add_value (attr, value);
2146 /* Was this the last value for this attribute? */
2147 if (text_match (text, ')'))
2151 attrset_add (attrs, attr);
2153 attribute_destroy (attr);
2155 while (!text_match (text, '/'));
2158 /* Reads record type 7, subtype 17, which lists custom
2159 attributes on the data file. */
2161 parse_data_file_attributes (struct sfm_reader *r,
2162 const struct sfm_extension_record *record,
2163 struct dictionary *dict)
2165 struct text_record *text = open_text_record (r, record, true);
2166 parse_attributes (r, text, dict_get_attributes (dict));
2167 close_text_record (r, text);
2170 /* Parses record type 7, subtype 18, which lists custom
2171 attributes on individual variables. */
2173 parse_variable_attributes (struct sfm_reader *r,
2174 const struct sfm_extension_record *record,
2175 struct dictionary *dict)
2177 struct text_record *text;
2178 struct variable *var;
2180 text = open_text_record (r, record, true);
2181 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2182 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2183 close_text_record (r, text);
2187 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2189 size_t n_warnings = 0;
2192 for (i = 0; i < dict_get_var_cnt (dict); i++)
2194 struct variable *var = dict_get_var (dict, i);
2195 struct attrset *attrs = var_get_attributes (var);
2196 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2199 int value = atoi (attribute_get_value (attr, 0));
2221 role = ROLE_PARTITION;
2230 if (n_warnings++ == 0)
2231 sys_warn (r, -1, _("Invalid role for variable %s."),
2232 var_get_name (var));
2235 var_set_role (var, role);
2240 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2245 check_overflow (struct sfm_reader *r,
2246 const struct sfm_extension_record *record,
2247 size_t ofs, size_t length)
2249 size_t end = record->size * record->count;
2250 if (length >= end || ofs + length > end)
2252 sys_error (r, record->pos + end,
2253 _("Extension record subtype %d ends unexpectedly."),
2261 parse_long_string_value_labels (struct sfm_reader *r,
2262 const struct sfm_extension_record *record,
2263 struct dictionary *dict)
2265 const char *dict_encoding = dict_get_encoding (dict);
2266 size_t end = record->size * record->count;
2273 struct variable *var;
2278 /* Parse variable name length. */
2279 if (!check_overflow (r, record, ofs, 4))
2281 var_name_len = parse_int (r, record->data, ofs);
2284 /* Parse variable name, width, and number of labels. */
2285 if (!check_overflow (r, record, ofs, var_name_len + 8))
2287 var_name = recode_string_pool ("UTF-8", dict_encoding,
2288 (const char *) record->data + ofs,
2289 var_name_len, r->pool);
2290 width = parse_int (r, record->data, ofs + var_name_len);
2291 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2292 ofs += var_name_len + 8;
2294 /* Look up 'var' and validate. */
2295 var = dict_lookup_var (dict, var_name);
2297 sys_warn (r, record->pos + ofs,
2298 _("Ignoring long string value label record for "
2299 "unknown variable %s."), var_name);
2300 else if (var_is_numeric (var))
2302 sys_warn (r, record->pos + ofs,
2303 _("Ignoring long string value label record for "
2304 "numeric variable %s."), var_name);
2307 else if (width != var_get_width (var))
2309 sys_warn (r, record->pos + ofs,
2310 _("Ignoring long string value label record for variable "
2311 "%s because the record's width (%d) does not match the "
2312 "variable's width (%d)."),
2313 var_name, width, var_get_width (var));
2318 value_init_pool (r->pool, &value, width);
2319 for (i = 0; i < n_labels; i++)
2321 size_t value_length, label_length;
2322 bool skip = var == NULL;
2324 /* Parse value length. */
2325 if (!check_overflow (r, record, ofs, 4))
2327 value_length = parse_int (r, record->data, ofs);
2331 if (!check_overflow (r, record, ofs, value_length))
2335 if (value_length == width)
2336 memcpy (value_str_rw (&value, width),
2337 (const uint8_t *) record->data + ofs, width);
2340 sys_warn (r, record->pos + ofs,
2341 _("Ignoring long string value label %zu for "
2342 "variable %s, with width %d, that has bad value "
2344 i, var_get_name (var), width, value_length);
2348 ofs += value_length;
2350 /* Parse label length. */
2351 if (!check_overflow (r, record, ofs, 4))
2353 label_length = parse_int (r, record->data, ofs);
2357 if (!check_overflow (r, record, ofs, label_length))
2363 label = recode_string_pool ("UTF-8", dict_encoding,
2364 (const char *) record->data + ofs,
2365 label_length, r->pool);
2366 if (!var_add_value_label (var, &value, label))
2367 sys_warn (r, record->pos + ofs,
2368 _("Duplicate value label for `%.*s' on %s."),
2369 width, value_str (&value, width),
2370 var_get_name (var));
2371 pool_free (r->pool, label);
2373 ofs += label_length;
2381 parse_long_string_missing_values (struct sfm_reader *r,
2382 const struct sfm_extension_record *record,
2383 struct dictionary *dict)
2385 const char *dict_encoding = dict_get_encoding (dict);
2386 size_t end = record->size * record->count;
2391 struct missing_values mv;
2393 struct variable *var;
2394 int n_missing_values;
2398 /* Parse variable name length. */
2399 if (!check_overflow (r, record, ofs, 4))
2401 var_name_len = parse_int (r, record->data, ofs);
2404 /* Parse variable name. */
2405 if (!check_overflow (r, record, ofs, var_name_len + 1))
2407 var_name = recode_string_pool ("UTF-8", dict_encoding,
2408 (const char *) record->data + ofs,
2409 var_name_len, r->pool);
2410 ofs += var_name_len;
2412 /* Parse number of missing values. */
2413 n_missing_values = ((const uint8_t *) record->data)[ofs];
2414 if (n_missing_values < 1 || n_missing_values > 3)
2415 sys_warn (r, record->pos + ofs,
2416 _("Long string missing values record says variable %s "
2417 "has %d missing values, but only 1 to 3 missing values "
2419 var_name, n_missing_values);
2422 /* Look up 'var' and validate. */
2423 var = dict_lookup_var (dict, var_name);
2425 sys_warn (r, record->pos + ofs,
2426 _("Ignoring long string missing value record for "
2427 "unknown variable %s."), var_name);
2428 else if (var_is_numeric (var))
2430 sys_warn (r, record->pos + ofs,
2431 _("Ignoring long string missing value record for "
2432 "numeric variable %s."), var_name);
2437 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2438 for (i = 0; i < n_missing_values; i++)
2440 size_t value_length;
2442 /* Parse value length. */
2443 if (!check_overflow (r, record, ofs, 4))
2445 value_length = parse_int (r, record->data, ofs);
2449 if (!check_overflow (r, record, ofs, value_length))
2453 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2455 sys_warn (r, record->pos + ofs,
2456 _("Ignoring long string missing value %zu for variable "
2457 "%s, with width %d, that has bad value width %zu."),
2458 i, var_get_name (var), var_get_width (var),
2460 ofs += value_length;
2463 var_set_missing_values (var, &mv);
2471 static void partial_record (struct sfm_reader *);
2473 static void read_error (struct casereader *, const struct sfm_reader *);
2475 static bool read_case_number (struct sfm_reader *, double *);
2476 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2477 static int read_opcode (struct sfm_reader *);
2478 static bool read_compressed_number (struct sfm_reader *, double *);
2479 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2480 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2481 static bool skip_whole_strings (struct sfm_reader *, size_t);
2483 /* Reads and returns one case from READER's file. Returns a null
2484 pointer if not successful. */
2485 static struct ccase *
2486 sys_file_casereader_read (struct casereader *reader, void *r_)
2488 struct sfm_reader *r = r_;
2496 c = case_create (r->proto);
2498 for (i = 0; i < r->sfm_var_cnt; i++)
2500 struct sfm_var *sv = &r->sfm_vars[i];
2501 union value *v = case_data_rw_idx (c, sv->case_index);
2503 if (sv->var_width == 0)
2504 retval = read_case_number (r, &v->f);
2507 uint8_t *s = value_str_rw (v, sv->var_width);
2508 retval = read_case_string (r, s + sv->offset, sv->segment_width);
2511 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2513 sys_error (r, r->pos, _("File ends in partial string value."));
2525 if (r->case_cnt != -1)
2526 read_error (reader, r);
2531 /* Issues an error that R ends in a partial record. */
2533 partial_record (struct sfm_reader *r)
2535 sys_error (r, r->pos, _("File ends in partial case."));
2538 /* Issues an error that an unspecified error occurred SFM, and
2541 read_error (struct casereader *r, const struct sfm_reader *sfm)
2543 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2544 casereader_force_error (r);
2547 /* Reads a number from R and stores its value in *D.
2548 If R is compressed, reads a compressed number;
2549 otherwise, reads a number in the regular way.
2550 Returns true if successful, false if end of file is
2551 reached immediately. */
2553 read_case_number (struct sfm_reader *r, double *d)
2555 if (r->compression == SFM_COMP_NONE)
2558 if (!try_read_bytes (r, number, sizeof number))
2560 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2564 return read_compressed_number (r, d);
2567 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2568 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2569 discarded without being written to S. Reads compressed strings if S is
2570 compressed. Returns 1 if successful, 0 if end of file is reached
2571 immediately, or -1 for some kind of error. */
2573 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2575 size_t whole = ROUND_DOWN (length, 8);
2576 size_t partial = length % 8;
2580 int retval = read_whole_strings (r, s, whole);
2588 int retval = read_whole_strings (r, bounce, sizeof bounce);
2600 memcpy (s + whole, bounce, partial);
2606 /* Reads and returns the next compression opcode from R. */
2608 read_opcode (struct sfm_reader *r)
2610 assert (r->compression != SFM_COMP_NONE);
2614 if (r->opcode_idx >= sizeof r->opcodes)
2617 int retval = try_read_compressed_bytes (r, r->opcodes,
2623 opcode = r->opcodes[r->opcode_idx++];
2630 /* Reads a compressed number from R and stores its value in D.
2631 Returns true if successful, false if end of file is
2632 reached immediately. */
2634 read_compressed_number (struct sfm_reader *r, double *d)
2636 int opcode = read_opcode (r);
2644 return read_compressed_float (r, d);
2647 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2648 if (!r->corruption_warning)
2650 r->corruption_warning = true;
2651 sys_warn (r, r->pos,
2652 _("Possible compressed data corruption: "
2653 "compressed spaces appear in numeric field."));
2662 *d = opcode - r->bias;
2669 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2671 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2676 opcode = read_opcode (r);
2684 retval = read_compressed_bytes (r, dst, 8);
2685 return retval == 1 ? 1 : -1;
2688 memset (dst, ' ', 8);
2693 double value = opcode - r->bias;
2694 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2697 /* This has actually been seen "in the wild". The submitter of the
2698 file that showed that the contents decoded as spaces, but they
2699 were at the end of the field so it's possible that the null
2700 bytes just acted as null terminators. */
2702 else if (!r->corruption_warning)
2704 r->corruption_warning = true;
2705 sys_warn (r, r->pos,
2706 _("Possible compressed data corruption: "
2707 "string contains compressed integer (opcode %d)."),
2715 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2716 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2717 end of file is reached immediately, or -1 for some kind of error. */
2719 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2721 assert (length % 8 == 0);
2722 if (r->compression == SFM_COMP_NONE)
2723 return try_read_bytes (r, s, length);
2728 for (ofs = 0; ofs < length; ofs += 8)
2730 int retval = read_compressed_string (r, s + ofs);
2745 /* Skips LENGTH string bytes from R.
2746 LENGTH must be a multiple of 8.
2747 (LENGTH is also limited to 1024, but that's only because the
2748 current caller never needs more than that many bytes.)
2749 Returns true if successful, false if end of file is
2750 reached immediately. */
2752 skip_whole_strings (struct sfm_reader *r, size_t length)
2754 uint8_t buffer[1024];
2755 assert (length < sizeof buffer);
2756 return read_whole_strings (r, buffer, length);
2759 /* Helpers for reading records that contain structured text
2762 /* Maximum number of warnings to issue for a single text
2764 #define MAX_TEXT_WARNINGS 5
2769 struct substring buffer; /* Record contents. */
2770 off_t start; /* Starting offset in file. */
2771 size_t pos; /* Current position in buffer. */
2772 int n_warnings; /* Number of warnings issued or suppressed. */
2773 bool recoded; /* Recoded into UTF-8? */
2776 static struct text_record *
2777 open_text_record (struct sfm_reader *r,
2778 const struct sfm_extension_record *record,
2779 bool recode_to_utf8)
2781 struct text_record *text;
2782 struct substring raw;
2784 text = pool_alloc (r->pool, sizeof *text);
2785 raw = ss_buffer (record->data, record->size * record->count);
2786 text->start = record->pos;
2787 text->buffer = (recode_to_utf8
2788 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2791 text->n_warnings = 0;
2792 text->recoded = recode_to_utf8;
2797 /* Closes TEXT, frees its storage, and issues a final warning
2798 about suppressed warnings if necesary. */
2800 close_text_record (struct sfm_reader *r, struct text_record *text)
2802 if (text->n_warnings > MAX_TEXT_WARNINGS)
2803 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2804 text->n_warnings - MAX_TEXT_WARNINGS);
2806 pool_free (r->pool, ss_data (text->buffer));
2809 /* Reads a variable=value pair from TEXT.
2810 Looks up the variable in DICT and stores it into *VAR.
2811 Stores a null-terminated value into *VALUE. */
2813 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2814 struct text_record *text,
2815 struct variable **var, char **value)
2819 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2822 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2826 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2827 ss_buffer ("\t\0", 2));
2835 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2836 struct text_record *text, struct substring delimiters,
2837 struct variable **var)
2841 name = text_get_token (text, delimiters, NULL);
2845 *var = dict_lookup_var (dict, name);
2849 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2856 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2857 struct text_record *text, struct substring delimiters,
2858 struct variable **var)
2860 char *short_name = text_get_token (text, delimiters, NULL);
2861 if (short_name == NULL)
2864 *var = dict_lookup_var (dict, short_name);
2866 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2871 /* Displays a warning for the current file position, limiting the
2872 number to MAX_TEXT_WARNINGS for TEXT. */
2874 text_warn (struct sfm_reader *r, struct text_record *text,
2875 const char *format, ...)
2877 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2881 va_start (args, format);
2882 sys_msg (r, text->start + text->pos, MW, format, args);
2888 text_get_token (struct text_record *text, struct substring delimiters,
2891 struct substring token;
2894 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2897 end = &ss_data (token)[ss_length (token)];
2898 if (delimiter != NULL)
2901 return ss_data (token);
2904 /* Reads a integer value expressed in decimal, then a space, then a string that
2905 consists of exactly as many bytes as specified by the integer, then a space,
2906 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2907 buffer (so the caller should not free the string). */
2909 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2917 while (text->pos < text->buffer.length)
2919 int c = text->buffer.string[text->pos];
2920 if (c < '0' || c > '9')
2922 n = (n * 10) + (c - '0');
2925 if (text->pos >= text->buffer.length || start == text->pos)
2927 sys_warn (r, text->start,
2928 _("Expecting digit at offset %zu in MRSETS record."),
2933 if (!text_match (text, ' '))
2935 sys_warn (r, text->start,
2936 _("Expecting space at offset %zu in MRSETS record."),
2941 if (text->pos + n > text->buffer.length)
2943 sys_warn (r, text->start,
2944 _("%zu-byte string starting at offset %zu "
2945 "exceeds record length %zu."),
2946 n, text->pos, text->buffer.length);
2950 s = &text->buffer.string[text->pos];
2953 sys_warn (r, text->start,
2954 _("Expecting space at offset %zu following %zu-byte string."),
2964 text_match (struct text_record *text, char c)
2966 if (text->buffer.string[text->pos] == c)
2975 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2976 inside the TEXT's string. */
2978 text_pos (const struct text_record *text)
2984 text_get_all (const struct text_record *text)
2986 return text->buffer.string;
2991 /* Displays a corruption message. */
2993 sys_msg (struct sfm_reader *r, off_t offset,
2994 int class, const char *format, va_list args)
2999 ds_init_empty (&text);
3001 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3002 fh_get_file_name (r->fh), (long long int) offset);
3004 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3005 ds_put_vformat (&text, format, args);
3007 m.category = msg_class_to_category (class);
3008 m.severity = msg_class_to_severity (class);
3014 m.text = ds_cstr (&text);
3019 /* Displays a warning for offset OFFSET in the file. */
3021 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3025 va_start (args, format);
3026 sys_msg (r, offset, MW, format, args);
3030 /* Displays an error for the current file position,
3031 marks it as in an error state,
3032 and aborts reading it using longjmp. */
3034 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3038 va_start (args, format);
3039 sys_msg (r, offset, ME, format, args);
3045 /* Reads BYTE_CNT bytes into BUF.
3046 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3047 Returns -1 if an I/O error or a partial read occurs.
3048 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3051 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3052 void *buf, size_t byte_cnt)
3054 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3055 r->pos += bytes_read;
3056 if (bytes_read == byte_cnt)
3058 else if (ferror (r->file))
3060 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3063 else if (!eof_is_ok || bytes_read != 0)
3065 sys_error (r, r->pos, _("Unexpected end of file."));
3072 /* Reads BYTE_CNT into BUF.
3073 Returns true if successful.
3074 Returns false upon I/O error or if end-of-file is encountered. */
3076 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3078 return read_bytes_internal (r, false, buf, byte_cnt) == 1;
3081 /* Reads BYTE_CNT bytes into BUF.
3082 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3083 Returns 0 if an immediate end-of-file is encountered.
3084 Returns -1 if an I/O error or a partial read occurs. */
3086 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3088 return read_bytes_internal (r, true, buf, byte_cnt);
3091 /* Reads a 32-bit signed integer from R and stores its value in host format in
3092 *X. Returns true if successful, otherwise false. */
3094 read_int (struct sfm_reader *r, int *x)
3097 if (read_bytes (r, integer, sizeof integer) != 1)
3099 *x = integer_get (r->integer_format, integer, sizeof integer);
3104 read_uint (struct sfm_reader *r, unsigned int *x)
3109 ok = read_int (r, &y);
3114 /* Reads a 64-bit signed integer from R and returns its value in
3117 read_int64 (struct sfm_reader *r, long long int *x)
3120 if (read_bytes (r, integer, sizeof integer) != 1)
3122 *x = integer_get (r->integer_format, integer, sizeof integer);
3126 /* Reads a 64-bit signed integer from R and returns its value in
3129 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3134 ok = read_int64 (r, &y);
3140 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3142 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3146 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3148 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3151 /* Reads exactly SIZE - 1 bytes into BUFFER
3152 and stores a null byte into BUFFER[SIZE - 1]. */
3154 read_string (struct sfm_reader *r, char *buffer, size_t size)
3159 ok = read_bytes (r, buffer, size - 1);
3161 buffer[size - 1] = '\0';
3165 /* Skips BYTES bytes forward in R. */
3167 skip_bytes (struct sfm_reader *r, size_t bytes)
3172 size_t chunk = MIN (sizeof buffer, bytes);
3173 if (!read_bytes (r, buffer, chunk))
3181 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3182 been replaced by LFs.
3184 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3185 files that use CR-only line ends in the file label and extra product
3188 fix_line_ends (const char *s)
3192 d = dst = xmalloc (strlen (s) + 1);
3211 read_ztrailer (struct sfm_reader *r,
3212 long long int zheader_ofs,
3213 long long int ztrailer_len);
3216 zalloc (voidpf pool_, uInt items, uInt size)
3218 struct pool *pool = pool_;
3220 return (!size || xalloc_oversized (items, size)
3222 : pool_malloc (pool, items * size));
3226 zfree (voidpf pool_, voidpf address)
3228 struct pool *pool = pool_;
3230 pool_free (pool, address);
3234 read_zheader (struct sfm_reader *r)
3237 long long int zheader_ofs;
3238 long long int ztrailer_ofs;
3239 long long int ztrailer_len;
3241 if (!read_int64 (r, &zheader_ofs)
3242 || !read_int64 (r, &ztrailer_ofs)
3243 || !read_int64 (r, &ztrailer_len))
3246 if (zheader_ofs != pos)
3248 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3249 "(expected %#llx)."),
3250 zheader_ofs, (long long int) pos);
3254 if (ztrailer_ofs < r->pos)
3256 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3261 if (ztrailer_len < 24 || ztrailer_len % 24)
3263 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3267 r->ztrailer_ofs = ztrailer_ofs;
3268 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3271 if (r->zin_buf == NULL)
3273 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3274 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3275 r->zstream.next_in = NULL;
3276 r->zstream.avail_in = 0;
3279 r->zstream.zalloc = zalloc;
3280 r->zstream.zfree = zfree;
3281 r->zstream.opaque = r->pool;
3283 return open_zstream (r);
3287 seek (struct sfm_reader *r, off_t offset)
3289 if (fseeko (r->file, offset, SEEK_SET))
3290 sys_error (r, 0, _("%s: seek failed (%s)."),
3291 fh_get_file_name (r->fh), strerror (errno));
3295 /* Performs some additional consistency checks on the ZLIB compressed data
3298 read_ztrailer (struct sfm_reader *r,
3299 long long int zheader_ofs,
3300 long long int ztrailer_len)
3302 long long int expected_uncmp_ofs;
3303 long long int expected_cmp_ofs;
3306 unsigned int block_size;
3307 unsigned int n_blocks;
3311 if (fstat (fileno (r->file), &s))
3313 sys_error (ME, 0, _("%s: stat failed (%s)."),
3314 fh_get_file_name (r->fh), strerror (errno));
3318 if (!S_ISREG (s.st_mode))
3320 /* We can't seek to the trailer and then back to the data in this file,
3321 so skip doing extra checks. */
3325 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3326 sys_warn (r, r->pos,
3327 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3328 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3330 seek (r, r->ztrailer_ofs);
3332 /* Read fixed header from ZLIB data trailer. */
3333 if (!read_int64 (r, &bias))
3335 if (-bias != r->bias)
3337 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3338 "file header bias (%.2f)."),
3343 if (!read_int64 (r, &zero))
3346 sys_warn (r, r->pos,
3347 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3349 if (!read_uint (r, &block_size))
3351 if (block_size != ZBLOCK_SIZE)
3352 sys_warn (r, r->pos,
3353 _("ZLIB trailer specifies unexpected %u-byte block size."),
3356 if (!read_uint (r, &n_blocks))
3358 if (n_blocks != (ztrailer_len - 24) / 24)
3360 sys_error (r, r->pos,
3361 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3363 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3367 expected_uncmp_ofs = zheader_ofs;
3368 expected_cmp_ofs = zheader_ofs + 24;
3369 for (i = 0; i < n_blocks; i++)
3371 off_t desc_ofs = r->pos;
3372 unsigned long long int uncompressed_ofs;
3373 unsigned long long int compressed_ofs;
3374 unsigned int uncompressed_size;
3375 unsigned int compressed_size;
3377 if (!read_uint64 (r, &uncompressed_ofs)
3378 || !read_uint64 (r, &compressed_ofs)
3379 || !read_uint (r, &uncompressed_size)
3380 || !read_uint (r, &compressed_size))
3383 if (uncompressed_ofs != expected_uncmp_ofs)
3385 sys_error (r, desc_ofs,
3386 _("ZLIB block descriptor %u reported uncompressed data "
3387 "offset %#llx, when %#llx was expected."),
3388 i, uncompressed_ofs, expected_uncmp_ofs);
3392 if (compressed_ofs != expected_cmp_ofs)
3394 sys_error (r, desc_ofs,
3395 _("ZLIB block descriptor %u reported compressed data "
3396 "offset %#llx, when %#llx was expected."),
3397 i, compressed_ofs, expected_cmp_ofs);
3401 if (i < n_blocks - 1)
3403 if (uncompressed_size != block_size)
3404 sys_warn (r, desc_ofs,
3405 _("ZLIB block descriptor %u reported block size %#x, "
3406 "when %#x was expected."),
3407 i, uncompressed_size, block_size);
3411 if (uncompressed_size > block_size)
3412 sys_warn (r, desc_ofs,
3413 _("ZLIB block descriptor %u reported block size %#x, "
3414 "when at most %#x was expected."),
3415 i, uncompressed_size, block_size);
3418 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3419 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3420 This code checks for an expansion of more than 14.3% plus 11
3422 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3424 sys_error (r, desc_ofs,
3425 _("ZLIB block descriptor %u reports compressed size %u "
3426 "and uncompressed size %u."),
3427 i, compressed_size, uncompressed_size);
3431 expected_uncmp_ofs += uncompressed_size;
3432 expected_cmp_ofs += compressed_size;
3435 if (expected_cmp_ofs != r->ztrailer_ofs)
3437 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3438 "would be expected from block descriptors."),
3439 r->ztrailer_ofs, expected_cmp_ofs);
3443 seek (r, zheader_ofs + 24);
3448 open_zstream (struct sfm_reader *r)
3452 r->zout_pos = r->zout_end = 0;
3453 error = inflateInit (&r->zstream);
3456 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3464 close_zstream (struct sfm_reader *r)
3468 error = inflateEnd (&r->zstream);
3471 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3479 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3481 uint8_t *buf = buf_;
3490 /* Use already inflated data if there is any. */
3491 if (r->zout_pos < r->zout_end)
3493 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3494 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3503 /* We need to inflate some more data.
3504 Get some more input data if we don't have any. */
3505 if (r->zstream.avail_in == 0)
3507 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3512 int retval = try_read_bytes (r, r->zin_buf, n);
3515 r->zstream.avail_in = n;
3516 r->zstream.next_in = r->zin_buf;
3520 /* Inflate the (remaining) input data. */
3521 r->zstream.avail_out = ZOUT_BUF_SIZE;
3522 r->zstream.next_out = r->zout_buf;
3523 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3525 r->zout_end = r->zstream.next_out - r->zout_buf;
3526 if (r->zout_end == 0)
3528 if (error != Z_STREAM_END)
3530 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3534 else if (!close_zstream (r) || !open_zstream (r))
3539 /* Process the output data and ignore 'error' for now. ZLIB will
3540 present it to us again on the next inflate() call. */
3546 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3548 if (r->compression == SFM_COMP_SIMPLE)
3549 return read_bytes (r, buf, byte_cnt);
3552 int retval = read_bytes_zlib (r, buf, byte_cnt);
3554 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3560 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3562 if (r->compression == SFM_COMP_SIMPLE)
3563 return try_read_bytes (r, buf, byte_cnt);
3565 return read_bytes_zlib (r, buf, byte_cnt);
3568 /* Reads a 64-bit floating-point number from R and returns its
3569 value in host format. */
3571 read_compressed_float (struct sfm_reader *r, double *d)
3575 if (!read_compressed_bytes (r, number, sizeof number))
3578 *d = float_get_double (r->float_format, number);
3582 static const struct casereader_class sys_file_casereader_class =
3584 sys_file_casereader_read,
3585 sys_file_casereader_destroy,