1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
30 #include "data/attributes.h"
31 #include "data/case.h"
32 #include "data/casereader-provider.h"
33 #include "data/casereader.h"
34 #include "data/dictionary.h"
35 #include "data/file-handle-def.h"
36 #include "data/file-name.h"
37 #include "data/format.h"
38 #include "data/identifier.h"
39 #include "data/missing-values.h"
40 #include "data/mrset.h"
41 #include "data/short-names.h"
42 #include "data/value-labels.h"
43 #include "data/value.h"
44 #include "data/variable.h"
45 #include "libpspp/array.h"
46 #include "libpspp/assertion.h"
47 #include "libpspp/compiler.h"
48 #include "libpspp/i18n.h"
49 #include "libpspp/message.h"
50 #include "libpspp/misc.h"
51 #include "libpspp/pool.h"
52 #include "libpspp/str.h"
53 #include "libpspp/stringi-set.h"
55 #include "gl/c-strtod.h"
56 #include "gl/c-ctype.h"
57 #include "gl/inttostr.h"
58 #include "gl/localcharset.h"
59 #include "gl/minmax.h"
60 #include "gl/unlocked-io.h"
61 #include "gl/xalloc.h"
62 #include "gl/xalloc-oversized.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) (msgid)
/* Subtypes of extension (type 7) records. sfm_open_reader() uses these
   values as indexes into its extensions[] array. */
71 /* subtypes 0-2 unknown */
72 EXT_INTEGER = 3, /* Machine integer info. */
73 EXT_FLOAT = 4, /* Machine floating-point info. */
74 EXT_VAR_SETS = 5, /* Variable sets. */
75 EXT_DATE = 6, /* DATE. */
76 EXT_MRSETS = 7, /* Multiple response sets. */
77 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
78 /* subtype 9 unknown */
79 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
80 EXT_DISPLAY = 11, /* Variable display parameters. */
81 /* subtype 12 unknown */
82 EXT_LONG_NAMES = 13, /* Long variable names. */
83 EXT_LONG_STRINGS = 14, /* Long strings. */
84 /* subtype 15 unknown */
85 EXT_NCASES = 16, /* Extended number of cases. */
86 EXT_FILE_ATTRS = 17, /* Data file attributes. */
87 EXT_VAR_ATTRS = 18, /* Variable attributes. */
88 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
89 EXT_ENCODING = 20, /* Character encoding. */
90 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
91 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
/* subtype 23 is not given a name here */
92 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
95 /* Fields from the top-level header record. read_header() fills these in
   and parse_header() later interprets the string members. */
96 struct sfm_header_record
98 char magic[5]; /* First 4 bytes of file, then null. */
99 int weight_idx; /* 0 if unweighted, otherwise a var index. */
100 int nominal_case_size; /* Number of var positions; read_header() stores -1
   when the on-disk value is out of range. */
102 /* These correspond to the members of struct sfm_read_info or a dictionary
103 but in the system file's encoding rather than UTF-8 (parse_header()
   recodes them to UTF-8). */
104 char creation_date[10]; /* "dd mmm yy". */
105 char creation_time[9]; /* "hh:mm:ss". */
106 char eye_catcher[61]; /* Eye-catcher string, then product name. */
107 char file_label[65]; /* File label. */
110 struct sfm_var_record
117 int missing_value_code;
120 struct variable *var;
123 struct sfm_value_label
129 struct sfm_value_label_record
132 struct sfm_value_label *labels;
139 struct sfm_document_record
/* An extension (type 7) record. read_extension_record_header() sets
   SUBTYPE, POS, SIZE, and COUNT; read_extension_record() sets DATA for the
   records whose contents it reads. */
146 struct sfm_extension_record
148 int subtype; /* Record subtype. */
149 off_t pos; /* Starting offset in file. */
150 size_t size; /* Size of each data element, in bytes. */
151 size_t count; /* Number of data elements. */
152 void *data; /* Contents: COUNT * SIZE bytes followed by a null byte. */
155 /* System file reader. */
158 /* Resource tracking. */
159 struct pool *pool; /* All system file state. */
160 jmp_buf bail_out; /* longjmp() target for error handling. */
163 struct file_handle *fh; /* File handle. */
164 struct fh_lock *lock; /* Mutual exclusion for file handle. */
165 FILE *file; /* File stream. */
166 off_t pos; /* Position in file. */
167 bool error; /* I/O or corruption error? */
168 struct caseproto *proto; /* Format of output cases. */
171 enum integer_format integer_format; /* On-disk integer format. */
172 enum float_format float_format; /* On-disk floating point format. */
173 struct sfm_var *sfm_vars; /* Variables, built by sfm_dictionary_to_sfm_vars()
   and registered with POOL for freeing. */
174 size_t sfm_var_cnt; /* Number of variables. */
175 casenumber case_cnt; /* Number of cases; -1 is mapped to CASENUMBER_MAX
   when the casereader is created. */
176 const char *encoding; /* String encoding; sfm_open_reader() points this at
   the dictionary's encoding string. */
179 enum sfm_compression compression; /* Compression type, set by read_header(). */
180 double bias; /* Compression bias, usually 100.0. */
181 uint8_t opcodes[8]; /* Current block of opcodes. */
182 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
183 bool corruption_warning; /* Warned about possible corruption? */
185 /* ZLIB decompression. */
186 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
187 #define ZIN_BUF_SIZE 4096
188 uint8_t *zin_buf; /* Inflation input buffer. */
189 #define ZOUT_BUF_SIZE 16384
190 uint8_t *zout_buf; /* Inflation output buffer. */
191 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
192 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
193 z_stream zstream; /* ZLIB inflater. */
196 static const struct casereader_class sys_file_casereader_class;
198 static bool close_reader (struct sfm_reader *);
200 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
201 const struct sfm_var_record *,
204 static void sys_msg (struct sfm_reader *r, off_t, int class,
205 const char *format, va_list args)
206 PRINTF_FORMAT (4, 0);
207 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
208 PRINTF_FORMAT (3, 4);
209 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
213 static void read_bytes (struct sfm_reader *, void *, size_t);
214 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
215 static int read_int (struct sfm_reader *);
216 static long long int read_int64 (struct sfm_reader *);
217 static void read_string (struct sfm_reader *, char *, size_t);
218 static void skip_bytes (struct sfm_reader *, size_t);
220 /* ZLIB compressed data handling. */
221 static void read_zheader (struct sfm_reader *);
222 static void open_zstream (struct sfm_reader *);
223 static void close_zstream (struct sfm_reader *);
224 static bool read_bytes_zlib (struct sfm_reader *, void *, size_t);
225 static void read_compressed_bytes (struct sfm_reader *, void *, size_t);
226 static bool try_read_compressed_bytes (struct sfm_reader *, void *, size_t);
227 static double read_compressed_float (struct sfm_reader *);
229 static char *fix_line_ends (const char *);
231 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
232 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
234 static void read_variable_record (struct sfm_reader *,
235 struct sfm_var_record *);
236 static void read_value_label_record (struct sfm_reader *,
237 struct sfm_value_label_record *,
239 static struct sfm_document_record *read_document_record (struct sfm_reader *);
240 static struct sfm_extension_record *read_extension_record (
241 struct sfm_reader *, int subtype);
242 static void skip_extension_record (struct sfm_reader *, int subtype);
244 static const char *choose_encoding (
246 const struct sfm_header_record *,
247 const struct sfm_extension_record *ext_integer,
248 const struct sfm_extension_record *ext_encoding);
250 static struct text_record *open_text_record (
251 struct sfm_reader *, const struct sfm_extension_record *,
252 bool recode_to_utf8);
253 static void close_text_record (struct sfm_reader *,
254 struct text_record *);
255 static bool read_variable_to_value_pair (struct sfm_reader *,
257 struct text_record *,
258 struct variable **var, char **value);
259 static void text_warn (struct sfm_reader *r, struct text_record *text,
260 const char *format, ...)
261 PRINTF_FORMAT (3, 4);
262 static char *text_get_token (struct text_record *,
263 struct substring delimiters, char *delimiter);
264 static bool text_match (struct text_record *, char c);
265 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
266 struct text_record *,
267 struct substring delimiters,
269 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
270 struct text_record *,
271 struct substring delimiters,
273 static const char *text_parse_counted_string (struct sfm_reader *,
274 struct text_record *);
275 static size_t text_pos (const struct text_record *);
276 static const char *text_get_all (const struct text_record *);
278 static bool close_reader (struct sfm_reader *r);
280 /* Dictionary reader. */
288 static void read_header (struct sfm_reader *, struct sfm_read_info *,
289 struct sfm_header_record *);
290 static void parse_header (struct sfm_reader *,
291 const struct sfm_header_record *,
292 struct sfm_read_info *, struct dictionary *);
293 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
294 struct sfm_var_record *, size_t n);
295 static void parse_format_spec (struct sfm_reader *, off_t pos,
296 unsigned int format, enum which_format,
297 struct variable *, int *format_warning_cnt);
298 static void parse_document (struct dictionary *, struct sfm_document_record *);
299 static void parse_display_parameters (struct sfm_reader *,
300 const struct sfm_extension_record *,
301 struct dictionary *);
302 static void parse_machine_integer_info (struct sfm_reader *,
303 const struct sfm_extension_record *,
304 struct sfm_read_info *);
305 static void parse_machine_float_info (struct sfm_reader *,
306 const struct sfm_extension_record *);
307 static void parse_extra_product_info (struct sfm_reader *,
308 const struct sfm_extension_record *,
309 struct sfm_read_info *);
310 static void parse_mrsets (struct sfm_reader *,
311 const struct sfm_extension_record *,
312 struct dictionary *);
313 static void parse_long_var_name_map (struct sfm_reader *,
314 const struct sfm_extension_record *,
315 struct dictionary *);
316 static void parse_long_string_map (struct sfm_reader *,
317 const struct sfm_extension_record *,
318 struct dictionary *);
319 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
320 const struct sfm_var_record *,
322 const struct sfm_value_label_record *);
323 static void parse_data_file_attributes (struct sfm_reader *,
324 const struct sfm_extension_record *,
325 struct dictionary *);
326 static void parse_variable_attributes (struct sfm_reader *,
327 const struct sfm_extension_record *,
328 struct dictionary *);
329 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
330 static void parse_long_string_value_labels (struct sfm_reader *,
331 const struct sfm_extension_record *,
332 struct dictionary *);
333 static void parse_long_string_missing_values (struct sfm_reader *,
334 const struct sfm_extension_record *,
335 struct dictionary *);
337 /* Frees the strings inside INFO: its creation date, creation time, product
   name, and extra product info. */
339 sfm_read_info_destroy (struct sfm_read_info *info)
343 free (info->creation_date);
344 free (info->creation_time);
345 free (info->product);
346 free (info->product_ext);
350 /* Opens the system file designated by file handle FH for reading. Reads the
351 system file's dictionary into *DICTP.
353 Ordinarily the reader attempts to automatically detect the character
354 encoding based on the file's contents. This isn't always possible,
355 especially for files written by old versions of SPSS or PSPP, so specifying
356 a nonnull ENCODING overrides the choice of character encoding.
358 If INFOP is non-null, then it receives additional info about the system file,
359 which the caller must eventually free with sfm_read_info_destroy() when it
360 is no longer needed.
   On the success path the return value is the sequential casereader made by
   casereader_create_sequential() for the new reader. */
362 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
363 struct dictionary **dictp, struct sfm_read_info *infop)
365 struct sfm_reader *volatile r = NULL;
366 struct sfm_read_info *volatile info;
368 struct sfm_header_record header;
370 struct sfm_var_record *vars;
371 size_t n_vars, allocated_vars;
373 struct sfm_value_label_record *labels;
374 size_t n_labels, allocated_labels;
376 struct sfm_document_record *document;
/* Extension records are stored by subtype, so only subtypes 0...31 can be
   kept. */
378 struct sfm_extension_record *extensions[32];
380 struct dictionary *volatile dict = NULL;
383 /* Create and initialize reader. */
384 r = pool_create_container (struct sfm_reader, pool);
390 r->opcode_idx = sizeof r->opcodes;
391 r->corruption_warning = false;
392 r->zin_buf = r->zout_buf = NULL;
/* Use the caller's info structure if one was supplied, otherwise a
   temporary one, and start from all zeros either way. */
394 info = infop ? infop : xmalloc (sizeof *info);
395 memset (info, 0, sizeof *info);
397 /* TRANSLATORS: this fragment will be interpolated into
398 messages in fh_lock() that identify types of files. */
399 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
403 r->file = fn_open (fh_get_file_name (fh), "rb");
406 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
407 fh_get_file_name (r->fh), strerror (errno));
/* bail_out is the reader's longjmp() target for error handling, so a
   nonzero return here means an error was raised while reading. */
411 if (setjmp (r->bail_out))
415 read_header (r, info, &header);
418 n_vars = allocated_vars = 0;
421 n_labels = allocated_labels = 0;
425 memset (extensions, 0, sizeof extensions);
435 read_int (r); /* Skip filler. */
442 if (n_vars >= allocated_vars)
443 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
445 read_variable_record (r, &vars[n_vars++]);
449 if (n_labels >= allocated_labels)
450 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
452 read_value_label_record (r, &labels[n_labels++], n_vars);
456 /* A type 4 record is always immediately after a type 3 record,
457 so the code for type 3 records reads the type 4 record too. */
458 sys_error (r, r->pos, _("Misplaced type 4 record."));
461 if (document != NULL)
462 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
463 document = read_document_record (r);
/* A subtype that does not fit in extensions[], or that was already seen,
   is warned about and skipped rather than stored. */
467 subtype = read_int (r);
468 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
471 _("Unrecognized record type 7, subtype %d. Please "
472 "send a copy of this file, and the syntax which "
473 "created it to %s."),
474 subtype, PACKAGE_BUGREPORT);
475 skip_extension_record (r, subtype);
477 else if (extensions[subtype] != NULL)
480 _("Record type 7, subtype %d found here has the same "
481 "type as the record found near offset 0x%llx. "
482 "Please send a copy of this file, and the syntax "
483 "which created it to %s."),
484 subtype, (long long int) extensions[subtype]->pos,
486 skip_extension_record (r, subtype);
489 extensions[subtype] = read_extension_record (r, subtype);
493 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
498 if (r->compression == SFM_COMP_ZLIB)
501 /* Now actually parse what we read.
503 First, figure out the correct character encoding, because this determines
504 how the rest of the header data is to be interpreted. */
505 dict = dict_create (encoding
507 : choose_encoding (r, &header, extensions[EXT_INTEGER],
508 extensions[EXT_ENCODING]));
/* NOTE(review): r->encoding borrows the dictionary's encoding string, while
   a later comment in this function says the caller owns the dictionary and
   may destroy it. Confirm r->encoding is not used after that point. */
509 r->encoding = dict_get_encoding (dict);
511 /* These records don't use variables at all. */
512 if (document != NULL)
513 parse_document (dict, document);
515 if (extensions[EXT_INTEGER] != NULL)
516 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
518 if (extensions[EXT_FLOAT] != NULL)
519 parse_machine_float_info (r, extensions[EXT_FLOAT]);
521 if (extensions[EXT_PRODUCT_INFO] != NULL)
522 parse_extra_product_info (r, extensions[EXT_PRODUCT_INFO], info);
524 if (extensions[EXT_FILE_ATTRS] != NULL)
525 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
527 parse_header (r, &header, info, dict);
529 /* Parse the variable records, the basis of almost everything else. */
530 parse_variable_records (r, dict, vars, n_vars);
532 /* Parse value labels and the weight variable immediately after the variable
533 records. These records use indexes into vars[], so we must parse them
534 before those indexes become invalidated by very long string variables. */
535 for (i = 0; i < n_labels; i++)
536 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
537 if (header.weight_idx != 0)
539 struct variable *weight_var;
/* The second argument is a file offset used for messages; 76 is presumably
   where the weight index sits in the header -- TODO confirm. */
541 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
543 if (var_is_numeric (weight_var))
544 dict_set_weight (dict, weight_var);
546 sys_warn (r, -1, _("Ignoring string variable `%s' set "
547 "as weighting variable."),
548 var_get_name (weight_var));
551 if (extensions[EXT_DISPLAY] != NULL)
552 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
554 /* The following records use short names, so they need to be parsed before
555 parse_long_var_name_map() changes short names to long names. */
556 if (extensions[EXT_MRSETS] != NULL)
557 parse_mrsets (r, extensions[EXT_MRSETS], dict);
559 if (extensions[EXT_MRSETS2] != NULL)
560 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
562 if (extensions[EXT_LONG_STRINGS] != NULL)
563 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
565 /* Now rename variables to their long names. This call is unconditional,
   so the record passed may be NULL when the file has no such record. */
566 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
568 /* The following records use long names, so they need to follow renaming. */
569 if (extensions[EXT_VAR_ATTRS] != NULL)
571 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
573 /* Roles use the $@Role attribute. */
574 assign_variable_roles (r, dict);
577 if (extensions[EXT_LONG_LABELS] != NULL)
578 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
579 if (extensions[EXT_LONG_MISSING] != NULL)
580 parse_long_string_missing_values (r, extensions[EXT_LONG_MISSING], dict);
582 /* Warn if the actual amount of data per case differs from the
583 amount that the header claims. SPSS version 13 gets this
584 wrong when very long strings are involved, so don't warn in
586 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
587 && info->version_major != 13)
588 sys_warn (r, -1, _("File header claims %d variable positions but "
589 "%zu were read from file."),
590 header.nominal_case_size, n_vars);
592 /* Create an index of dictionary variable widths for
593 sfm_read_case to use. We cannot use the `struct variable's
594 from the dictionary we created, because the caller owns the
595 dictionary and may destroy or modify its variables. */
596 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
/* Hand the array to the pool so that free() is called on it when the pool
   is destroyed. */
597 pool_register (r->pool, free, r->sfm_vars);
598 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
603 sfm_read_info_destroy (info);
607 return casereader_create_sequential
609 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
610 &sys_file_casereader_class, r);
615 sfm_read_info_destroy (info);
625 /* Closes a system file after we're done with it.
626 Returns true if an I/O error has occurred on READER, false
629 close_reader (struct sfm_reader *r)
/* A failure to close the file is reported with the file name and the
   errno text. */
638 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
640 msg (ME, _("Error closing system file `%s': %s."),
641 fh_get_file_name (r->fh), strerror (errno));
/* The pool holds all system file state (see struct sfm_reader), so this
   releases the reader's pooled allocations. */
651 pool_destroy (r->pool);
656 /* Destroys READER. R_ is treated as the reader's struct sfm_reader;
   presumably it is the pointer that sfm_open_reader() handed to
   casereader_create_sequential() -- TODO confirm. */
658 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
660 struct sfm_reader *r = r_;
664 /* Returns true if FILE is an SPSS system file,
667 sfm_detect (FILE *file)
/* Read the first 4 bytes of FILE into MAGIC as a single item. */
671 if (fread (magic, 4, 1, file) != 1)
/* The file is recognized if MAGIC equals any of the three magic strings. */
675 return (!strcmp (ASCII_MAGIC, magic)
676 || !strcmp (ASCII_ZMAGIC, magic)
677 || !strcmp (EBCDIC_MAGIC, magic));
680 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
681 except for the string fields in *INFO, which parse_header() will initialize
682 later once the file's encoding is known. Also sets R's integer format,
   float format, compression type, compression bias, and case count. */
684 read_header (struct sfm_reader *r, struct sfm_read_info *info,
685 struct sfm_header_record *header)
687 uint8_t raw_layout_code[4];
692 read_string (r, header->magic, sizeof header->magic);
693 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
/* The magic string selects between the plain and ZLIB variants; anything
   else is rejected. */
695 if (!strcmp (ASCII_MAGIC, header->magic)
696 || !strcmp (EBCDIC_MAGIC, header->magic))
698 else if (!strcmp (ASCII_ZMAGIC, header->magic))
701 sys_error (r, 0, _("This is not an SPSS system file."));
703 /* Identify integer format. The layout code is tried as the value 2 and
   then as 3, and the resulting format must be MSB-first or LSB-first.
   The offset 64 passed to sys_error() is presumably where the layout code
   starts -- TODO confirm. */
704 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
705 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
707 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
709 || (r->integer_format != INTEGER_MSB_FIRST
710 && r->integer_format != INTEGER_LSB_FIRST))
711 sys_error (r, 64, _("This is not an SPSS system file."));
/* An out-of-range nominal case size is recorded as -1; sfm_open_reader()
   skips its consistency warning when it sees -1. */
713 header->nominal_case_size = read_int (r);
714 if (header->nominal_case_size < 0
715 || header->nominal_case_size > INT_MAX / 16)
716 header->nominal_case_size = -1;
718 compressed = read_int (r);
722 r->compression = SFM_COMP_NONE;
723 else if (compressed == 1)
724 r->compression = SFM_COMP_SIMPLE;
725 else if (compressed != 0)
/* NOTE(review): this message and the ZLIB one below are not wrapped in _(),
   unlike the other sys_error() messages in this function. */
726 sys_error (r, 0, "System file header has invalid compression "
727 "value %d.", compressed);
732 r->compression = SFM_COMP_ZLIB;
734 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
735 "compression value %d.", compressed);
738 header->weight_idx = read_int (r);
740 r->case_cnt = read_int (r);
741 if ( r->case_cnt > INT_MAX / 2)
744 /* Identify floating-point format and obtain compression bias. */
745 read_bytes (r, raw_bias, sizeof raw_bias);
746 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
748 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
750 if (memcmp (raw_bias, zero_bias, 8))
751 sys_warn (r, r->pos - 8,
752 _("Compression bias is not the usual "
753 "value of 100, or system file uses unrecognized "
754 "floating-point format."));
757 /* Some software is known to write all-zeros to this
758 field. Such software also writes floating-point
759 numbers in the format that we expect by default
760 (it seems that all software most likely does, in
761 reality), so don't warn in this case. */
/* Fall back to IEEE double in the byte order that matches the integer
   format. */
764 if (r->integer_format == INTEGER_MSB_FIRST)
765 r->float_format = FLOAT_IEEE_DOUBLE_BE;
767 r->float_format = FLOAT_IEEE_DOUBLE_LE;
769 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
771 read_string (r, header->creation_date, sizeof header->creation_date);
772 read_string (r, header->creation_time, sizeof header->creation_time);
773 read_string (r, header->file_label, sizeof header->file_label);
/* Copy the non-string results into INFO. */
776 info->integer_format = r->integer_format;
777 info->float_format = r->float_format;
778 info->compression = r->compression;
779 info->case_cnt = r->case_cnt;
782 /* Reads a variable (type 2) record from R into RECORD. The record is
   zeroed first; its label, if any, is allocated from R's pool. */
784 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
786 int has_variable_label;
788 memset (record, 0, sizeof *record);
790 record->pos = r->pos;
791 record->width = read_int (r);
792 has_variable_label = read_int (r);
793 record->missing_value_code = read_int (r);
794 record->print_format = read_int (r);
795 record->write_format = read_int (r);
796 read_bytes (r, record->name, sizeof record->name);
798 if (has_variable_label == 1)
800 enum { MAX_LABEL_LEN = 255 };
801 size_t len, read_len;
805 /* Read up to MAX_LABEL_LEN bytes of label. A longer label is truncated
   to MAX_LABEL_LEN bytes. */
806 read_len = MIN (MAX_LABEL_LEN, len);
807 record->label = pool_malloc (r->pool, read_len + 1);
808 read_string (r, record->label, read_len + 1);
810 /* Skip unread label bytes. */
811 skip_bytes (r, len - read_len);
813 /* Skip label padding up to multiple of 4 bytes. */
814 skip_bytes (r, ROUND_UP (len, 4) - len);
816 else if (has_variable_label != 0)
817 sys_error (r, record->pos,
818 _("Variable label indicator field is not 0 or 1."));
820 /* Validate the missing value code and read the raw missing values.
   Numeric variables (width 0) allow -3, -2, 1, 2, or 3; string variables
   allow 1, 2, or 3. */
821 if (record->missing_value_code != 0)
823 int code = record->missing_value_code;
824 if (record->width == 0)
826 if (code < -3 || code > 3 || code == -1)
827 sys_error (r, record->pos,
828 _("Numeric missing value indicator field is not "
829 "-3, -2, 0, 1, 2, or 3."));
833 if (code < 1 || code > 3)
834 sys_error (r, record->pos,
835 _("String missing value indicator field is not "
/* The absolute value of the code is the number of 8-byte values that
   follow. */
839 read_bytes (r, record->missing, 8 * abs (code));
843 /* Reads a value label (type 3) record from R into RECORD, followed by the
   variable index (type 4) record that must come right after it. N_VARS,
   the number of variable records read so far, is the upper limit for the
   type 4 record's variable count. Labels and indexes are allocated from
   R's pool. */
845 read_value_label_record (struct sfm_reader *r,
846 struct sfm_value_label_record *record,
851 /* Read type 3 record. */
852 record->pos = r->pos;
853 record->n_labels = read_int (r);
/* Reject a label count whose array size would overflow size_t. */
854 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
855 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
857 record->labels = pool_nmalloc (r->pool, record->n_labels,
858 sizeof *record->labels);
859 for (i = 0; i < record->n_labels; i++)
861 struct sfm_value_label *label = &record->labels[i];
862 unsigned char label_len;
865 read_bytes (r, label->value, sizeof label->value);
867 /* Read label length. */
868 read_bytes (r, &label_len, sizeof label_len);
869 padded_len = ROUND_UP (label_len + 1, 8);
871 /* Read label, padding. The length byte plus the label are rounded up to a
   multiple of 8 bytes, so padded_len - 1 bytes remain after the length
   byte. The label is then terminated at its real length. */
872 label->label = pool_malloc (r->pool, padded_len + 1);
873 read_bytes (r, label->label, padded_len - 1);
874 label->label[label_len] = '\0';
877 /* Read record type of type 4 record. */
878 if (read_int (r) != 4)
879 sys_error (r, r->pos - 4,
880 _("Variable index record (type 4) does not immediately "
881 "follow value label record (type 3) as it should."));
883 /* Read number of variables associated with value label from type 4
885 record->n_vars = read_int (r);
886 if (record->n_vars < 1 || record->n_vars > n_vars)
887 sys_error (r, r->pos - 4,
888 _("Number of variables associated with a value label (%zu) "
889 "is not between 1 and the number of variables (%zu)."),
890 record->n_vars, n_vars);
891 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
892 for (i = 0; i < record->n_vars; i++)
893 record->vars[i] = read_int (r);
896 /* Reads a document record from R and returns it. The record and its
   document text are allocated from R's pool. */
897 static struct sfm_document_record *
898 read_document_record (struct sfm_reader *r)
900 struct sfm_document_record *record;
903 record = pool_malloc (r->pool, sizeof *record);
904 record->pos = r->pos;
906 n_lines = read_int (r);
/* The upper bound keeps DOC_LINE_LENGTH * n_lines below INT_MAX. */
907 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
908 sys_error (r, record->pos,
909 _("Number of document lines (%d) "
910 "must be greater than 0 and less than %d."),
911 n_lines, INT_MAX / DOC_LINE_LENGTH);
913 record->n_lines = n_lines;
/* The text is read as DOC_LINE_LENGTH bytes per line with no terminator
   added here. */
914 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
915 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
/* Reads the size and count fields of an extension record from R into
   RECORD, and stores SUBTYPE and the current file position there as well.
   Raises an error if the record's total size would overflow. */
921 read_extension_record_header (struct sfm_reader *r, int subtype,
922 struct sfm_extension_record *record)
924 record->subtype = subtype;
925 record->pos = r->pos;
/* NOTE(review): read_int() returns int and these members are size_t, so a
   negative on-disk value becomes a very large number. Confirm that later
   allocation and skipping cope with that. */
926 record->size = read_int (r);
927 record->count = read_int (r);
929 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
930 allows an extra byte for a null terminator, used by some
931 extension processing routines. */
932 if (record->size != 0
933 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
/* NOTE(review): this message is not wrapped in _() for translation. */
934 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
937 /* Reads the extension record with the given SUBTYPE from R into a record
   newly allocated from R's pool. A known subtype whose size and count
   pass the checks in the table below has its data read, with a null byte
   appended. Other records are warned about or ignored, and their data is
   skipped. */
938 static struct sfm_extension_record *
939 read_extension_record (struct sfm_reader *r, int subtype)
941 struct extension_record_type
948 static const struct extension_record_type types[] =
950 /* Implemented record types. Each entry is presumably { subtype, size,
   count }; a size or count of 0 is not checked. */
951 { EXT_INTEGER, 4, 8 },
953 { EXT_MRSETS, 1, 0 },
954 { EXT_PRODUCT_INFO, 1, 0 },
955 { EXT_DISPLAY, 4, 0 },
956 { EXT_LONG_NAMES, 1, 0 },
957 { EXT_LONG_STRINGS, 1, 0 },
958 { EXT_NCASES, 8, 2 },
959 { EXT_FILE_ATTRS, 1, 0 },
960 { EXT_VAR_ATTRS, 1, 0 },
961 { EXT_MRSETS2, 1, 0 },
962 { EXT_ENCODING, 1, 0 },
963 { EXT_LONG_LABELS, 1, 0 },
964 { EXT_LONG_MISSING, 1, 0 },
966 /* Ignored record types. These have both size and count 0. */
967 { EXT_VAR_SETS, 0, 0 },
969 { EXT_DATA_ENTRY, 0, 0 },
970 { EXT_DATAVIEW, 0, 0 },
973 const struct extension_record_type *type;
974 struct sfm_extension_record *record;
977 record = pool_malloc (r->pool, sizeof *record);
978 read_extension_record_header (r, subtype, record);
/* The header reader has already rejected a product that would overflow. */
979 n_bytes = record->count * record->size;
/* Look SUBTYPE up in the table. */
981 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
982 if (subtype == type->subtype)
984 if (type->size > 0 && record->size != type->size)
985 sys_warn (r, record->pos,
986 _("Record type 7, subtype %d has bad size %zu "
987 "(expected %d)."), subtype, record->size, type->size);
988 else if (type->count > 0 && record->count != type->count)
989 sys_warn (r, record->pos,
990 _("Record type 7, subtype %d has bad count %zu "
991 "(expected %d)."), subtype, record->count, type->count);
992 else if (type->count == 0 && type->size == 0)
994 /* Ignore this record. */
/* The extra byte holds a null terminator after the data. */
998 char *data = pool_malloc (r->pool, n_bytes + 1);
999 data[n_bytes] = '\0';
1001 record->data = data;
1002 read_bytes (r, record->data, n_bytes);
1009 sys_warn (r, record->pos,
1010 _("Unrecognized record type 7, subtype %d. Please send a "
1011 "copy of this file, and the syntax which created it to %s."),
1012 subtype, PACKAGE_BUGREPORT);
/* Skip the data that was not read above. */
1015 skip_bytes (r, n_bytes);
/* Reads the header of an extension record with the given SUBTYPE from R
   and skips over its COUNT * SIZE bytes of data without storing them. */
1020 skip_extension_record (struct sfm_reader *r, int subtype)
1022 struct sfm_extension_record record;
1024 read_extension_record_header (r, subtype, &record);
1025 skip_bytes (r, record.count * record.size);
/* Converts the string fields of HEADER from DICT's encoding to UTF-8. The
   file label goes into DICT; the creation date, creation time, and product
   name go into INFO. */
1029 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1030 struct sfm_read_info *info, struct dictionary *dict)
1032 const char *dict_encoding = dict_get_encoding (dict);
1033 struct substring product;
1034 struct substring label;
1037 /* Convert file label to UTF-8 and put it into DICT. Spaces are trimmed
   and line ends are fixed up first. */
1038 label = recode_substring_pool ("UTF-8", dict_encoding,
1039 ss_cstr (header->file_label), r->pool);
1040 ss_trim (&label, ss_cstr (" "));
1041 label.string[label.length] = '\0';
1042 fixed_label = fix_line_ends (label.string);
1043 dict_set_label (dict, fixed_label);
1046 /* Put creation date and time in UTF-8 into INFO. */
1047 info->creation_date = recode_string ("UTF-8", dict_encoding,
1048 header->creation_date, -1);
1049 info->creation_time = recode_string ("UTF-8", dict_encoding,
1050 header->creation_time, -1);
1052 /* Put product name into INFO, dropping eye-catcher string if present. */
1053 product = recode_substring_pool ("UTF-8", dict_encoding,
1054 ss_cstr (header->eye_catcher), r->pool);
1055 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1056 ss_trim (&product, ss_cstr (" "));
1057 info->product = ss_xstrdup (product);
1060 /* Parses the N_VAR_RECS variable (type 2) records in VAR_RECS and adds the
1061 corresponding variables to DICT, using R for messages and pool memory.
1062 Also skips past additional variable records for long string
1065 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1066 struct sfm_var_record *var_recs, size_t n_var_recs)
1068 const char *dict_encoding = dict_get_encoding (dict);
1069 struct sfm_var_record *rec;
1072 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1074 struct variable *var;
1079 name = recode_string_pool ("UTF-8", dict_encoding,
1080 rec->name, 8, r->pool);
1081 name[strcspn (name, " ")] = '\0';
1083 if (!dict_id_is_valid (dict, name, false)
1084 || name[0] == '$' || name[0] == '#')
1085 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1087 if (rec->width < 0 || rec->width > 255)
1088 sys_error (r, rec->pos,
1089 _("Bad width %d for variable %s."), rec->width, name);
1091 var = rec->var = dict_create_var (dict, name, rec->width);
1094 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1095 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1098 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1102 /* Set the short name the same as the long name. */
1103 var_set_short_name (var, 0, name);
1105 /* Get variable label, if any. */
1110 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1111 rec->label, -1, r->pool);
1112 var_set_label (var, utf8_label, false);
1115 /* Set missing values. */
1116 if (rec->missing_value_code != 0)
1118 int width = var_get_width (var);
1119 struct missing_values mv;
1121 mv_init_pool (r->pool, &mv, width);
1122 if (var_is_numeric (var))
/* A negative code means a range is present. With a range, there is one
   discrete value when the code is -3 and none otherwise. Without a range,
   the code is the number of discrete values. */
1124 bool has_range = rec->missing_value_code < 0;
1125 int n_discrete = (has_range
1126 ? rec->missing_value_code == -3
1127 : rec->missing_value_code);
/* The range endpoints are the first two 8-byte values. */
1132 double low = parse_float (r, rec->missing, 0);
1133 double high = parse_float (r, rec->missing, 8);
1135 /* Deal with SPSS 21 change in representation. */
1139 mv_add_range (&mv, low, high);
1143 for (i = 0; i < n_discrete; i++)
1145 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1153 value_init_pool (r->pool, &value, width);
1154 value_set_missing (&value, width);
/* Each string missing value occupies an 8-byte slot, of which at most
   WIDTH bytes are used. */
1155 for (i = 0; i < rec->missing_value_code; i++)
1156 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1158 var_set_missing_values (var, &mv);
/* The offsets 12 and 16 are passed as the positions of the print and write
   format fields; read_variable_record() reads three ints before the print
   format, so this presumably assumes 4-byte ints -- TODO confirm. */
1162 parse_format_spec (r, rec->pos + 12, rec->print_format,
1163 PRINT_FORMAT, var, &n_warnings);
1164 parse_format_spec (r, rec->pos + 16, rec->write_format,
1165 WRITE_FORMAT, var, &n_warnings);
1167 /* Account for values.
1168 Skip long string continuation records, if any. A numeric variable takes
   one position and a string of width W takes W / 8 positions, rounded up.
   Every position after the first must be a record with width -1. */
1169 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1170 for (i = 1; i < n_values; i++)
1171 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1172 sys_error (r, rec->pos, _("Missing string continuation record."));
1177 /* Translates the format spec from sysfile format to internal
format.

FORMAT packs the raw format type in bits 16-23 and the width in bits
8-15.  If the resulting format is acceptable it becomes V's print or
write format, as selected by WHICH.  Otherwise a warning that mentions
POS is issued, unless FORMAT is 0.  *N_WARNINGS counts the invalid
formats seen so far; warnings stop after the first 8. */
1180 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1181 enum which_format which, struct variable *v,
1184 const int max_warnings = 8;
/* Converting to uint8_t keeps only the low 8 bits of each shifted
   value. */
1185 uint8_t raw_type = format >> 16;
1186 uint8_t w = format >> 8;
/* The format is usable only if the raw type is known, the spec is a valid
   output format, and it is compatible with V's width. */
1195 ok = (fmt_from_io (raw_type, &f.type)
1196 && fmt_check_output (&f)
1197 && fmt_check_width_compat (&f, var_get_width (v)));
1202 if (which == PRINT_FORMAT)
1203 var_set_print_format (v, &f);
1205 var_set_write_format (v, &f);
1207 else if (format == 0)
1209 /* Actually observed in the wild. No point in warning about it. */
/* The counter is incremented for every other invalid format, including
   those beyond the limit, but only the first MAX_WARNINGS are reported. */
1211 else if (++*n_warnings <= max_warnings)
1213 if (which == PRINT_FORMAT)
1214 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1216 var_get_name (v), var_get_width (v), format);
1218 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1220 var_get_name (v), var_get_width (v), format);
/* When the limit has just been reached, say that further warnings will be
   suppressed. */
1222 if (*n_warnings == max_warnings)
1223 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1228 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1232 for (p = record->documents;
1233 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1234 p += DOC_LINE_LENGTH)
1236 struct substring line;
1238 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1239 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1240 ss_rtrim (&line, ss_cstr (" "));
1241 line.string[line.length] = '\0';
1243 dict_add_document_line (dict, line.string, false);
1249 /* Parses record type 7, subtype 3. */
/* Stores the version numbers found in RECORD into INFO, then compares the
   floating-point and integer representation codes in RECORD against the
   formats already recorded in R.  A floating-point mismatch is reported
   through sys_error; an integer mismatch only through sys_warn. */
1251 parse_machine_integer_info (struct sfm_reader *r,
1252 const struct sfm_extension_record *record,
1253 struct sfm_read_info *info)
1255 int float_representation, expected_float_format;
1256 int integer_representation, expected_integer_format;
1258 /* Save version info. */
/* The major, minor and revision numbers are the integers at byte offsets
   0, 4 and 8. */
1259 info->version_major = parse_int (r, record->data, 0);
1260 info->version_minor = parse_int (r, record->data, 4);
1261 info->version_revision = parse_int (r, record->data, 8);
1263 /* Check floating point format. */
/* Expected code: 1 for IEEE double (either byte order), 2 for
   FLOAT_Z_LONG, 3 for VAX G or D.  What happens for any other
   r->float_format is not visible in this excerpt. */
1264 float_representation = parse_int (r, record->data, 16);
1265 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1266 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1267 expected_float_format = 1;
1268 else if (r->float_format == FLOAT_Z_LONG)
1269 expected_float_format = 2;
1270 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1271 expected_float_format = 3;
1274 if (float_representation != expected_float_format)
1275 sys_error (r, record->pos, _("Floating-point representation indicated by "
1276 "system file (%d) differs from expected (%d)."),
1277 float_representation, expected_float_format);
1279 /* Check integer format. */
/* Expected code: 1 for most-significant-byte-first, 2 for
   least-significant-byte-first. */
1280 integer_representation = parse_int (r, record->data, 24);
1281 if (r->integer_format == INTEGER_MSB_FIRST)
1282 expected_integer_format = 1;
1283 else if (r->integer_format == INTEGER_LSB_FIRST)
1284 expected_integer_format = 2;
1287 if (integer_representation != expected_integer_format)
1288 sys_warn (r, record->pos,
1289 _("Integer format indicated by system file (%d) "
1290 "differs from expected (%d)."),
1291 integer_representation, expected_integer_format);
/* Chooses and returns the name of the character encoding to use for the
   dictionary.  The sources considered, in the order they appear below, are
   the EXT_ENCODING record, the code page in the EXT_INTEGER record, the
   header's magic number, and finally the locale's character set.
   NOTE(review): the conditions that guard several of the steps below are
   not visible in this excerpt. */
1296 choose_encoding (struct sfm_reader *r,
1297 const struct sfm_header_record *header,
1298 const struct sfm_extension_record *ext_integer,
1299 const struct sfm_extension_record *ext_encoding)
1301 /* The EXT_ENCODING record is a more reliable way to determine dictionary
encoding. */
1304 return ext_encoding->data;
1306 /* But EXT_INTEGER is better than nothing as a fallback. */
/* The code page is the integer at byte offset 7 * 4 in the EXT_INTEGER
   data. */
1309 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1310 const char *encoding;
1319 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1320 respectively. However, there are known to be many files in the wild
1321 with character code 2, yet have data which are clearly not ASCII.
1322 Therefore we ignore these values. */
/* Map the code page number to an encoding name. */
1329 encoding = sys_get_encoding_from_codepage (codepage);
1330 if (encoding != NULL)
1336 /* If the file magic number is EBCDIC then its character data is too. */
1337 if (!strcmp (header->magic, EBCDIC_MAGIC))
/* Last resort: the character set of the current locale. */
1340 return locale_charset ();
1343 /* Parses record type 7, subtype 4. */
1345 parse_machine_float_info (struct sfm_reader *r,
1346 const struct sfm_extension_record *record)
1348 double sysmis = parse_float (r, record->data, 0);
1349 double highest = parse_float (r, record->data, 8);
1350 double lowest = parse_float (r, record->data, 16);
1352 if (sysmis != SYSMIS)
1353 sys_warn (r, record->pos,
1354 _("File specifies unexpected value %g (%a) as %s, "
1355 "instead of %g (%a)."),
1356 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1358 if (highest != HIGHEST)
1359 sys_warn (r, record->pos,
1360 _("File specifies unexpected value %g (%a) as %s, "
1361 "instead of %g (%a)."),
1362 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1364 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1365 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1366 appears in a context (missing values) where SYSMIS cannot. */
1367 if (lowest != LOWEST && lowest != SYSMIS)
1368 sys_warn (r, record->pos,
1369 _("File specifies unexpected value %g (%a) as %s, "
1370 "instead of %g (%a) or %g (%a)."),
1371 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1374 /* Parses record type 7, subtype 10. */
1376 parse_extra_product_info (struct sfm_reader *r,
1377 const struct sfm_extension_record *record,
1378 struct sfm_read_info *info)
1380 struct text_record *text;
1382 text = open_text_record (r, record, true);
1383 info->product_ext = fix_line_ends (text_get_all (text));
1384 close_text_record (r, text);
1387 /* Parses record type 7, subtype 7 or 19. */
/* RECORD is read as text without recoding (the third argument to
   open_text_record is false); individual names and labels are recoded to
   UTF-8 from r->encoding as they are parsed.  Each multiple response set
   that parses successfully is added to DICT with dict_add_mrset.
   NOTE(review): the loop structure and the error exits are only partly
   visible in this excerpt. */
1389 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1390 struct dictionary *dict)
1392 struct text_record *text;
1393 struct mrset *mrset;
1395 text = open_text_record (r, record, false);
1398 const char *counted = NULL;
1401 struct stringi_set var_names;
1402 size_t allocated_vars;
1406 /* Skip extra line feeds if present. */
1407 while (text_match (text, '\n'))
/* Start a new, zero-filled set.  Its name is the text up to `='. */
1410 mrset = xzalloc (sizeof *mrset);
1412 name = text_get_token (text, ss_cstr ("="), NULL);
1415 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1417 if (mrset->name[0] != '$')
1419 sys_warn (r, record->pos,
1420 _("`%s' does not begin with `$' at offset %zu "
1421 "in MRSETS record."), mrset->name, text_pos (text));
/* The character after `=' selects the kind of set:
   `C': MRSET_MC.
   `D': MRSET_MD with categories labeled from variable labels.
   `E': MRSET_MD with categories labeled from counted values, followed by
        a label source value. */
1425 if (text_match (text, 'C'))
1427 mrset->type = MRSET_MC;
1428 if (!text_match (text, ' '))
1430 sys_warn (r, record->pos,
1431 _("Missing space following `%c' at offset %zu "
1432 "in MRSETS record."), 'C', text_pos (text));
1436 else if (text_match (text, 'D'))
1438 mrset->type = MRSET_MD;
1439 mrset->cat_source = MRSET_VARLABELS;
1441 else if (text_match (text, 'E'))
1445 mrset->type = MRSET_MD;
1446 mrset->cat_source = MRSET_COUNTEDVALUES;
1447 if (!text_match (text, ' '))
1449 sys_warn (r, record->pos,
1450 _("Missing space following `%c' at offset %zu "
1451 "in MRSETS record."), 'E', text_pos (text));
/* Label source value: "11" means the set's label comes from a variable
   label; "1" is accepted silently; anything else draws a warning. */
1455 number = text_get_token (text, ss_cstr (" "), NULL);
1456 if (!strcmp (number, "11"))
1457 mrset->label_from_var_label = true;
1458 else if (strcmp (number, "1"))
1459 sys_warn (r, record->pos,
1460 _("Unexpected label source value `%s' following `E' "
1461 "at offset %zu in MRSETS record."),
1462 number, text_pos (text));
1466 sys_warn (r, record->pos,
1467 _("Missing `C', `D', or `E' at offset %zu "
1468 "in MRSETS record."),
/* Multiple dichotomy sets carry a counted value as a counted string. */
1473 if (mrset->type == MRSET_MD)
1475 counted = text_parse_counted_string (r, text);
1476 if (counted == NULL)
/* The set's label is a counted string too; an empty one is not stored. */
1480 label = text_parse_counted_string (r, text);
1483 if (label[0] != '\0')
1484 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
/* VAR_NAMES collects the names seen so far, to detect duplicates. */
1486 stringi_set_init (&var_names);
1491 const char *raw_var_name;
1492 struct variable *var;
/* Variable names are separated by spaces; a new-line ends the list. */
1495 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1496 if (raw_var_name == NULL)
1498 if (delimiter != '\n')
1499 sys_warn (r, record->pos,
1500 _("Missing new-line parsing variable names "
1501 "at offset %zu in MRSETS record."),
1505 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1507 var = dict_lookup_var (dict, var_name);
1513 if (!stringi_set_insert (&var_names, var_name))
1515 sys_warn (r, record->pos,
1516 _("Duplicate variable name %s "
1517 "at offset %zu in MRSETS record."),
1518 var_name, text_pos (text));
/* If requested and not already set, take the set's label from this
   variable's label. */
1524 if (mrset->label == NULL && mrset->label_from_var_label
1525 && var_has_label (var))
1526 mrset->label = xstrdup (var_get_label (var));
/* All variables in a set must have the same type as the first one.
   NOTE(review): this message prints NAME, the raw token from the record,
   whereas the other messages in this function print the recoded
   mrset->name -- confirm whether that is intended. */
1529 && var_get_type (var) != var_get_type (mrset->vars[0]))
1531 sys_warn (r, record->pos,
1532 _("MRSET %s contains both string and "
1533 "numeric variables."), name);
/* WIDTH tracks the smallest width among the set's variables. */
1536 width = MIN (width, var_get_width (var));
/* Grow the variable array as needed and append VAR. */
1538 if (mrset->n_vars >= allocated_vars)
1539 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1540 sizeof *mrset->vars);
1541 mrset->vars[mrset->n_vars++] = var;
1543 while (delimiter != '\n');
/* A set with fewer than two variables is discarded with a warning. */
1545 if (mrset->n_vars < 2)
1547 sys_warn (r, record->pos,
1548 _("MRSET %s has only %zu variables."), mrset->name,
1550 mrset_destroy (mrset);
1551 stringi_set_destroy (&var_names);
/* For multiple dichotomy sets, convert the counted value: c_strtod gives
   the numeric form, and value_copy_str_rpad gives the string form padded
   with spaces to WIDTH.  (The test that selects between the two is not
   visible in this excerpt.) */
1555 if (mrset->type == MRSET_MD)
1557 mrset->width = width;
1558 value_init (&mrset->counted, width);
1560 mrset->counted.f = c_strtod (counted, NULL);
1562 value_copy_str_rpad (&mrset->counted, width,
1563 (const uint8_t *) counted, ' ');
1566 dict_add_mrset (dict, mrset);
1568 stringi_set_destroy (&var_names);
1570 mrset_destroy (mrset);
1571 close_text_record (r, text);
1574 /* Read record type 7, subtype 11, which specifies how variables
1575 should be displayed in GUI environments. */
/* RECORD must hold either 3 integers per variable in DICT (measure, width,
   alignment) or 2 per variable (measure, alignment); any other count
   draws a warning. */
1577 parse_display_parameters (struct sfm_reader *r,
1578 const struct sfm_extension_record *record,
1579 struct dictionary *dict)
1581 bool includes_width;
1582 bool warned = false;
1587 n_vars = dict_get_var_cnt (dict);
1588 if (record->count == 3 * n_vars)
1589 includes_width = true;
1590 else if (record->count == 2 * n_vars)
1591 includes_width = false;
1594 sys_warn (r, record->pos,
1595 _("Extension 11 has bad count %zu (for %zu variables)."),
1596 record->count, n_vars);
/* Read each variable's parameters in dictionary order, starting at byte
   offset OFS (its declaration and updates are not visible in this
   excerpt). */
1601 for (i = 0; i < n_vars; ++i)
1603 struct variable *v = dict_get_var (dict, i);
1604 int measure, width, align;
1606 measure = parse_int (r, record->data, ofs);
1611 width = parse_int (r, record->data, ofs);
1617 align = parse_int (r, record->data, ofs);
1620 /* SPSS sometimes seems to set variables' measure to zero. */
/* Valid ranges are 1...3 for measure and 0...2 for alignment. */
1624 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1627 sys_warn (r, record->pos,
1628 _("Invalid variable display parameters for variable "
1629 "%zu (%s). Default parameters substituted."),
1630 i, var_get_name (v));
/* Measure 1 is nominal and 2 is ordinal; alignment 0 is left and 1 is
   right.  The values used for the remaining codes are not visible in
   this excerpt. */
1635 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1636 : measure == 2 ? MEASURE_ORDINAL
1638 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1639 : align == 1 ? ALIGN_RIGHT
1642 /* Older versions (SPSS 9.0) sometimes set the display
1643 width to zero. This causes confusion in the GUI, so
1644 only set the width if it is nonzero. */
1646 var_set_display_width (v, width);
/* Renames VAR, which must be in DICT, to NEW_NAME, keeping VAR's short names
   as they were before the call.

   Renaming a variable may clear its short names, but we want to retain
   them, so we save copies first and set them again afterward. */
static void
rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
                                 const char *new_name)
{
  size_t n_short_names;
  char **short_names;
  size_t i;

  /* Save copies of the short names.  A missing short name is kept as a
     null pointer. */
  n_short_names = var_get_short_name_cnt (var);
  short_names = xnmalloc (n_short_names, sizeof *short_names);
  for (i = 0; i < n_short_names; i++)
    {
      const char *s = var_get_short_name (var, i);
      short_names[i] = s != NULL ? xstrdup (s) : NULL;
    }

  /* Set long name. */
  dict_rename_var (dict, var, new_name);

  /* Restore short names, releasing each saved copy once it has been set
     again. */
  for (i = 0; i < n_short_names; i++)
    {
      var_set_short_name (var, i, short_names[i]);
      free (short_names[i]);
    }

  /* The array that held the copies was allocated above with xnmalloc and
     must be released as well. */
  free (short_names);
}
1681 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1682 to each short name. Modifies variable names in DICT accordingly. */
/* NOTE(review): the test that selects the first branch below is not
   visible in this excerpt; going by the comment inside it, it presumably
   handles a file that has no such record -- confirm. */
1684 parse_long_var_name_map (struct sfm_reader *r,
1685 const struct sfm_extension_record *record,
1686 struct dictionary *dict)
1688 struct text_record *text;
1689 struct variable *var;
1694 /* There are no long variable names. Use the short variable names,
1695 converted to lowercase, as the long variable names. */
1698 for (i = 0; i < dict_get_var_cnt (dict); i++)
1700 struct variable *var = dict_get_var (dict, i);
/* rename_var_and_save_short_names keeps the existing short names across
   the rename. */
1703 new_name = utf8_to_lower (var_get_name (var));
1704 rename_var_and_save_short_names (dict, var, new_name);
1711 /* Rename each of the variables, one by one. (In a correctly constructed
1712 system file, this cannot create any intermediate duplicate variable names,
1713 because all of the new variable names are longer than any of the old
1714 variable names and thus there cannot be any overlaps.) */
/* The record is recoded to UTF-8 and read as a series of variable/long
   name pairs. */
1715 text = open_text_record (r, record, true);
1716 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1718 /* Validate long name. */
1719 if (!dict_id_is_valid (dict, long_name, false))
1721 sys_warn (r, record->pos,
1722 _("Long variable mapping from %s to invalid "
1723 "variable name `%s'."),
1724 var_get_name (var), long_name);
1728 /* Identify any duplicates. */
/* A long name that equals the variable's own short name, ignoring case,
   is not treated as a duplicate even though a lookup would find it. */
1729 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1730 && dict_lookup_var (dict, long_name) != NULL)
1732 sys_warn (r, record->pos,
1733 _("Duplicate long variable name `%s'."), long_name);
1737 rename_var_and_save_short_names (dict, var, long_name);
1739 close_text_record (r, text);
1742 /* Reads record type 7, subtype 14, which gives the real length
1743 of each very long string. Rearranges DICT accordingly. */
/* Each entry pairs a variable with its true width.  The variable and the
   segment variables that follow it in DICT are merged into one variable
   of that width. */
1745 parse_long_string_map (struct sfm_reader *r,
1746 const struct sfm_extension_record *record,
1747 struct dictionary *dict)
1749 struct text_record *text;
1750 struct variable *var;
1753 text = open_text_record (r, record, true);
1754 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1756 size_t idx = var_get_dict_index (var);
/* The length is given as decimal text and must lie in
   1...MAX_STRING. */
1762 length = strtol (length_s, NULL, 10);
1763 if (length < 1 || length > MAX_STRING)
1765 sys_warn (r, record->pos,
1766 _("%s listed as string of invalid length %s "
1767 "in very long string record."),
1768 var_get_name (var), length_s);
1772 /* Check segments. */
/* A length that needs only one segment does not belong in this record. */
1773 segment_cnt = sfm_width_to_segments (length);
1774 if (segment_cnt == 1)
1776 sys_warn (r, record->pos,
1777 _("%s listed in very long string record with width %s, "
1778 "which requires only one segment."),
1779 var_get_name (var), length_s);
/* All of the segments must exist in DICT, starting at index IDX. */
1782 if (idx + segment_cnt > dict_get_var_cnt (dict))
1783 sys_error (r, record->pos,
1784 _("Very long string %s overflows dictionary."),
1785 var_get_name (var));
1787 /* Get the short names from the segments and check their
lengths. */
1789 for (i = 0; i < segment_cnt; i++)
1791 struct variable *seg = dict_get_var (dict, idx + i);
1792 int alloc_width = sfm_segment_alloc_width (length, i);
1793 int width = var_get_width (seg);
/* Segment I's short name becomes VAR's short name number I.  Widths are
   compared after rounding up to a multiple of 8. */
1796 var_set_short_name (var, i, var_get_short_name (seg, 0));
1797 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1798 sys_error (r, record->pos,
1799 _("Very long string with width %ld has segment %d "
1800 "of width %d (expected %d)."),
1801 length, i, width, alloc_width);
/* Drop the now-redundant segment variables and give VAR its true
   width. */
1803 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1804 var_set_width (var, length);
1806 close_text_record (r, text);
1807 dict_compact_values (dict);
/* Applies the value labels in RECORD to each variable that RECORD lists.
   The variables are found through VAR_RECS (N_VAR_RECS entries) by index.
   Labels are recoded to UTF-8 from DICT's encoding before use. */
1811 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1812 const struct sfm_var_record *var_recs, size_t n_var_recs,
1813 const struct sfm_value_label_record *record)
1815 struct variable **vars;
/* Recode every label once, up front, into pool-allocated strings. */
1819 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
1820 for (i = 0; i < record->n_labels; i++)
1821 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1822 record->labels[i].label, -1,
/* Resolve each variable index in the record to a variable. */
1825 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1826 for (i = 0; i < record->n_vars; i++)
1827 vars[i] = lookup_var_by_index (r, record->pos,
1828 var_recs, n_var_recs, record->vars[i]);
/* All of the variables must have the same type as the first one. */
1830 for (i = 1; i < record->n_vars; i++)
1831 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1832 sys_error (r, record->pos,
1833 _("Variables associated with value label are not all of "
1834 "identical type. Variable %s is %s, but variable "
1836 var_get_name (vars[0]),
1837 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1838 var_get_name (vars[i]),
1839 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
/* Add every label to every variable. */
1841 for (i = 0; i < record->n_vars; i++)
1843 struct variable *var = vars[i];
/* NOTE(review): the test that guards the error below is not visible in
   this excerpt; per its message it rejects long string variables. */
1847 width = var_get_width (var);
1849 sys_error (r, record->pos,
1850 _("Value labels may not be added to long string "
1851 "variables (e.g. %s) using records types 3 and 4."),
1852 var_get_name (var));
1854 for (j = 0; j < record->n_labels; j++)
1856 struct sfm_value_label *label = &record->labels[j];
/* Build the value from the raw label->value bytes: parse_float gives the
   numeric form, and a copy of the first WIDTH bytes gives the string
   form.  (The test that selects between the two is not visible here.) */
1859 value_init (&value, width);
1861 value.f = parse_float (r, label->value, 0);
1863 memcpy (value_str_rw (&value, width), label->value, width);
/* var_add_value_label returning false is reported as a duplicate. */
1865 if (!var_add_value_label (var, &value, utf8_labels[j]))
1867 if (var_is_numeric (var))
1868 sys_warn (r, record->pos,
1869 _("Duplicate value label for %g on %s."),
1870 value.f, var_get_name (var));
1872 sys_warn (r, record->pos,
1873 _("Duplicate value label for `%.*s' on %s."),
1874 width, value_str (&value, width),
1875 var_get_name (var));
1878 value_destroy (&value, width);
/* Release the temporary pool allocations made above. */
1882 pool_free (r->pool, vars);
1883 for (i = 0; i < record->n_labels; i++)
1884 pool_free (r->pool, utf8_labels[i]);
1885 pool_free (r->pool, utf8_labels);
1888 static struct variable *
1889 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1890 const struct sfm_var_record *var_recs, size_t n_var_recs,
1893 const struct sfm_var_record *rec;
1895 if (idx < 1 || idx > n_var_recs)
1897 sys_error (r, offset,
1898 _("Variable index %d not in valid range 1...%zu."),
1903 rec = &var_recs[idx - 1];
1904 if (rec->var == NULL)
1906 sys_error (r, offset,
1907 _("Variable index %d refers to long string continuation."),
1915 /* Parses a set of custom attributes from TEXT into ATTRS.
1916 ATTRS may be a null pointer, in which case the attributes are
1917 read but discarded. */
/* Each attribute is a key, a `(', and then one or more values, each ended
   by a new-line, with `)' after the last value.  Parsing of attributes
   stops when a `/' is matched. */
1919 parse_attributes (struct sfm_reader *r, struct text_record *text,
1920 struct attrset *attrs)
1924 struct attribute *attr;
1928 /* Parse the key. */
1929 key = text_get_token (text, ss_cstr ("("), NULL);
/* INDEX numbers the attribute's values from 1; it is used in
   messages. */
1933 attr = attribute_create (key);
1934 for (index = 1; ; index++)
1936 /* Parse the value. */
1940 value = text_get_token (text, ss_cstr ("\n"), NULL);
1943 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
/* A value enclosed in single quotes is stored without them.  The closing
   quote is overwritten in place and the opening one is skipped. */
1948 length = strlen (value);
1949 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1951 value[length - 1] = '\0';
1952 attribute_add_value (attr, value + 1);
/* An unquoted value is warned about, then stored as it is. */
1957 _("Attribute value %s[%d] is not quoted: %s."),
1959 attribute_add_value (attr, value);
1962 /* Was this the last value for this attribute? */
1963 if (text_match (text, ')'))
/* The finished attribute is either added to ATTRS or destroyed.  The test
   between the two is not visible in this excerpt; per the function
   comment it presumably depends on whether ATTRS is null -- confirm. */
1967 attrset_add (attrs, attr);
1969 attribute_destroy (attr);
1971 while (!text_match (text, '/'));
/* Reads record type 7, subtype 17, which lists custom attributes on the
   data file, and parses them into DICT's own attribute set. */
static void
parse_data_file_attributes (struct sfm_reader *r,
                            const struct sfm_extension_record *record,
                            struct dictionary *dict)
{
  struct text_record *text;

  text = open_text_record (r, record, true);
  parse_attributes (r, text, dict_get_attributes (dict));
  close_text_record (r, text);
}
/* Parses record type 7, subtype 18, which lists custom attributes on
   individual variables.

   The record is a sequence of variable names, each ended by `:' and followed
   by that variable's attributes.  When text_read_variable_name yields a null
   variable, the attributes that follow are parsed but discarded. */
static void
parse_variable_attributes (struct sfm_reader *r,
                           const struct sfm_extension_record *record,
                           struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, record, true);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      struct attrset *attrs = var != NULL ? var_get_attributes (var) : NULL;
      parse_attributes (r, text, attrs);
    }

  close_text_record (r, text);
}
/* Sets the role of variables in DICT from their "$@Role" custom
   attribute.  Invalid role values are counted in N_WARNINGS; the first
   one is reported by variable name.
   NOTE(review): most of the mapping from attribute value to role is not
   visible in this excerpt. */
2003 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2005 size_t n_warnings = 0;
2008 for (i = 0; i < dict_get_var_cnt (dict); i++)
2010 struct variable *var = dict_get_var (dict, i);
2011 struct attrset *attrs = var_get_attributes (var);
2012 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
/* The attribute's first value is read as a decimal integer. */
2015 int value = atoi (attribute_get_value (attr, 0));
2037 role = ROLE_PARTITION;
/* Only the first invalid role gets a warning that names its
   variable. */
2046 if (n_warnings++ == 0)
2047 sys_warn (r, -1, _("Invalid role for variable %s."),
2048 var_get_name (var));
2051 var_set_role (var, role);
/* Summary warning for the remaining invalid roles (its guard and its
   argument are not visible in this excerpt). */
2056 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2061 check_overflow (struct sfm_reader *r,
2062 const struct sfm_extension_record *record,
2063 size_t ofs, size_t length)
2065 size_t end = record->size * record->count;
2066 if (length >= end || ofs + length > end)
2067 sys_error (r, record->pos + end,
2068 _("Extension record subtype %d ends unexpectedly."),
/* Parses an extension record that holds value labels for long string
   variables and adds the labels to the variables in DICT.  Every field is
   bounds-checked with check_overflow before it is read.  Entries for
   unknown variables, numeric variables, or variables whose width does not
   match are warned about.
   NOTE(review): the outer loop over the record and the updates of OFS
   after each 4-byte length are not visible in this excerpt. */
2073 parse_long_string_value_labels (struct sfm_reader *r,
2074 const struct sfm_extension_record *record,
2075 struct dictionary *dict)
2077 const char *dict_encoding = dict_get_encoding (dict);
2078 size_t end = record->size * record->count;
2085 struct variable *var;
2090 /* Parse variable name length. */
2091 check_overflow (r, record, ofs, 4);
2092 var_name_len = parse_int (r, record->data, ofs);
2095 /* Parse variable name, width, and number of labels. */
/* The name is followed by two 4-byte integers: the width and the number
   of labels. */
2096 check_overflow (r, record, ofs, var_name_len + 8);
2097 var_name = recode_string_pool ("UTF-8", dict_encoding,
2098 (const char *) record->data + ofs,
2099 var_name_len, r->pool);
2100 width = parse_int (r, record->data, ofs + var_name_len);
2101 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2102 ofs += var_name_len + 8;
2104 /* Look up 'var' and validate. */
2105 var = dict_lookup_var (dict, var_name);
2107 sys_warn (r, record->pos + ofs,
2108 _("Ignoring long string value label record for "
2109 "unknown variable %s."), var_name);
2110 else if (var_is_numeric (var))
2112 sys_warn (r, record->pos + ofs,
2113 _("Ignoring long string value label record for "
2114 "numeric variable %s."), var_name);
2117 else if (width != var_get_width (var))
2119 sys_warn (r, record->pos + ofs,
2120 _("Ignoring long string value label record for variable "
2121 "%s because the record's width (%d) does not match the "
2122 "variable's width (%d)."),
2123 var_name, width, var_get_width (var));
/* The labels are walked even when VAR is null, so that OFS still moves
   past them; SKIP starts out true in that case. */
2128 value_init_pool (r->pool, &value, width);
2129 for (i = 0; i < n_labels; i++)
2131 size_t value_length, label_length;
2132 bool skip = var == NULL;
2134 /* Parse value length. */
2135 check_overflow (r, record, ofs, 4);
2136 value_length = parse_int (r, record->data, ofs);
2140 check_overflow (r, record, ofs, value_length);
/* A value is used only if its length equals the variable's width. */
2143 if (value_length == width)
2144 memcpy (value_str_rw (&value, width),
2145 (const uint8_t *) record->data + ofs, width);
2148 sys_warn (r, record->pos + ofs,
2149 _("Ignoring long string value label %zu for "
2150 "variable %s, with width %d, that has bad value "
2152 i, var_get_name (var), width, value_length);
2156 ofs += value_length;
2158 /* Parse label length. */
2159 check_overflow (r, record, ofs, 4);
2160 label_length = parse_int (r, record->data, ofs);
2164 check_overflow (r, record, ofs, label_length);
/* Recode the label to UTF-8, add it, and free the recoded copy. */
2169 label = recode_string_pool ("UTF-8", dict_encoding,
2170 (const char *) record->data + ofs,
2171 label_length, r->pool);
2172 if (!var_add_value_label (var, &value, label))
2173 sys_warn (r, record->pos + ofs,
2174 _("Duplicate value label for `%.*s' on %s."),
2175 width, value_str (&value, width),
2176 var_get_name (var));
2177 pool_free (r->pool, label);
2179 ofs += label_length;
/* Parses an extension record that holds missing values for long string
   variables and sets them on the variables in DICT.  Every field is
   bounds-checked with check_overflow before it is read.
   NOTE(review): the outer loop over the record and several updates of OFS
   are not visible in this excerpt. */
2185 parse_long_string_missing_values (struct sfm_reader *r,
2186 const struct sfm_extension_record *record,
2187 struct dictionary *dict)
2189 const char *dict_encoding = dict_get_encoding (dict);
2190 size_t end = record->size * record->count;
2195 struct missing_values mv;
2197 struct variable *var;
2198 int n_missing_values;
2202 /* Parse variable name length. */
2203 check_overflow (r, record, ofs, 4);
2204 var_name_len = parse_int (r, record->data, ofs);
2207 /* Parse variable name. */
/* The extra byte in the bounds check covers the one-byte count that
   follows the name. */
2208 check_overflow (r, record, ofs, var_name_len + 1);
2209 var_name = recode_string_pool ("UTF-8", dict_encoding,
2210 (const char *) record->data + ofs,
2211 var_name_len, r->pool);
2212 ofs += var_name_len;
2214 /* Parse number of missing values. */
/* The count is a single unsigned byte; only 1...3 is accepted without a
   warning. */
2215 n_missing_values = ((const uint8_t *) record->data)[ofs];
2216 if (n_missing_values < 1 || n_missing_values > 3)
2217 sys_warn (r, record->pos + ofs,
2218 _("Long string missing values record says variable %s "
2219 "has %d missing values, but only 1 to 3 missing values "
2221 var_name, n_missing_values);
2224 /* Look up 'var' and validate. */
2225 var = dict_lookup_var (dict, var_name);
2227 sys_warn (r, record->pos + ofs,
2228 _("Ignoring long string missing value record for "
2229 "unknown variable %s."), var_name);
2230 else if (var_is_numeric (var))
2232 sys_warn (r, record->pos + ofs,
2233 _("Ignoring long string missing value record for "
2234 "numeric variable %s."), var_name);
/* When VAR is null, a placeholder width of 8 is used so that the values
   can still be walked. */
2239 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2240 for (i = 0; i < n_missing_values; i++)
2242 size_t value_length;
2244 /* Parse value length. */
2245 check_overflow (r, record, ofs, 4);
2246 value_length = parse_int (r, record->data, ofs);
2250 check_overflow (r, record, ofs, value_length);
/* mv_add_str returning false is reported as a bad value width (the
   first half of this condition is not visible in this excerpt). */
2253 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2255 sys_warn (r, record->pos + ofs,
2256 _("Ignoring long string missing value %zu for variable "
2257 "%s, with width %d, that has bad value width %zu."),
2258 i, var_get_name (var), var_get_width (var),
2260 ofs += value_length;
2263 var_set_missing_values (var, &mv);
/* Forward declarations for the case-reading helpers defined below.
   NOTE(review): the first declaration continues on a line that is not
   visible in this excerpt. */
2269 static void partial_record (struct sfm_reader *r)
2272 static void read_error (struct casereader *, const struct sfm_reader *);
/* Each of the following readers that returns bool is documented at its
   definition as returning true if successful, false if end of file is
   reached immediately. */
2274 static bool read_case_number (struct sfm_reader *, double *);
2275 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2276 static int read_opcode (struct sfm_reader *);
2277 static bool read_compressed_number (struct sfm_reader *, double *);
2278 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2279 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2280 static bool skip_whole_strings (struct sfm_reader *, size_t);
2282 /* Reads and returns one case from READER's file. Returns a null
2283 pointer if not successful. */
/* R_ is the struct sfm_reader for the file.  Each entry in r->sfm_vars
   describes one value to read into the case: a number when its width is
   0, otherwise a string segment. */
2284 static struct ccase *
2285 sys_file_casereader_read (struct casereader *reader, void *r_)
2287 struct sfm_reader *r = r_;
/* C is declared volatile, presumably so that its value is well defined
   if control returns through the setjmp below -- confirm. */
2288 struct ccase *volatile c;
2294 c = case_create (r->proto);
/* setjmp returns nonzero when something longjmps to r->bail_out; the
   reader is then put into an error state. */
2295 if (setjmp (r->bail_out))
2297 casereader_force_error (reader);
2302 for (i = 0; i < r->sfm_var_cnt; i++)
2304 struct sfm_var *sv = &r->sfm_vars[i];
2305 union value *v = case_data_rw_idx (c, sv->case_index);
2307 if (sv->var_width == 0)
2309 if (!read_case_number (r, &v->f))
/* String segment: read sv->segment_width bytes into the value at
   sv->offset, then skip the whole 8-byte units of padding that
   follow. */
2314 uint8_t *s = value_str_rw (v, sv->var_width);
2315 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2317 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
/* On this path (how it is reached is not visible in this excerpt), a
   read error is reported only when r->case_cnt is not -1. */
2326 if (r->case_cnt != -1)
2327 read_error (reader, r);
2332 /* Issues an error that R ends in a partial record. */
2334 partial_record (struct sfm_reader *r)
2336 sys_error (r, r->pos, _("File ends in partial case."));
2339 /* Issues an error that an unspecified error occurred SFM, and
2342 read_error (struct casereader *r, const struct sfm_reader *sfm)
2344 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2345 casereader_force_error (r);
2348 /* Reads a number from R and stores its value in *D.
2349 If R is compressed, reads a compressed number;
2350 otherwise, reads a number in the regular way.
2351 Returns true if successful, false if end of file is
2352 reached immediately. */
2354 read_case_number (struct sfm_reader *r, double *d)
2356 if (r->compression == SFM_COMP_NONE)
2359 if (!try_read_bytes (r, number, sizeof number))
2361 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2365 return read_compressed_number (r, d);
2368 /* Reads LENGTH string bytes from R into S.
2369 Always reads a multiple of 8 bytes; if LENGTH is not a
2370 multiple of 8, then extra bytes are read and discarded without
being written to S.
2372 Reads compressed strings if S is compressed.
2373 Returns true if successful, false if end of file is
2374 reached immediately. */
2376 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
/* WHOLE is the part of LENGTH that consists of complete 8-byte units;
   PARTIAL is the number of bytes left over. */
2378 size_t whole = ROUND_DOWN (length, 8);
2379 size_t partial = length % 8;
/* The complete units are read directly into S. */
2383 if (!read_whole_strings (r, s, whole))
/* The last, incomplete unit is read into the buffer BOUNCE, and only its
   first PARTIAL bytes are copied into S. */
2390 if (!read_whole_strings (r, bounce, sizeof bounce))
2396 memcpy (s + whole, bounce, partial);
2402 /* Reads and returns the next compression opcode from R. */
/* Opcodes are taken one at a time from the r->opcodes buffer.
   NOTE(review): the loop, the refill failure path and the return
   statements are not visible in this excerpt. */
2404 read_opcode (struct sfm_reader *r)
2406 assert (r->compression != SFM_COMP_NONE);
/* Refill the buffer with the next block of compressed bytes once every
   opcode in it has been consumed. */
2410 if (r->opcode_idx >= sizeof r->opcodes)
2412 if (!try_read_compressed_bytes (r, r->opcodes, sizeof r->opcodes))
2416 opcode = r->opcodes[r->opcode_idx++];
2423 /* Reads a compressed number from R and stores its value in D.
2424 Returns true if successful, false if end of file is
2425 reached immediately. */
2427 read_compressed_number (struct sfm_reader *r, double *d)
2429 int opcode = read_opcode (r);
2437 *d = read_compressed_float (r);
2441 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2442 if (!r->corruption_warning)
2444 r->corruption_warning = true;
2445 sys_warn (r, r->pos,
2446 _("Possible compressed data corruption: "
2447 "compressed spaces appear in numeric field."));
2456 *d = opcode - r->bias;
2463 /* Reads a compressed 8-byte string segment from R and stores it
in DST.
2465 Returns true if successful, false if end of file is
2466 reached immediately. */
/* NOTE(review): the switch on OPCODE and its case labels are not visible
   in this excerpt; the comments below describe only the visible
   statements. */
2468 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2470 int opcode = read_opcode (r);
/* 8 literal bytes follow in the compressed stream. */
2478 read_compressed_bytes (r, dst, 8);
/* A segment that consists of 8 spaces. */
2482 memset (dst, ' ', 8);
/* A compressed integer in a string field: store its value, opcode minus
   bias, in the file's floating-point format. */
2487 double value = opcode - r->bias;
2488 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2491 /* This has actually been seen "in the wild". The submitter of the
2492 file that showed that the contents decoded as spaces, but they
2493 were at the end of the field so it's possible that the null
2494 bytes just acted as null terminators. */
/* The corruption warning is issued at most once per reader. */
2496 else if (!r->corruption_warning)
2498 r->corruption_warning = true;
2499 sys_warn (r, r->pos,
2500 _("Possible compressed data corruption: "
2501 "string contains compressed integer (opcode %d)."),
2511 /* Reads LENGTH string bytes from R into S.
2512 LENGTH must be a multiple of 8.
2513 Reads compressed strings if S is compressed.
2514 Returns true if successful, false if end of file is
2515 reached immediately. */
2517 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2519 assert (length % 8 == 0);
/* Uncompressed data is read in a single call. */
2520 if (r->compression == SFM_COMP_NONE)
2521 return try_read_bytes (r, s, length);
/* Compressed data is decoded one 8-byte segment at a time.  What happens
   when a segment cannot be read is not visible in this excerpt. */
2525 for (ofs = 0; ofs < length; ofs += 8)
2526 if (!read_compressed_string (r, s + ofs))
/* Skips LENGTH string bytes from R by reading them into a scratch buffer
   with read_whole_strings().
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, the size of the scratch buffer, but
   that's only because the current caller never needs more than that many
   bytes.)
   Returns true if successful, false if end of file is
   reached immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];

  /* A LENGTH of exactly sizeof buffer still fits, so the bound is
     inclusive. */
  assert (length <= sizeof buffer);
  return read_whole_strings (r, buffer, length);
}
2550 /* Helpers for reading records that contain structured text
2553 /* Maximum number of warnings to issue for a single text
2555 #define MAX_TEXT_WARNINGS 5
2560 struct substring buffer; /* Record contents. */
2561 off_t start; /* Starting offset in file. */
2562 size_t pos; /* Current position in buffer. */
2563 int n_warnings; /* Number of warnings issued or suppressed. */
2564 bool recoded; /* Recoded into UTF-8? */
2567 static struct text_record *
2568 open_text_record (struct sfm_reader *r,
2569 const struct sfm_extension_record *record,
2570 bool recode_to_utf8)
2572 struct text_record *text;
2573 struct substring raw;
2575 text = pool_alloc (r->pool, sizeof *text);
2576 raw = ss_buffer (record->data, record->size * record->count);
2577 text->start = record->pos;
2578 text->buffer = (recode_to_utf8
2579 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2582 text->n_warnings = 0;
2583 text->recoded = recode_to_utf8;
2588 /* Closes TEXT, frees its storage, and issues a final warning
2589 about suppressed warnings if necessary. */
2591 close_text_record (struct sfm_reader *r, struct text_record *text)
2593 if (text->n_warnings > MAX_TEXT_WARNINGS)
2594 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2595 text->n_warnings - MAX_TEXT_WARNINGS);
2597 pool_free (r->pool, ss_data (text->buffer));
2600 /* Reads a variable=value pair from TEXT.
2601 Looks up the variable in DICT and stores it into *VAR.
2602 Stores a null-terminated value into *VALUE. */
2604 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2605 struct text_record *text,
2606 struct variable **var, char **value)
2610 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2613 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2617 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2618 ss_buffer ("\t\0", 2));
2626 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2627 struct text_record *text, struct substring delimiters,
2628 struct variable **var)
2632 name = text_get_token (text, delimiters, NULL);
2636 *var = dict_lookup_var (dict, name);
2640 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2647 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2648 struct text_record *text, struct substring delimiters,
2649 struct variable **var)
2651 char *short_name = text_get_token (text, delimiters, NULL);
2652 if (short_name == NULL)
2655 *var = dict_lookup_var (dict, short_name);
2657 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2662 /* Displays a warning for the current file position, limiting the
2663 number to MAX_TEXT_WARNINGS for TEXT. */
2665 text_warn (struct sfm_reader *r, struct text_record *text,
2666 const char *format, ...)
2668 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2672 va_start (args, format);
2673 sys_msg (r, text->start + text->pos, MW, format, args);
2679 text_get_token (struct text_record *text, struct substring delimiters,
2682 struct substring token;
2685 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2688 end = &ss_data (token)[ss_length (token)];
2689 if (delimiter != NULL)
2692 return ss_data (token);
2695 /* Reads a integer value expressed in decimal, then a space, then a string that
2696 consists of exactly as many bytes as specified by the integer, then a space,
2697 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2698 buffer (so the caller should not free the string). */
2700 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2708 while (text->pos < text->buffer.length)
2710 int c = text->buffer.string[text->pos];
2711 if (c < '0' || c > '9')
2713 n = (n * 10) + (c - '0');
2716 if (text->pos >= text->buffer.length || start == text->pos)
2718 sys_warn (r, text->start,
2719 _("Expecting digit at offset %zu in MRSETS record."),
2724 if (!text_match (text, ' '))
2726 sys_warn (r, text->start,
2727 _("Expecting space at offset %zu in MRSETS record."),
2732 if (text->pos + n > text->buffer.length)
2734 sys_warn (r, text->start,
2735 _("%zu-byte string starting at offset %zu "
2736 "exceeds record length %zu."),
2737 n, text->pos, text->buffer.length);
2741 s = &text->buffer.string[text->pos];
2744 sys_warn (r, text->start,
2745 _("Expecting space at offset %zu following %zu-byte string."),
2755 text_match (struct text_record *text, char c)
2757 if (text->buffer.string[text->pos] == c)
2766 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2767 inside the TEXT's string. */
2769 text_pos (const struct text_record *text)
2775 text_get_all (const struct text_record *text)
2777 return text->buffer.string;
2782 /* Displays a corruption message. */
2784 sys_msg (struct sfm_reader *r, off_t offset,
2785 int class, const char *format, va_list args)
2790 ds_init_empty (&text);
2792 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2793 fh_get_file_name (r->fh), (long long int) offset);
2795 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2796 ds_put_vformat (&text, format, args);
2798 m.category = msg_class_to_category (class);
2799 m.severity = msg_class_to_severity (class);
2805 m.text = ds_cstr (&text);
2810 /* Displays a warning for offset OFFSET in the file. */
2812 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2816 va_start (args, format);
2817 sys_msg (r, offset, MW, format, args);
2821 /* Displays an error for the current file position,
2822 marks it as in an error state,
2823 and aborts reading it using longjmp. */
2825 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2829 va_start (args, format);
2830 sys_msg (r, offset, ME, format, args);
2834 longjmp (r->bail_out, 1);
2837 /* Reads BYTE_CNT bytes into BUF.
2838 Returns true if exactly BYTE_CNT bytes are successfully read.
2839 Aborts if an I/O error or a partial read occurs.
2840 If EOF_IS_OK, then an immediate end-of-file causes false to be
2841 returned; otherwise, immediate end-of-file causes an abort
2844 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2845 void *buf, size_t byte_cnt)
2847 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2848 r->pos += bytes_read;
2849 if (bytes_read == byte_cnt)
2851 else if (ferror (r->file))
2852 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2853 else if (!eof_is_ok || bytes_read != 0)
2854 sys_error (r, r->pos, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   An I/O error or end of file, even an immediate one, aborts the read. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = false;

  /* With end-of-file disallowed, every failure aborts inside the helper, so
     its return value carries no information here. */
  read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
/* Attempts to read BYTE_CNT bytes from R into BUF.
   Returns true if exactly BYTE_CNT bytes were read, false if end of file
   was hit before any byte could be read.
   An I/O error or a partial read aborts. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;

  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
2877 /* Reads a 32-bit signed integer from R and returns its value in
2880 read_int (struct sfm_reader *r)
2883 read_bytes (r, integer, sizeof integer);
2884 return integer_get (r->integer_format, integer, sizeof integer);
2887 /* Reads a 64-bit signed integer from R and returns its value in
2889 static long long int
2890 read_int64 (struct sfm_reader *r)
2893 read_bytes (r, integer, sizeof integer);
2894 return integer_get (r->integer_format, integer, sizeof integer);
2898 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2900 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2904 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2906 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2909 /* Reads exactly SIZE - 1 bytes into BUFFER
2910 and stores a null byte into BUFFER[SIZE - 1]. */
2912 read_string (struct sfm_reader *r, char *buffer, size_t size)
2915 read_bytes (r, buffer, size - 1);
2916 buffer[size - 1] = '\0';
/* Skips BYTES bytes forward in R by reading them, in chunks, into a scratch
   buffer that is then discarded.  Aborts on I/O error or end of file. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char buffer[1024];
  size_t left;

  for (left = bytes; left > 0; )
    {
      size_t chunk = MIN (sizeof buffer, left);
      read_bytes (r, buffer, chunk);
      left -= chunk;
    }
}
2932 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
2933 been replaced by LFs.
2935 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
2936 files that use CR-only line ends in the file label and extra product
2939 fix_line_ends (const char *s)
2943 d = dst = xmalloc (strlen (s) + 1);
2962 read_ztrailer (struct sfm_reader *r,
2963 long long int zheader_ofs,
2964 long long int ztrailer_len);
2967 zalloc (voidpf pool_, uInt items, uInt size)
2969 struct pool *pool = pool_;
2971 return (!size || xalloc_oversized (items, size)
2973 : pool_malloc (pool, items * size));
2977 zfree (voidpf pool_, voidpf address)
2979 struct pool *pool = pool_;
2981 pool_free (pool, address);
2985 read_zheader (struct sfm_reader *r)
2988 long long int zheader_ofs = read_int64 (r);
2989 long long int ztrailer_ofs = read_int64 (r);
2990 long long int ztrailer_len = read_int64 (r);
2992 if (zheader_ofs != pos)
2993 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
2994 "(expected %#llx)."),
2995 zheader_ofs, (long long int) pos);
2997 if (ztrailer_ofs < r->pos)
2998 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3001 if (ztrailer_len < 24 || ztrailer_len % 24)
3002 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3004 r->ztrailer_ofs = ztrailer_ofs;
3005 read_ztrailer (r, zheader_ofs, ztrailer_len);
3007 if (r->zin_buf == NULL)
3009 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3010 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3011 r->zstream.next_in = NULL;
3012 r->zstream.avail_in = 0;
3015 r->zstream.zalloc = zalloc;
3016 r->zstream.zfree = zfree;
3017 r->zstream.opaque = r->pool;
3023 seek (struct sfm_reader *r, off_t offset)
3025 if (fseeko (r->file, offset, SEEK_SET))
3026 sys_error (r, 0, _("%s: seek failed (%s)."),
3027 fh_get_file_name (r->fh), strerror (errno));
3031 /* Performs some additional consistency checks on the ZLIB compressed data
3034 read_ztrailer (struct sfm_reader *r,
3035 long long int zheader_ofs,
3036 long long int ztrailer_len)
3038 long long int expected_uncmp_ofs;
3039 long long int expected_cmp_ofs;
3042 unsigned int block_size;
3043 unsigned int n_blocks;
3047 if (fstat (fileno (r->file), &s))
3048 sys_error (ME, 0, _("%s: stat failed (%s)."),
3049 fh_get_file_name (r->fh), strerror (errno));
3051 if (!S_ISREG (s.st_mode))
3053 /* We can't seek to the trailer and then back to the data in this file,
3054 so skip doing extra checks. */
3058 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3059 sys_warn (r, r->pos,
3060 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3061 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3063 seek (r, r->ztrailer_ofs);
3065 /* Read fixed header from ZLIB data trailer. */
3066 bias = read_int64 (r);
3067 if (-bias != r->bias)
3068 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3069 "file header bias (%.2f)."),
3072 zero = read_int64 (r);
3074 sys_warn (r, r->pos,
3075 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3077 block_size = read_int (r);
3078 if (block_size != ZBLOCK_SIZE)
3079 sys_warn (r, r->pos,
3080 _("ZLIB trailer specifies unexpected %u-byte block size."),
3083 n_blocks = read_int (r);
3084 if (n_blocks != (ztrailer_len - 24) / 24)
3085 sys_error (r, r->pos,
3086 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3088 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3090 expected_uncmp_ofs = zheader_ofs;
3091 expected_cmp_ofs = zheader_ofs + 24;
3092 for (i = 0; i < n_blocks; i++)
3094 off_t desc_ofs = r->pos;
3095 unsigned long long int uncompressed_ofs = read_int64 (r);
3096 unsigned long long int compressed_ofs = read_int64 (r);
3097 unsigned int uncompressed_size = read_int (r);
3098 unsigned int compressed_size = read_int (r);
3100 if (uncompressed_ofs != expected_uncmp_ofs)
3101 sys_error (r, desc_ofs,
3102 _("ZLIB block descriptor %u reported uncompressed data "
3103 "offset %#llx, when %#llx was expected."),
3104 i, uncompressed_ofs, expected_uncmp_ofs);
3106 if (compressed_ofs != expected_cmp_ofs)
3107 sys_error (r, desc_ofs,
3108 _("ZLIB block descriptor %u reported compressed data "
3109 "offset %#llx, when %#llx was expected."),
3110 i, compressed_ofs, expected_cmp_ofs);
3112 if (i < n_blocks - 1)
3114 if (uncompressed_size != block_size)
3115 sys_warn (r, desc_ofs,
3116 _("ZLIB block descriptor %u reported block size %#x, "
3117 "when %#x was expected."),
3118 i, uncompressed_size, block_size);
3122 if (uncompressed_size > block_size)
3123 sys_warn (r, desc_ofs,
3124 _("ZLIB block descriptor %u reported block size %#x, "
3125 "when at most %#x was expected."),
3126 i, uncompressed_size, block_size);
3129 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3130 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3131 This code checks for an expansion of more than 14.3% plus 11
3133 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3134 sys_error (r, desc_ofs,
3135 _("ZLIB block descriptor %u reports compressed size %u "
3136 "and uncompressed size %u."),
3137 i, compressed_size, uncompressed_size);
3139 expected_uncmp_ofs += uncompressed_size;
3140 expected_cmp_ofs += compressed_size;
3143 if (expected_cmp_ofs != r->ztrailer_ofs)
3144 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3145 "would be expected from block descriptors."),
3146 r->ztrailer_ofs, expected_cmp_ofs);
3148 seek (r, zheader_ofs + 24);
3152 open_zstream (struct sfm_reader *r)
3156 r->zout_pos = r->zout_end = 0;
3157 error = inflateInit (&r->zstream);
3159 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3164 close_zstream (struct sfm_reader *r)
3168 error = inflateEnd (&r->zstream);
3170 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3175 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3177 uint8_t *buf = buf_;
3186 /* Use already inflated data if there is any. */
3187 if (r->zout_pos < r->zout_end)
3189 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3190 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3199 /* We need to inflate some more data.
3200 Get some more input data if we don't have any. */
3201 if (r->zstream.avail_in == 0)
3203 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3204 if (n == 0 || !try_read_bytes (r, r->zin_buf, n))
3206 r->zstream.avail_in = n;
3207 r->zstream.next_in = r->zin_buf;
3210 /* Inflate the (remaining) input data. */
3211 r->zstream.avail_out = ZOUT_BUF_SIZE;
3212 r->zstream.next_out = r->zout_buf;
3213 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3215 r->zout_end = r->zstream.next_out - r->zout_buf;
3216 if (r->zout_end == 0)
3218 if (error == Z_STREAM_END)
3224 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3229 /* Process the output data and ignore 'error' for now. ZLIB will
3230 present it to us again on the next inflate() call. */
3236 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3238 if (r->compression == SFM_COMP_SIMPLE)
3239 return read_bytes (r, buf, byte_cnt);
3240 else if (!read_bytes_zlib (r, buf, byte_cnt))
3241 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3245 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3247 if (r->compression == SFM_COMP_SIMPLE)
3248 return try_read_bytes (r, buf, byte_cnt);
3250 return read_bytes_zlib (r, buf, byte_cnt);
3253 /* Reads a 64-bit floating-point number from R and returns its
3254 value in host format. */
3256 read_compressed_float (struct sfm_reader *r)
3259 read_compressed_bytes (r, number, sizeof number);
3260 return float_get_double (r->float_format, number);
3263 static const struct casereader_class sys_file_casereader_class =
3265 sys_file_casereader_read,
3266 sys_file_casereader_destroy,