1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
88 /* Fields from the top-level header record. */
89 struct sfm_header_record
91 char magic[5]; /* First 4 bytes of file, then null. */
92 int weight_idx; /* 0 if unweighted, otherwise a var index. */
93 int nominal_case_size; /* Number of var positions. */
95 /* These correspond to the members of struct sfm_file_info or a dictionary
96 but in the system file's encoding rather than ASCII. */
97 char creation_date[10]; /* "dd mmm yy". */
98 char creation_time[9]; /* "hh:mm:ss". */
99 char eye_catcher[61]; /* Eye-catcher string, then product name. */
100 char file_label[65]; /* File label. */
103 struct sfm_var_record
110 int missing_value_code;
113 struct variable *var;
116 struct sfm_value_label
122 struct sfm_value_label_record
125 struct sfm_value_label *labels;
132 struct sfm_document_record
139 struct sfm_extension_record
141 off_t pos; /* Starting offset in file. */
142 size_t size; /* Size of data elements. */
143 size_t count; /* Number of data elements. */
144 void *data; /* Contents. */
147 /* System file reader. */
150 /* Resource tracking. */
151 struct pool *pool; /* All system file state. */
152 jmp_buf bail_out; /* longjmp() target for error handling. */
155 struct file_handle *fh; /* File handle. */
156 struct fh_lock *lock; /* Mutual exclusion for file handle. */
157 FILE *file; /* File stream. */
158 off_t pos; /* Position in file. */
159 bool error; /* I/O or corruption error? */
160 struct caseproto *proto; /* Format of output cases. */
163 enum integer_format integer_format; /* On-disk integer format. */
164 enum float_format float_format; /* On-disk floating point format. */
165 struct sfm_var *sfm_vars; /* Variables. */
166 size_t sfm_var_cnt; /* Number of variables. */
167 casenumber case_cnt; /* Number of cases */
168 const char *encoding; /* String encoding. */
171 bool compressed; /* File is compressed? */
172 double bias; /* Compression bias, usually 100.0. */
173 uint8_t opcodes[8]; /* Current block of opcodes. */
174 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
175 bool corruption_warning; /* Warned about possible corruption? */
178 static const struct casereader_class sys_file_casereader_class;
180 static bool close_reader (struct sfm_reader *);
182 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
183 const struct sfm_var_record *,
186 static void sys_msg (struct sfm_reader *r, off_t, int class,
187 const char *format, va_list args)
188 PRINTF_FORMAT (4, 0);
189 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
190 PRINTF_FORMAT (3, 4);
191 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
195 static void read_bytes (struct sfm_reader *, void *, size_t);
196 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
197 static int read_int (struct sfm_reader *);
198 static double read_float (struct sfm_reader *);
199 static void read_string (struct sfm_reader *, char *, size_t);
200 static void skip_bytes (struct sfm_reader *, size_t);
202 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
203 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
205 static void read_variable_record (struct sfm_reader *,
206 struct sfm_var_record *);
207 static void read_value_label_record (struct sfm_reader *,
208 struct sfm_value_label_record *,
210 static struct sfm_document_record *read_document_record (struct sfm_reader *);
211 static struct sfm_extension_record *read_extension_record (
212 struct sfm_reader *, int subtype);
213 static void skip_extension_record (struct sfm_reader *, int subtype);
215 static const char *choose_encoding (
217 const struct sfm_header_record *,
218 const struct sfm_extension_record *ext_integer,
219 const struct sfm_extension_record *ext_encoding);
221 static struct text_record *open_text_record (
222 struct sfm_reader *, const struct sfm_extension_record *,
223 bool recode_to_utf8);
224 static void close_text_record (struct sfm_reader *,
225 struct text_record *);
226 static bool read_variable_to_value_pair (struct sfm_reader *,
228 struct text_record *,
229 struct variable **var, char **value);
230 static void text_warn (struct sfm_reader *r, struct text_record *text,
231 const char *format, ...)
232 PRINTF_FORMAT (3, 4);
233 static char *text_get_token (struct text_record *,
234 struct substring delimiters, char *delimiter);
235 static bool text_match (struct text_record *, char c);
236 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
237 struct text_record *,
238 struct substring delimiters,
240 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
241 struct text_record *,
242 struct substring delimiters,
244 static const char *text_parse_counted_string (struct sfm_reader *,
245 struct text_record *);
246 static size_t text_pos (const struct text_record *);
248 static bool close_reader (struct sfm_reader *r);
250 /* Dictionary reader. */
258 static void read_header (struct sfm_reader *, struct sfm_read_info *,
259 struct sfm_header_record *);
260 static void parse_header (struct sfm_reader *,
261 const struct sfm_header_record *,
262 struct sfm_read_info *, struct dictionary *);
263 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
264 struct sfm_var_record *, size_t n);
265 static void parse_format_spec (struct sfm_reader *, off_t pos,
266 unsigned int format, enum which_format,
267 struct variable *, int *format_warning_cnt);
268 static void parse_document (struct dictionary *, struct sfm_document_record *);
269 static void parse_display_parameters (struct sfm_reader *,
270 const struct sfm_extension_record *,
271 struct dictionary *);
272 static void parse_machine_integer_info (struct sfm_reader *,
273 const struct sfm_extension_record *,
274 struct sfm_read_info *);
275 static void parse_machine_float_info (struct sfm_reader *,
276 const struct sfm_extension_record *);
277 static void parse_mrsets (struct sfm_reader *,
278 const struct sfm_extension_record *,
279 struct dictionary *);
280 static void parse_long_var_name_map (struct sfm_reader *,
281 const struct sfm_extension_record *,
282 struct dictionary *);
283 static void parse_long_string_map (struct sfm_reader *,
284 const struct sfm_extension_record *,
285 struct dictionary *);
286 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
287 const struct sfm_var_record *,
289 const struct sfm_value_label_record *);
290 static void parse_data_file_attributes (struct sfm_reader *,
291 const struct sfm_extension_record *,
292 struct dictionary *);
293 static void parse_variable_attributes (struct sfm_reader *,
294 const struct sfm_extension_record *,
295 struct dictionary *);
296 static void parse_long_string_value_labels (struct sfm_reader *,
297 const struct sfm_extension_record *,
298 struct dictionary *);
300 /* Frees the strings inside INFO. */
302 sfm_read_info_destroy (struct sfm_read_info *info)
306 free (info->creation_date);
307 free (info->creation_time);
308 free (info->product);
312 /* Opens the system file designated by file handle FH for reading. Reads the
313 system file's dictionary into *DICT.
315 Ordinarily the reader attempts to automatically detect the character
316 encoding based on the file's contents. This isn't always possible,
317 especially for files written by old versions of SPSS or PSPP, so specifying
318 a nonnull ENCODING overrides the choice of character encoding.
320 If INFO is non-null, then it receives additional info about the system file,
321 which the caller must eventually free with sfm_read_info_destroy() when it
322 is no longer needed. */
324 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
325 struct dictionary **dictp, struct sfm_read_info *infop)
327 struct sfm_reader *volatile r = NULL;
328 struct sfm_read_info *volatile info;
330 struct sfm_header_record header;
332 struct sfm_var_record *vars;
333 size_t n_vars, allocated_vars;
335 struct sfm_value_label_record *labels;
336 size_t n_labels, allocated_labels;
338 struct sfm_document_record *document;
340 struct sfm_extension_record *extensions[32];
342 struct dictionary *volatile dict = NULL;
345 /* Create and initialize reader. */
346 r = pool_create_container (struct sfm_reader, pool);
352 r->opcode_idx = sizeof r->opcodes;
353 r->corruption_warning = false;
355 info = infop ? infop : xmalloc (sizeof *info);
356 memset (info, 0, sizeof *info);
358 /* TRANSLATORS: this fragment will be interpolated into
359 messages in fh_lock() that identify types of files. */
360 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
364 r->file = fn_open (fh_get_file_name (fh), "rb");
367 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
368 fh_get_file_name (r->fh), strerror (errno));
372 if (setjmp (r->bail_out))
376 read_header (r, info, &header);
379 n_vars = allocated_vars = 0;
382 n_labels = allocated_labels = 0;
386 memset (extensions, 0, sizeof extensions);
396 read_int (r); /* Skip filler. */
403 if (n_vars >= allocated_vars)
404 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
406 read_variable_record (r, &vars[n_vars++]);
410 if (n_labels >= allocated_labels)
411 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
413 read_value_label_record (r, &labels[n_labels++], n_vars);
417 /* A Type 4 record is always immediately after a type 3 record,
418 so the code for type 3 records reads the type 4 record too. */
419 sys_error (r, r->pos, _("Misplaced type 4 record."));
422 if (document != NULL)
423 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
424 document = read_document_record (r);
428 subtype = read_int (r);
429 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
432 _("Unrecognized record type 7, subtype %d. Please "
433 "send a copy of this file, and the syntax which "
434 "created it to %s."),
435 subtype, PACKAGE_BUGREPORT);
436 skip_extension_record (r, subtype);
438 else if (extensions[subtype] != NULL)
441 _("Record type 7, subtype %d found here has the same "
442 "type as the record found near offset 0x%llx. "
443 "Please send a copy of this file, and the syntax "
444 "which created it to %s."),
445 subtype, (long long int) extensions[subtype]->pos,
447 skip_extension_record (r, subtype);
450 extensions[subtype] = read_extension_record (r, subtype);
454 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
459 /* Now actually parse what we read.
461 First, figure out the correct character encoding, because this determines
462 how the rest of the header data is to be interpreted. */
463 dict = dict_create (encoding
465 : choose_encoding (r, &header, extensions[EXT_INTEGER],
466 extensions[EXT_ENCODING]));
467 r->encoding = dict_get_encoding (dict);
469 /* These records don't use variables at all. */
470 if (document != NULL)
471 parse_document (dict, document);
473 if (extensions[EXT_INTEGER] != NULL)
474 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
476 if (extensions[EXT_FLOAT] != NULL)
477 parse_machine_float_info (r, extensions[EXT_FLOAT]);
479 if (extensions[EXT_FILE_ATTRS] != NULL)
480 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
482 parse_header (r, &header, info, dict);
484 /* Parse the variable records, the basis of almost everything else. */
485 parse_variable_records (r, dict, vars, n_vars);
487 /* Parse value labels and the weight variable immediately after the variable
488 records. These records use indexes into var_recs[], so we must parse them
489 before those indexes become invalidated by very long string variables. */
490 for (i = 0; i < n_labels; i++)
491 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
492 if (header.weight_idx != 0)
494 struct variable *weight_var;
496 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
498 if (var_is_numeric (weight_var))
499 dict_set_weight (dict, weight_var);
501 sys_error (r, -1, _("Weighting variable must be numeric "
502 "(not string variable `%s')."),
503 var_get_name (weight_var));
506 if (extensions[EXT_DISPLAY] != NULL)
507 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
509 /* The following records use short names, so they need to be parsed before
510 parse_long_var_name_map() changes short names to long names. */
511 if (extensions[EXT_MRSETS] != NULL)
512 parse_mrsets (r, extensions[EXT_MRSETS], dict);
514 if (extensions[EXT_MRSETS2] != NULL)
515 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
517 if (extensions[EXT_LONG_STRINGS] != NULL)
518 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
520 /* Now rename variables to their long names. */
521 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
523 /* The following records use long names, so they need to follow renaming. */
524 if (extensions[EXT_VAR_ATTRS] != NULL)
525 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
527 if (extensions[EXT_LONG_LABELS] != NULL)
528 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
530 /* Warn if the actual amount of data per case differs from the
531 amount that the header claims. SPSS version 13 gets this
532 wrong when very long strings are involved, so don't warn in
534 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
535 && info->version_major != 13)
536 sys_warn (r, -1, _("File header claims %d variable positions but "
537 "%zu were read from file."),
538 header.nominal_case_size, n_vars);
540 /* Create an index of dictionary variable widths for
541 sfm_read_case to use. We cannot use the `struct variable's
542 from the dictionary we created, because the caller owns the
543 dictionary and may destroy or modify its variables. */
544 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
545 pool_register (r->pool, free, r->sfm_vars);
546 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
551 sfm_read_info_destroy (info);
555 return casereader_create_sequential
557 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
558 &sys_file_casereader_class, r);
563 sfm_read_info_destroy (info);
573 /* Closes a system file after we're done with it.
574 Returns true if an I/O error has occurred on READER, false
577 close_reader (struct sfm_reader *r)
586 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
588 msg (ME, _("Error closing system file `%s': %s."),
589 fh_get_file_name (r->fh), strerror (errno));
599 pool_destroy (r->pool);
604 /* Destroys READER. */
606 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
608 struct sfm_reader *r = r_;
612 /* Returns true if FILE is an SPSS system file,
615 sfm_detect (FILE *file)
619 if (fread (magic, 4, 1, file) != 1)
623 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
626 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
627 except for the string fields in *INFO, which parse_header() will initialize
628 later once the file's encoding is known. */
630 read_header (struct sfm_reader *r, struct sfm_read_info *info,
631 struct sfm_header_record *header)
633 uint8_t raw_layout_code[4];
636 read_string (r, header->magic, sizeof header->magic);
637 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
639 if (strcmp (ASCII_MAGIC, header->magic)
640 && strcmp (EBCDIC_MAGIC, header->magic))
641 sys_error (r, 0, _("This is not an SPSS system file."));
643 /* Identify integer format. */
644 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
645 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
647 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
649 || (r->integer_format != INTEGER_MSB_FIRST
650 && r->integer_format != INTEGER_LSB_FIRST))
651 sys_error (r, 64, _("This is not an SPSS system file."));
653 header->nominal_case_size = read_int (r);
654 if (header->nominal_case_size < 0
655 || header->nominal_case_size > INT_MAX / 16)
656 header->nominal_case_size = -1;
658 r->compressed = read_int (r) != 0;
660 header->weight_idx = read_int (r);
662 r->case_cnt = read_int (r);
663 if ( r->case_cnt > INT_MAX / 2)
666 /* Identify floating-point format and obtain compression bias. */
667 read_bytes (r, raw_bias, sizeof raw_bias);
668 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
670 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
672 if (memcmp (raw_bias, zero_bias, 8))
673 sys_warn (r, r->pos - 8,
674 _("Compression bias is not the usual "
675 "value of 100, or system file uses unrecognized "
676 "floating-point format."));
679 /* Some software is known to write all-zeros to this
680 field. Such software also writes floating-point
681 numbers in the format that we expect by default
682 (it seems that all software most likely does, in
683 reality), so don't warn in this case. */
686 if (r->integer_format == INTEGER_MSB_FIRST)
687 r->float_format = FLOAT_IEEE_DOUBLE_BE;
689 r->float_format = FLOAT_IEEE_DOUBLE_LE;
691 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
693 read_string (r, header->creation_date, sizeof header->creation_date);
694 read_string (r, header->creation_time, sizeof header->creation_time);
695 read_string (r, header->file_label, sizeof header->file_label);
698 info->integer_format = r->integer_format;
699 info->float_format = r->float_format;
700 info->compressed = r->compressed;
701 info->case_cnt = r->case_cnt;
704 /* Reads a variable (type 2) record from R into RECORD. */
706 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
708 int has_variable_label;
710 memset (record, 0, sizeof *record);
712 record->pos = r->pos;
713 record->width = read_int (r);
714 has_variable_label = read_int (r);
715 record->missing_value_code = read_int (r);
716 record->print_format = read_int (r);
717 record->write_format = read_int (r);
718 read_bytes (r, record->name, sizeof record->name);
720 if (has_variable_label == 1)
722 enum { MAX_LABEL_LEN = 255 };
723 size_t len, read_len;
727 /* Read up to MAX_LABEL_LEN bytes of label. */
728 read_len = MIN (MAX_LABEL_LEN, len);
729 record->label = pool_malloc (r->pool, read_len + 1);
730 read_string (r, record->label, read_len + 1);
732 /* Skip unread label bytes. */
733 skip_bytes (r, len - read_len);
735 /* Skip label padding up to multiple of 4 bytes. */
736 skip_bytes (r, ROUND_UP (len, 4) - len);
738 else if (has_variable_label != 0)
739 sys_error (r, record->pos,
740 _("Variable label indicator field is not 0 or 1."));
742 /* Set missing values. */
743 if (record->missing_value_code != 0)
745 int code = record->missing_value_code;
746 if (record->width == 0)
748 if (code < -3 || code > 3 || code == -1)
749 sys_error (r, record->pos,
750 _("Numeric missing value indicator field is not "
751 "-3, -2, 0, 1, 2, or 3."));
755 if (code < 1 || code > 3)
756 sys_error (r, record->pos,
757 _("String missing value indicator field is not "
761 read_bytes (r, record->missing, 8 * abs (code));
765 /* Reads value labels from R into RECORD. */
767 read_value_label_record (struct sfm_reader *r,
768 struct sfm_value_label_record *record,
773 /* Read type 3 record. */
774 record->pos = r->pos;
775 record->n_labels = read_int (r);
776 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
777 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
779 record->labels = pool_nmalloc (r->pool, record->n_labels,
780 sizeof *record->labels);
781 for (i = 0; i < record->n_labels; i++)
783 struct sfm_value_label *label = &record->labels[i];
784 unsigned char label_len;
787 read_bytes (r, label->value, sizeof label->value);
789 /* Read label length. */
790 read_bytes (r, &label_len, sizeof label_len);
791 padded_len = ROUND_UP (label_len + 1, 8);
793 /* Read label, padding. */
794 label->label = pool_malloc (r->pool, padded_len + 1);
795 read_bytes (r, label->label, padded_len - 1);
796 label->label[label_len] = '\0';
799 /* Read record type of type 4 record. */
800 if (read_int (r) != 4)
801 sys_error (r, r->pos - 4,
802 _("Variable index record (type 4) does not immediately "
803 "follow value label record (type 3) as it should."));
805 /* Read number of variables associated with value label from type 4
807 record->n_vars = read_int (r);
808 if (record->n_vars < 1 || record->n_vars > n_vars)
809 sys_error (r, r->pos - 4,
810 _("Number of variables associated with a value label (%zu) "
811 "is not between 1 and the number of variables (%zu)."),
812 record->n_vars, n_vars);
813 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
814 for (i = 0; i < record->n_vars; i++)
815 record->vars[i] = read_int (r);
818 /* Reads a document record from R and returns it. */
819 static struct sfm_document_record *
820 read_document_record (struct sfm_reader *r)
822 struct sfm_document_record *record;
825 record = pool_malloc (r->pool, sizeof *record);
826 record->pos = r->pos;
828 n_lines = read_int (r);
829 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
830 sys_error (r, record->pos,
831 _("Number of document lines (%d) "
832 "must be greater than 0 and less than %d."),
833 n_lines, INT_MAX / DOC_LINE_LENGTH);
835 record->n_lines = n_lines;
836 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
837 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
843 read_extension_record_header (struct sfm_reader *r, int subtype,
844 struct sfm_extension_record *record)
846 record->pos = r->pos;
847 record->size = read_int (r);
848 record->count = read_int (r);
850 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
851 allows an extra byte for a null terminator, used by some
852 extension processing routines. */
853 if (record->size != 0
854 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
855 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
858 /* Reads an extension record from R into RECORD. */
859 static struct sfm_extension_record *
860 read_extension_record (struct sfm_reader *r, int subtype)
862 struct extension_record_type
869 static const struct extension_record_type types[] =
871 /* Implemented record types. */
872 { EXT_INTEGER, 4, 8 },
874 { EXT_MRSETS, 1, 0 },
875 { EXT_DISPLAY, 4, 0 },
876 { EXT_LONG_NAMES, 1, 0 },
877 { EXT_LONG_STRINGS, 1, 0 },
878 { EXT_NCASES, 8, 2 },
879 { EXT_FILE_ATTRS, 1, 0 },
880 { EXT_VAR_ATTRS, 1, 0 },
881 { EXT_MRSETS2, 1, 0 },
882 { EXT_ENCODING, 1, 0 },
883 { EXT_LONG_LABELS, 1, 0 },
885 /* Ignored record types. */
886 { EXT_VAR_SETS, 0, 0 },
888 { EXT_DATA_ENTRY, 0, 0 },
891 const struct extension_record_type *type;
892 struct sfm_extension_record *record;
895 record = pool_malloc (r->pool, sizeof *record);
896 read_extension_record_header (r, subtype, record);
897 n_bytes = record->count * record->size;
899 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
900 if (subtype == type->subtype)
902 if (type->size > 0 && record->size != type->size)
903 sys_warn (r, record->pos,
904 _("Record type 7, subtype %d has bad size %zu "
905 "(expected %d)."), subtype, record->size, type->size);
906 else if (type->count > 0 && record->count != type->count)
907 sys_warn (r, record->pos,
908 _("Record type 7, subtype %d has bad count %zu "
909 "(expected %d)."), subtype, record->count, type->count);
910 else if (type->count == 0 && type->size == 0)
912 /* Ignore this record. */
916 char *data = pool_malloc (r->pool, n_bytes + 1);
917 data[n_bytes] = '\0';
920 read_bytes (r, record->data, n_bytes);
927 sys_warn (r, record->pos,
928 _("Unrecognized record type 7, subtype %d. Please send a "
929 "copy of this file, and the syntax which created it to %s."),
930 subtype, PACKAGE_BUGREPORT);
933 skip_bytes (r, n_bytes);
938 skip_extension_record (struct sfm_reader *r, int subtype)
940 struct sfm_extension_record record;
942 read_extension_record_header (r, subtype, &record);
943 skip_bytes (r, record.count * record.size);
947 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
948 struct sfm_read_info *info, struct dictionary *dict)
950 const char *dict_encoding = dict_get_encoding (dict);
951 struct substring product;
952 struct substring label;
954 /* Convert file label to UTF-8 and put it into DICT. */
955 label = recode_substring_pool ("UTF-8", dict_encoding,
956 ss_cstr (header->file_label), r->pool);
957 ss_trim (&label, ss_cstr (" "));
958 label.string[label.length] = '\0';
959 dict_set_label (dict, label.string);
961 /* Put creation date and time in UTF-8 into INFO. */
962 info->creation_date = recode_string ("UTF-8", dict_encoding,
963 header->creation_date, -1);
964 info->creation_time = recode_string ("UTF-8", dict_encoding,
965 header->creation_time, -1);
967 /* Put product name into INFO, dropping eye-catcher string if present. */
968 product = recode_substring_pool ("UTF-8", dict_encoding,
969 ss_cstr (header->eye_catcher), r->pool);
970 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
971 ss_trim (&product, ss_cstr (" "));
972 info->product = ss_xstrdup (product);
975 /* Reads a variable (type 2) record from R and adds the
976 corresponding variable to DICT.
977 Also skips past additional variable records for long string
980 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
981 struct sfm_var_record *var_recs, size_t n_var_recs)
983 const char *dict_encoding = dict_get_encoding (dict);
984 struct sfm_var_record *rec;
987 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
989 struct variable *var;
994 name = recode_string_pool ("UTF-8", dict_encoding,
995 rec->name, 8, r->pool);
996 name[strcspn (name, " ")] = '\0';
998 if (!dict_id_is_valid (dict, name, false)
999 || name[0] == '$' || name[0] == '#')
1000 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1002 if (rec->width < 0 || rec->width > 255)
1003 sys_error (r, rec->pos,
1004 _("Bad width %d for variable %s."), rec->width, name);
1006 var = rec->var = dict_create_var (dict, name, rec->width);
1008 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1010 /* Set the short name the same as the long name. */
1011 var_set_short_name (var, 0, name);
1013 /* Get variable label, if any. */
1018 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1019 rec->label, -1, r->pool);
1020 var_set_label (var, utf8_label, false);
1023 /* Set missing values. */
1024 if (rec->missing_value_code != 0)
1026 int width = var_get_width (var);
1027 struct missing_values mv;
1029 mv_init_pool (r->pool, &mv, width);
1030 if (var_is_numeric (var))
1032 bool has_range = rec->missing_value_code < 0;
1033 int n_discrete = (has_range
1034 ? rec->missing_value_code == -3
1035 : rec->missing_value_code);
1040 double low = parse_float (r, rec->missing, 0);
1041 double high = parse_float (r, rec->missing, 8);
1042 mv_add_range (&mv, low, high);
1046 for (i = 0; i < n_discrete; i++)
1048 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1056 value_init_pool (r->pool, &value, width);
1057 value_set_missing (&value, width);
1058 for (i = 0; i < rec->missing_value_code; i++)
1060 uint8_t *s = value_str_rw (&value, width);
1061 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1062 mv_add_str (&mv, s);
1065 var_set_missing_values (var, &mv);
1069 parse_format_spec (r, rec->pos + 12, rec->print_format,
1070 PRINT_FORMAT, var, &n_warnings);
1071 parse_format_spec (r, rec->pos + 16, rec->write_format,
1072 WRITE_FORMAT, var, &n_warnings);
1074 /* Account for values.
1075 Skip long string continuation records, if any. */
1076 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1077 for (i = 1; i < n_values; i++)
1078 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1079 sys_error (r, rec->pos, _("Missing string continuation record."));
1084 /* Translates the format spec from sysfile format to internal
1087 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1088 enum which_format which, struct variable *v,
1091 const int max_warnings = 8;
1092 uint8_t raw_type = format >> 16;
1093 uint8_t w = format >> 8;
1102 ok = (fmt_from_io (raw_type, &f.type)
1103 && fmt_check_output (&f)
1104 && fmt_check_width_compat (&f, var_get_width (v)));
1109 if (which == PRINT_FORMAT)
1110 var_set_print_format (v, &f);
1112 var_set_write_format (v, &f);
1114 else if (format == 0)
1116 /* Actually observed in the wild. No point in warning about it. */
1118 else if (++*n_warnings <= max_warnings)
1120 if (which == PRINT_FORMAT)
1121 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1123 var_get_name (v), var_get_width (v), format);
1125 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1127 var_get_name (v), var_get_width (v), format);
1129 if (*n_warnings == max_warnings)
1130 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1135 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1139 for (p = record->documents;
1140 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1141 p += DOC_LINE_LENGTH)
1143 struct substring line;
1145 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1146 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1147 ss_rtrim (&line, ss_cstr (" "));
1148 line.string[line.length] = '\0';
1150 dict_add_document_line (dict, line.string, false);
1156 /* Parses record type 7, subtype 3. */
1158 parse_machine_integer_info (struct sfm_reader *r,
1159 const struct sfm_extension_record *record,
1160 struct sfm_read_info *info)
1162 int float_representation, expected_float_format;
1163 int integer_representation, expected_integer_format;
1165 /* Save version info. */
1166 info->version_major = parse_int (r, record->data, 0);
1167 info->version_minor = parse_int (r, record->data, 4);
1168 info->version_revision = parse_int (r, record->data, 8);
1170 /* Check floating point format. */
1171 float_representation = parse_int (r, record->data, 16);
1172 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1173 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1174 expected_float_format = 1;
1175 else if (r->float_format == FLOAT_Z_LONG)
1176 expected_float_format = 2;
1177 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1178 expected_float_format = 3;
1181 if (float_representation != expected_float_format)
1182 sys_error (r, record->pos, _("Floating-point representation indicated by "
1183 "system file (%d) differs from expected (%d)."),
1184 float_representation, expected_float_format);
1186 /* Check integer format. */
1187 integer_representation = parse_int (r, record->data, 24);
1188 if (r->integer_format == INTEGER_MSB_FIRST)
1189 expected_integer_format = 1;
1190 else if (r->integer_format == INTEGER_LSB_FIRST)
1191 expected_integer_format = 2;
1194 if (integer_representation != expected_integer_format)
1195 sys_warn (r, record->pos,
1196 _("Integer format indicated by system file (%d) "
1197 "differs from expected (%d)."),
1198 integer_representation, expected_integer_format);
1203 choose_encoding (struct sfm_reader *r,
1204 const struct sfm_header_record *header,
1205 const struct sfm_extension_record *ext_integer,
1206 const struct sfm_extension_record *ext_encoding)
1208 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1211 return ext_encoding->data;
1213 /* But EXT_INTEGER is better than nothing as a fallback. */
1216 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1217 const char *encoding;
1226 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1227 respectively. However, there are known to be many files in the wild
1228 with character code 2, yet have data which are clearly not ASCII.
1229 Therefore we ignore these values. */
1236 encoding = sys_get_encoding_from_codepage (codepage);
1237 if (encoding != NULL)
1243 /* If the file magic number is EBCDIC then its character data is too. */
1244 if (!strcmp (header->magic, EBCDIC_MAGIC))
1247 return locale_charset ();
1250 /* Parses record type 7, subtype 4. */
1252 parse_machine_float_info (struct sfm_reader *r,
1253 const struct sfm_extension_record *record)
1255 double sysmis = parse_float (r, record->data, 0);
1256 double highest = parse_float (r, record->data, 8);
1257 double lowest = parse_float (r, record->data, 16);
1259 if (sysmis != SYSMIS)
1260 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1263 if (highest != HIGHEST)
1264 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1265 highest, "HIGHEST");
1267 if (lowest != LOWEST)
1268 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1272 /* Parses record type 7, subtype 7 or 19. */
1274 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1275 struct dictionary *dict)
1277 struct text_record *text;
1278 struct mrset *mrset;
1280 text = open_text_record (r, record, false);
1283 const char *counted = NULL;
1286 struct stringi_set var_names;
1287 size_t allocated_vars;
1291 mrset = xzalloc (sizeof *mrset);
1293 name = text_get_token (text, ss_cstr ("="), NULL);
1296 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1298 if (mrset->name[0] != '$')
1300 sys_warn (r, record->pos,
1301 _("`%s' does not begin with `$' at offset %zu "
1302 "in MRSETS record."), mrset->name, text_pos (text));
1306 if (text_match (text, 'C'))
1308 mrset->type = MRSET_MC;
1309 if (!text_match (text, ' '))
1311 sys_warn (r, record->pos,
1312 _("Missing space following `%c' at offset %zu "
1313 "in MRSETS record."), 'C', text_pos (text));
1317 else if (text_match (text, 'D'))
1319 mrset->type = MRSET_MD;
1320 mrset->cat_source = MRSET_VARLABELS;
1322 else if (text_match (text, 'E'))
1326 mrset->type = MRSET_MD;
1327 mrset->cat_source = MRSET_COUNTEDVALUES;
1328 if (!text_match (text, ' '))
1330 sys_warn (r, record->pos,
1331 _("Missing space following `%c' at offset %zu "
1332 "in MRSETS record."), 'E', text_pos (text));
1336 number = text_get_token (text, ss_cstr (" "), NULL);
1337 if (!strcmp (number, "11"))
1338 mrset->label_from_var_label = true;
1339 else if (strcmp (number, "1"))
1340 sys_warn (r, record->pos,
1341 _("Unexpected label source value `%s' following `E' "
1342 "at offset %zu in MRSETS record."),
1343 number, text_pos (text));
1347 sys_warn (r, record->pos,
1348 _("Missing `C', `D', or `E' at offset %zu "
1349 "in MRSETS record."),
1354 if (mrset->type == MRSET_MD)
1356 counted = text_parse_counted_string (r, text);
1357 if (counted == NULL)
1361 label = text_parse_counted_string (r, text);
1364 if (label[0] != '\0')
1365 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1367 stringi_set_init (&var_names);
1372 const char *raw_var_name;
1373 struct variable *var;
1376 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1377 if (raw_var_name == NULL)
1379 sys_warn (r, record->pos,
1380 _("Missing new-line parsing variable names "
1381 "at offset %zu in MRSETS record."),
1385 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1387 var = dict_lookup_var (dict, var_name);
1393 if (!stringi_set_insert (&var_names, var_name))
1395 sys_warn (r, record->pos,
1396 _("Duplicate variable name %s "
1397 "at offset %zu in MRSETS record."),
1398 var_name, text_pos (text));
1404 if (mrset->label == NULL && mrset->label_from_var_label
1405 && var_has_label (var))
1406 mrset->label = xstrdup (var_get_label (var));
1409 && var_get_type (var) != var_get_type (mrset->vars[0]))
1411 sys_warn (r, record->pos,
1412 _("MRSET %s contains both string and "
1413 "numeric variables."), name);
1416 width = MIN (width, var_get_width (var));
1418 if (mrset->n_vars >= allocated_vars)
1419 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1420 sizeof *mrset->vars);
1421 mrset->vars[mrset->n_vars++] = var;
1423 while (delimiter != '\n');
1425 if (mrset->n_vars < 2)
1427 sys_warn (r, record->pos,
1428 _("MRSET %s has only %zu variables."), mrset->name,
1430 mrset_destroy (mrset);
1431 stringi_set_destroy (&var_names);
1435 if (mrset->type == MRSET_MD)
1437 mrset->width = width;
1438 value_init (&mrset->counted, width);
1440 mrset->counted.f = strtod (counted, NULL);
1442 value_copy_str_rpad (&mrset->counted, width,
1443 (const uint8_t *) counted, ' ');
1446 dict_add_mrset (dict, mrset);
1448 stringi_set_destroy (&var_names);
1450 mrset_destroy (mrset);
1451 close_text_record (r, text);
1454 /* Read record type 7, subtype 11, which specifies how variables
1455 should be displayed in GUI environments. */
1457 parse_display_parameters (struct sfm_reader *r,
1458 const struct sfm_extension_record *record,
1459 struct dictionary *dict)
1461 bool includes_width;
1462 bool warned = false;
1467 n_vars = dict_get_var_cnt (dict);
1468 if (record->count == 3 * n_vars)
1469 includes_width = true;
1470 else if (record->count == 2 * n_vars)
1471 includes_width = false;
1474 sys_warn (r, record->pos,
1475 _("Extension 11 has bad count %zu (for %zu variables)."),
1476 record->count, n_vars);
1481 for (i = 0; i < n_vars; ++i)
1483 struct variable *v = dict_get_var (dict, i);
1484 int measure, width, align;
1486 measure = parse_int (r, record->data, ofs);
1491 width = parse_int (r, record->data, ofs);
1497 align = parse_int (r, record->data, ofs);
1500 /* SPSS 14 sometimes seems to set string variables' measure
1502 if (0 == measure && var_is_alpha (v))
1505 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1508 sys_warn (r, record->pos,
1509 _("Invalid variable display parameters for variable "
1510 "%zu (%s). Default parameters substituted."),
1511 i, var_get_name (v));
1516 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1517 : measure == 2 ? MEASURE_ORDINAL
1519 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1520 : align == 1 ? ALIGN_RIGHT
1523 /* Older versions (SPSS 9.0) sometimes set the display
1524 width to zero. This causes confusion in the GUI, so
1525 only set the width if it is nonzero. */
1527 var_set_display_width (v, width);
1532 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1533 const char *new_name)
1535 size_t n_short_names;
1539 /* Renaming a variable may clear its short names, but we
1540 want to retain them, so we save them and re-set them
1542 n_short_names = var_get_short_name_cnt (var);
1543 short_names = xnmalloc (n_short_names, sizeof *short_names);
1544 for (i = 0; i < n_short_names; i++)
1546 const char *s = var_get_short_name (var, i);
1547 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1550 /* Set long name. */
1551 dict_rename_var (dict, var, new_name);
1553 /* Restore short names. */
1554 for (i = 0; i < n_short_names; i++)
1556 var_set_short_name (var, i, short_names[i]);
1557 free (short_names[i]);
1562 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1563 to each short name. Modifies variable names in DICT accordingly. */
1565 parse_long_var_name_map (struct sfm_reader *r,
1566 const struct sfm_extension_record *record,
1567 struct dictionary *dict)
1569 struct text_record *text;
1570 struct variable *var;
1575 /* Convert variable names to lowercase. */
1578 for (i = 0; i < dict_get_var_cnt (dict); i++)
1580 struct variable *var = dict_get_var (dict, i);
1583 new_name = xstrdup (var_get_name (var));
1584 str_lowercase (new_name);
1586 rename_var_and_save_short_names (dict, var, new_name);
1594 /* Rename each of the variables, one by one. (In a correctly constructed
1595 system file, this cannot create any intermediate duplicate variable names,
1596 because all of the new variable names are longer than any of the old
1597 variable names and thus there cannot be any overlaps.) */
1598 text = open_text_record (r, record, true);
1599 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1601 /* Validate long name. */
1602 if (!dict_id_is_valid (dict, long_name, false))
1604 sys_warn (r, record->pos,
1605 _("Long variable mapping from %s to invalid "
1606 "variable name `%s'."),
1607 var_get_name (var), long_name);
1611 /* Identify any duplicates. */
1612 if (strcasecmp (var_get_short_name (var, 0), long_name)
1613 && dict_lookup_var (dict, long_name) != NULL)
1615 sys_warn (r, record->pos,
1616 _("Duplicate long variable name `%s'."), long_name);
1620 rename_var_and_save_short_names (dict, var, long_name);
1622 close_text_record (r, text);
1625 /* Reads record type 7, subtype 14, which gives the real length
1626 of each very long string. Rearranges DICT accordingly. */
1628 parse_long_string_map (struct sfm_reader *r,
1629 const struct sfm_extension_record *record,
1630 struct dictionary *dict)
1632 struct text_record *text;
1633 struct variable *var;
1636 text = open_text_record (r, record, true);
1637 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1639 size_t idx = var_get_dict_index (var);
1645 length = strtol (length_s, NULL, 10);
1646 if (length < 1 || length > MAX_STRING)
1648 sys_warn (r, record->pos,
1649 _("%s listed as string of invalid length %s "
1650 "in very long string record."),
1651 var_get_name (var), length_s);
1655 /* Check segments. */
1656 segment_cnt = sfm_width_to_segments (length);
1657 if (segment_cnt == 1)
1659 sys_warn (r, record->pos,
1660 _("%s listed in very long string record with width %s, "
1661 "which requires only one segment."),
1662 var_get_name (var), length_s);
1665 if (idx + segment_cnt > dict_get_var_cnt (dict))
1666 sys_error (r, record->pos,
1667 _("Very long string %s overflows dictionary."),
1668 var_get_name (var));
1670 /* Get the short names from the segments and check their
1672 for (i = 0; i < segment_cnt; i++)
1674 struct variable *seg = dict_get_var (dict, idx + i);
1675 int alloc_width = sfm_segment_alloc_width (length, i);
1676 int width = var_get_width (seg);
1679 var_set_short_name (var, i, var_get_short_name (seg, 0));
1680 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1681 sys_error (r, record->pos,
1682 _("Very long string with width %ld has segment %d "
1683 "of width %d (expected %d)."),
1684 length, i, width, alloc_width);
1686 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1687 var_set_width (var, length);
1689 close_text_record (r, text);
1690 dict_compact_values (dict);
1694 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1695 const struct sfm_var_record *var_recs, size_t n_var_recs,
1696 const struct sfm_value_label_record *record)
1698 struct variable **vars;
1702 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1703 for (i = 0; i < record->n_labels; i++)
1704 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1705 record->labels[i].label, -1,
1708 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1709 for (i = 0; i < record->n_vars; i++)
1710 vars[i] = lookup_var_by_index (r, record->pos,
1711 var_recs, n_var_recs, record->vars[i]);
1713 for (i = 1; i < record->n_vars; i++)
1714 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1715 sys_error (r, record->pos,
1716 _("Variables associated with value label are not all of "
1717 "identical type. Variable %s is %s, but variable "
1719 var_get_name (vars[0]),
1720 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1721 var_get_name (vars[i]),
1722 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1724 for (i = 0; i < record->n_vars; i++)
1726 struct variable *var = vars[i];
1730 width = var_get_width (var);
1732 sys_error (r, record->pos,
1733 _("Value labels may not be added to long string "
1734 "variables (e.g. %s) using records types 3 and 4."),
1735 var_get_name (var));
1737 for (j = 0; j < record->n_labels; j++)
1739 struct sfm_value_label *label = &record->labels[j];
1742 value_init (&value, width);
1744 value.f = parse_float (r, label->value, 0);
1746 memcpy (value_str_rw (&value, width), label->value, width);
1748 if (!var_add_value_label (var, &value, utf8_labels[j]))
1750 if (var_is_numeric (var))
1751 sys_warn (r, record->pos,
1752 _("Duplicate value label for %g on %s."),
1753 value.f, var_get_name (var));
1755 sys_warn (r, record->pos,
1756 _("Duplicate value label for `%.*s' on %s."),
1757 width, value_str (&value, width),
1758 var_get_name (var));
1761 value_destroy (&value, width);
1765 pool_free (r->pool, vars);
1766 for (i = 0; i < record->n_labels; i++)
1767 pool_free (r->pool, utf8_labels[i]);
1768 pool_free (r->pool, utf8_labels);
1771 static struct variable *
1772 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1773 const struct sfm_var_record *var_recs, size_t n_var_recs,
1776 const struct sfm_var_record *rec;
1778 if (idx < 1 || idx > n_var_recs)
1780 sys_error (r, offset,
1781 _("Variable index %d not in valid range 1...%zu."),
1786 rec = &var_recs[idx - 1];
1787 if (rec->var == NULL)
1789 sys_error (r, offset,
1790 _("Variable index %d refers to long string continuation."),
1798 /* Parses a set of custom attributes from TEXT into ATTRS.
1799 ATTRS may be a null pointer, in which case the attributes are
1800 read but discarded. */
1802 parse_attributes (struct sfm_reader *r, struct text_record *text,
1803 struct attrset *attrs)
1807 struct attribute *attr;
1811 /* Parse the key. */
1812 key = text_get_token (text, ss_cstr ("("), NULL);
1816 attr = attribute_create (key);
1817 for (index = 1; ; index++)
1819 /* Parse the value. */
1823 value = text_get_token (text, ss_cstr ("\n"), NULL);
1826 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1831 length = strlen (value);
1832 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1834 value[length - 1] = '\0';
1835 attribute_add_value (attr, value + 1);
1840 _("Attribute value %s[%d] is not quoted: %s."),
1842 attribute_add_value (attr, value);
1845 /* Was this the last value for this attribute? */
1846 if (text_match (text, ')'))
1850 attrset_add (attrs, attr);
1852 attribute_destroy (attr);
1854 while (!text_match (text, '/'));
1857 /* Reads record type 7, subtype 17, which lists custom
1858 attributes on the data file. */
1860 parse_data_file_attributes (struct sfm_reader *r,
1861 const struct sfm_extension_record *record,
1862 struct dictionary *dict)
1864 struct text_record *text = open_text_record (r, record, true);
1865 parse_attributes (r, text, dict_get_attributes (dict));
1866 close_text_record (r, text);
1869 /* Parses record type 7, subtype 18, which lists custom
1870 attributes on individual variables. */
1872 parse_variable_attributes (struct sfm_reader *r,
1873 const struct sfm_extension_record *record,
1874 struct dictionary *dict)
1876 struct text_record *text;
1877 struct variable *var;
1879 text = open_text_record (r, record, true);
1880 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1881 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1882 close_text_record (r, text);
1886 check_overflow (struct sfm_reader *r,
1887 const struct sfm_extension_record *record,
1888 size_t ofs, size_t length)
1890 size_t end = record->size * record->count;
1891 if (length >= end || ofs + length > end)
1892 sys_error (r, record->pos + end,
1893 _("Long string value label record ends unexpectedly."));
1897 parse_long_string_value_labels (struct sfm_reader *r,
1898 const struct sfm_extension_record *record,
1899 struct dictionary *dict)
1901 const char *dict_encoding = dict_get_encoding (dict);
1902 size_t end = record->size * record->count;
1909 struct variable *var;
1914 /* Parse variable name length. */
1915 check_overflow (r, record, ofs, 4);
1916 var_name_len = parse_int (r, record->data, ofs);
1919 /* Parse variable name, width, and number of labels. */
1920 check_overflow (r, record, ofs, var_name_len + 8);
1921 var_name = recode_string_pool ("UTF-8", dict_encoding,
1922 (const char *) record->data + ofs,
1923 var_name_len, r->pool);
1924 width = parse_int (r, record->data, ofs + var_name_len);
1925 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1926 ofs += var_name_len + 8;
1928 /* Look up 'var' and validate. */
1929 var = dict_lookup_var (dict, var_name);
1931 sys_warn (r, record->pos + ofs,
1932 _("Ignoring long string value record for "
1933 "unknown variable %s."), var_name);
1934 else if (var_is_numeric (var))
1936 sys_warn (r, record->pos + ofs,
1937 _("Ignoring long string value record for "
1938 "numeric variable %s."), var_name);
1941 else if (width != var_get_width (var))
1943 sys_warn (r, record->pos + ofs,
1944 _("Ignoring long string value record for variable %s "
1945 "because the record's width (%d) does not match the "
1946 "variable's width (%d)."),
1947 var_name, width, var_get_width (var));
1952 value_init_pool (r->pool, &value, width);
1953 for (i = 0; i < n_labels; i++)
1955 size_t value_length, label_length;
1956 bool skip = var == NULL;
1958 /* Parse value length. */
1959 check_overflow (r, record, ofs, 4);
1960 value_length = parse_int (r, record->data, ofs);
1964 check_overflow (r, record, ofs, value_length);
1967 if (value_length == width)
1968 memcpy (value_str_rw (&value, width),
1969 (const uint8_t *) record->data + ofs, width);
1972 sys_warn (r, record->pos + ofs,
1973 _("Ignoring long string value %zu for variable "
1974 "%s, with width %d, that has bad value "
1976 i, var_get_name (var), width, value_length);
1980 ofs += value_length;
1982 /* Parse label length. */
1983 check_overflow (r, record, ofs, 4);
1984 label_length = parse_int (r, record->data, ofs);
1988 check_overflow (r, record, ofs, label_length);
1993 label = recode_string_pool ("UTF-8", dict_encoding,
1994 (const char *) record->data + ofs,
1995 label_length, r->pool);
1996 if (!var_add_value_label (var, &value, label))
1997 sys_warn (r, record->pos + ofs,
1998 _("Duplicate value label for `%.*s' on %s."),
1999 width, value_str (&value, width),
2000 var_get_name (var));
2001 pool_free (r->pool, label);
2003 ofs += label_length;
2010 static void partial_record (struct sfm_reader *r)
2013 static void read_error (struct casereader *, const struct sfm_reader *);
2015 static bool read_case_number (struct sfm_reader *, double *);
2016 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2017 static int read_opcode (struct sfm_reader *);
2018 static bool read_compressed_number (struct sfm_reader *, double *);
2019 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2020 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2021 static bool skip_whole_strings (struct sfm_reader *, size_t);
2023 /* Reads and returns one case from READER's file. Returns a null
2024 pointer if not successful. */
2025 static struct ccase *
2026 sys_file_casereader_read (struct casereader *reader, void *r_)
2028 struct sfm_reader *r = r_;
2029 struct ccase *volatile c;
2035 c = case_create (r->proto);
2036 if (setjmp (r->bail_out))
2038 casereader_force_error (reader);
2043 for (i = 0; i < r->sfm_var_cnt; i++)
2045 struct sfm_var *sv = &r->sfm_vars[i];
2046 union value *v = case_data_rw_idx (c, sv->case_index);
2048 if (sv->var_width == 0)
2050 if (!read_case_number (r, &v->f))
2055 uint8_t *s = value_str_rw (v, sv->var_width);
2056 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2058 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2067 if (r->case_cnt != -1)
2068 read_error (reader, r);
2073 /* Issues an error that R ends in a partial record. */
2075 partial_record (struct sfm_reader *r)
2077 sys_error (r, r->pos, _("File ends in partial case."));
2080 /* Issues an error that an unspecified error occurred SFM, and
2083 read_error (struct casereader *r, const struct sfm_reader *sfm)
2085 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2086 casereader_force_error (r);
2089 /* Reads a number from R and stores its value in *D.
2090 If R is compressed, reads a compressed number;
2091 otherwise, reads a number in the regular way.
2092 Returns true if successful, false if end of file is
2093 reached immediately. */
2095 read_case_number (struct sfm_reader *r, double *d)
2100 if (!try_read_bytes (r, number, sizeof number))
2102 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2106 return read_compressed_number (r, d);
2109 /* Reads LENGTH string bytes from R into S.
2110 Always reads a multiple of 8 bytes; if LENGTH is not a
2111 multiple of 8, then extra bytes are read and discarded without
2113 Reads compressed strings if S is compressed.
2114 Returns true if successful, false if end of file is
2115 reached immediately. */
2117 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2119 size_t whole = ROUND_DOWN (length, 8);
2120 size_t partial = length % 8;
2124 if (!read_whole_strings (r, s, whole))
2131 if (!read_whole_strings (r, bounce, sizeof bounce))
2137 memcpy (s + whole, bounce, partial);
2143 /* Reads and returns the next compression opcode from R. */
2145 read_opcode (struct sfm_reader *r)
2147 assert (r->compressed);
2151 if (r->opcode_idx >= sizeof r->opcodes)
2153 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2157 opcode = r->opcodes[r->opcode_idx++];
2164 /* Reads a compressed number from R and stores its value in D.
2165 Returns true if successful, false if end of file is
2166 reached immediately. */
2168 read_compressed_number (struct sfm_reader *r, double *d)
2170 int opcode = read_opcode (r);
2178 *d = read_float (r);
2182 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2183 if (!r->corruption_warning)
2185 r->corruption_warning = true;
2186 sys_warn (r, r->pos,
2187 _("Possible compressed data corruption: "
2188 "compressed spaces appear in numeric field."));
2197 *d = opcode - r->bias;
2204 /* Reads a compressed 8-byte string segment from R and stores it
2206 Returns true if successful, false if end of file is
2207 reached immediately. */
2209 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2211 int opcode = read_opcode (r);
2219 read_bytes (r, dst, 8);
2223 memset (dst, ' ', 8);
2228 double value = opcode - r->bias;
2229 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2232 /* This has actually been seen "in the wild". The submitter of the
2233 file that showed that the contents decoded as spaces, but they
2234 were at the end of the field so it's possible that the null
2235 bytes just acted as null terminators. */
2237 else if (!r->corruption_warning)
2239 r->corruption_warning = true;
2240 sys_warn (r, r->pos,
2241 _("Possible compressed data corruption: "
2242 "string contains compressed integer (opcode %d)."),
2252 /* Reads LENGTH string bytes from R into S.
2253 LENGTH must be a multiple of 8.
2254 Reads compressed strings if S is compressed.
2255 Returns true if successful, false if end of file is
2256 reached immediately. */
2258 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2260 assert (length % 8 == 0);
2262 return try_read_bytes (r, s, length);
2266 for (ofs = 0; ofs < length; ofs += 8)
2267 if (!read_compressed_string (r, s + ofs))
2277 /* Skips LENGTH string bytes from R.
2278 LENGTH must be a multiple of 8.
2279 (LENGTH is also limited to 1024, but that's only because the
2280 current caller never needs more than that many bytes.)
2281 Returns true if successful, false if end of file is
2282 reached immediately. */
2284 skip_whole_strings (struct sfm_reader *r, size_t length)
2286 uint8_t buffer[1024];
2287 assert (length < sizeof buffer);
2288 return read_whole_strings (r, buffer, length);
2291 /* Helpers for reading records that contain structured text
2294 /* Maximum number of warnings to issue for a single text
2296 #define MAX_TEXT_WARNINGS 5
2301 struct substring buffer; /* Record contents. */
2302 off_t start; /* Starting offset in file. */
2303 size_t pos; /* Current position in buffer. */
2304 int n_warnings; /* Number of warnings issued or suppressed. */
2305 bool recoded; /* Recoded into UTF-8? */
2308 static struct text_record *
2309 open_text_record (struct sfm_reader *r,
2310 const struct sfm_extension_record *record,
2311 bool recode_to_utf8)
2313 struct text_record *text;
2314 struct substring raw;
2316 text = pool_alloc (r->pool, sizeof *text);
2317 raw = ss_buffer (record->data, record->size * record->count);
2318 text->start = record->pos;
2319 text->buffer = (recode_to_utf8
2320 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2323 text->n_warnings = 0;
2324 text->recoded = recode_to_utf8;
2329 /* Closes TEXT, frees its storage, and issues a final warning
2330 about suppressed warnings if necesary. */
2332 close_text_record (struct sfm_reader *r, struct text_record *text)
2334 if (text->n_warnings > MAX_TEXT_WARNINGS)
2335 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2336 text->n_warnings - MAX_TEXT_WARNINGS);
2338 pool_free (r->pool, ss_data (text->buffer));
2341 /* Reads a variable=value pair from TEXT.
2342 Looks up the variable in DICT and stores it into *VAR.
2343 Stores a null-terminated value into *VALUE. */
2345 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2346 struct text_record *text,
2347 struct variable **var, char **value)
2351 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2354 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2358 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2359 ss_buffer ("\t\0", 2));
2367 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2368 struct text_record *text, struct substring delimiters,
2369 struct variable **var)
2373 name = text_get_token (text, delimiters, NULL);
2377 *var = dict_lookup_var (dict, name);
2381 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2388 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2389 struct text_record *text, struct substring delimiters,
2390 struct variable **var)
2392 char *short_name = text_get_token (text, delimiters, NULL);
2393 if (short_name == NULL)
2396 *var = dict_lookup_var (dict, short_name);
2398 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2403 /* Displays a warning for the current file position, limiting the
2404 number to MAX_TEXT_WARNINGS for TEXT. */
2406 text_warn (struct sfm_reader *r, struct text_record *text,
2407 const char *format, ...)
2409 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2413 va_start (args, format);
2414 sys_msg (r, text->start + text->pos, MW, format, args);
2420 text_get_token (struct text_record *text, struct substring delimiters,
2423 struct substring token;
2426 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2429 end = &ss_data (token)[ss_length (token)];
2430 if (delimiter != NULL)
2433 return ss_data (token);
2436 /* Reads a integer value expressed in decimal, then a space, then a string that
2437 consists of exactly as many bytes as specified by the integer, then a space,
2438 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2439 buffer (so the caller should not free the string). */
2441 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2449 while (text->pos < text->buffer.length)
2451 int c = text->buffer.string[text->pos];
2452 if (c < '0' || c > '9')
2454 n = (n * 10) + (c - '0');
2457 if (text->pos >= text->buffer.length || start == text->pos)
2459 sys_warn (r, text->start,
2460 _("Expecting digit at offset %zu in MRSETS record."),
2465 if (!text_match (text, ' '))
2467 sys_warn (r, text->start,
2468 _("Expecting space at offset %zu in MRSETS record."),
2473 if (text->pos + n > text->buffer.length)
2475 sys_warn (r, text->start,
2476 _("%zu-byte string starting at offset %zu "
2477 "exceeds record length %zu."),
2478 n, text->pos, text->buffer.length);
2482 s = &text->buffer.string[text->pos];
2485 sys_warn (r, text->start,
2486 _("Expecting space at offset %zu following %zu-byte string."),
2496 text_match (struct text_record *text, char c)
2498 if (text->buffer.string[text->pos] == c)
2507 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2508 inside the TEXT's string. */
2510 text_pos (const struct text_record *text)
2517 /* Displays a corruption message. */
2519 sys_msg (struct sfm_reader *r, off_t offset,
2520 int class, const char *format, va_list args)
2525 ds_init_empty (&text);
2527 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2528 fh_get_file_name (r->fh), (long long int) offset);
2530 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2531 ds_put_vformat (&text, format, args);
2533 m.category = msg_class_to_category (class);
2534 m.severity = msg_class_to_severity (class);
2540 m.text = ds_cstr (&text);
2545 /* Displays a warning for offset OFFSET in the file. */
2547 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2551 va_start (args, format);
2552 sys_msg (r, offset, MW, format, args);
2556 /* Displays an error for the current file position,
2557 marks it as in an error state,
2558 and aborts reading it using longjmp. */
2560 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2564 va_start (args, format);
2565 sys_msg (r, offset, ME, format, args);
2569 longjmp (r->bail_out, 1);
2572 /* Reads BYTE_CNT bytes into BUF.
2573 Returns true if exactly BYTE_CNT bytes are successfully read.
2574 Aborts if an I/O error or a partial read occurs.
2575 If EOF_IS_OK, then an immediate end-of-file causes false to be
2576 returned; otherwise, immediate end-of-file causes an abort
2579 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2580 void *buf, size_t byte_cnt)
2582 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2583 r->pos += bytes_read;
2584 if (bytes_read == byte_cnt)
2586 else if (ferror (r->file))
2587 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2588 else if (!eof_is_ok || bytes_read != 0)
2589 sys_error (r, r->pos, _("Unexpected end of file."));
2594 /* Reads BYTE_CNT into BUF.
2595 Aborts upon I/O error or if end-of-file is encountered. */
2597 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2599 read_bytes_internal (r, false, buf, byte_cnt);
2602 /* Reads BYTE_CNT bytes into BUF.
2603 Returns true if exactly BYTE_CNT bytes are successfully read.
2604 Returns false if an immediate end-of-file is encountered.
2605 Aborts if an I/O error or a partial read occurs. */
2607 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2609 return read_bytes_internal (r, true, buf, byte_cnt);
2612 /* Reads a 32-bit signed integer from R and returns its value in
2615 read_int (struct sfm_reader *r)
2618 read_bytes (r, integer, sizeof integer);
2619 return integer_get (r->integer_format, integer, sizeof integer);
2622 /* Reads a 64-bit floating-point number from R and returns its
2623 value in host format. */
2625 read_float (struct sfm_reader *r)
2628 read_bytes (r, number, sizeof number);
2629 return float_get_double (r->float_format, number);
2633 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2635 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2639 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2641 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2644 /* Reads exactly SIZE - 1 bytes into BUFFER
2645 and stores a null byte into BUFFER[SIZE - 1]. */
2647 read_string (struct sfm_reader *r, char *buffer, size_t size)
2650 read_bytes (r, buffer, size - 1);
2651 buffer[size - 1] = '\0';
2654 /* Skips BYTES bytes forward in R. */
2656 skip_bytes (struct sfm_reader *r, size_t bytes)
2661 size_t chunk = MIN (sizeof buffer, bytes);
2662 read_bytes (r, buffer, chunk);
2667 static const struct casereader_class sys_file_casereader_class =
2669 sys_file_casereader_read,
2670 sys_file_casereader_destroy,