1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
88 /* Fields from the top-level header record, filled in by read_header(). */
89 struct sfm_header_record
91 char magic[5]; /* First 4 bytes of file, then null. */
92 int weight_idx; /* 0 if unweighted, otherwise a var index. */
93 int nominal_case_size; /* Number of var positions, or -1 if the value in the file is negative or larger than INT_MAX / 16. */
95 /* These correspond to the members of struct sfm_read_info or a dictionary
96 but in the system file's encoding rather than UTF-8. parse_header()
recodes them to UTF-8 once the encoding is known. */
97 char creation_date[10]; /* "dd mmm yy". */
98 char creation_time[9]; /* "hh:mm:ss". */
99 char eye_catcher[61]; /* Eye-catcher string, then product name. */
100 char file_label[65]; /* File label. */
103 struct sfm_var_record
110 int missing_value_code;
113 struct variable *var;
116 struct sfm_value_label
122 struct sfm_value_label_record
125 struct sfm_value_label *labels;
132 struct sfm_document_record
139 struct sfm_extension_record
141 off_t pos; /* Starting offset in file. */
142 size_t size; /* Size of data elements. */
143 size_t count; /* Number of data elements. */
144 void *data; /* Contents. */
147 /* System file reader. */
150 /* Resource tracking. */
151 struct pool *pool; /* All system file state. */
152 jmp_buf bail_out; /* longjmp() target for error handling. */
155 struct file_handle *fh; /* File handle. */
156 struct fh_lock *lock; /* Mutual exclusion for file handle. */
157 FILE *file; /* File stream. */
158 off_t pos; /* Position in file. */
159 bool error; /* I/O or corruption error? */
160 struct caseproto *proto; /* Format of output cases. */
163 enum integer_format integer_format; /* On-disk integer format. */
164 enum float_format float_format; /* On-disk floating point format. */
165 struct sfm_var *sfm_vars; /* Variables. */
166 size_t sfm_var_cnt; /* Number of variables. */
167 casenumber case_cnt; /* Number of cases */
168 const char *encoding; /* String encoding. */
171 bool compressed; /* File is compressed? */
172 double bias; /* Compression bias, usually 100.0. */
173 uint8_t opcodes[8]; /* Current block of opcodes. */
174 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
175 bool corruption_warning; /* Warned about possible corruption? */
178 static const struct casereader_class sys_file_casereader_class;
180 static bool close_reader (struct sfm_reader *);
182 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
183 const struct sfm_var_record *,
186 static void sys_msg (struct sfm_reader *r, off_t, int class,
187 const char *format, va_list args)
188 PRINTF_FORMAT (4, 0);
189 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
190 PRINTF_FORMAT (3, 4);
191 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
195 static void read_bytes (struct sfm_reader *, void *, size_t);
196 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
197 static int read_int (struct sfm_reader *);
198 static double read_float (struct sfm_reader *);
199 static void read_string (struct sfm_reader *, char *, size_t);
200 static void skip_bytes (struct sfm_reader *, size_t);
202 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
203 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
205 static void read_variable_record (struct sfm_reader *,
206 struct sfm_var_record *);
207 static void read_value_label_record (struct sfm_reader *,
208 struct sfm_value_label_record *,
210 static struct sfm_document_record *read_document_record (struct sfm_reader *);
211 static struct sfm_extension_record *read_extension_record (
212 struct sfm_reader *, int subtype);
213 static void skip_extension_record (struct sfm_reader *, int subtype);
215 static const char *choose_encoding (
217 const struct sfm_header_record *,
218 const struct sfm_extension_record *ext_integer,
219 const struct sfm_extension_record *ext_encoding);
221 static struct text_record *open_text_record (
222 struct sfm_reader *, const struct sfm_extension_record *,
223 bool recode_to_utf8);
224 static void close_text_record (struct sfm_reader *,
225 struct text_record *);
226 static bool read_variable_to_value_pair (struct sfm_reader *,
228 struct text_record *,
229 struct variable **var, char **value);
230 static void text_warn (struct sfm_reader *r, struct text_record *text,
231 const char *format, ...)
232 PRINTF_FORMAT (3, 4);
233 static char *text_get_token (struct text_record *,
234 struct substring delimiters, char *delimiter);
235 static bool text_match (struct text_record *, char c);
236 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
237 struct text_record *,
238 struct substring delimiters,
240 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
241 struct text_record *,
242 struct substring delimiters,
244 static const char *text_parse_counted_string (struct sfm_reader *,
245 struct text_record *);
246 static size_t text_pos (const struct text_record *);
248 static bool close_reader (struct sfm_reader *r);
250 /* Dictionary reader. */
258 static void read_header (struct sfm_reader *, struct sfm_read_info *,
259 struct sfm_header_record *);
260 static void parse_header (struct sfm_reader *,
261 const struct sfm_header_record *,
262 struct sfm_read_info *, struct dictionary *);
263 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
264 struct sfm_var_record *, size_t n);
265 static void parse_format_spec (struct sfm_reader *, off_t pos,
266 unsigned int format, enum which_format,
267 struct variable *, int *format_warning_cnt);
268 static void parse_document (struct dictionary *, struct sfm_document_record *);
269 static void parse_display_parameters (struct sfm_reader *,
270 const struct sfm_extension_record *,
271 struct dictionary *);
272 static void parse_machine_integer_info (struct sfm_reader *,
273 const struct sfm_extension_record *,
274 struct sfm_read_info *);
275 static void parse_machine_float_info (struct sfm_reader *,
276 const struct sfm_extension_record *);
277 static void parse_mrsets (struct sfm_reader *,
278 const struct sfm_extension_record *,
279 struct dictionary *);
280 static void parse_long_var_name_map (struct sfm_reader *,
281 const struct sfm_extension_record *,
282 struct dictionary *);
283 static void parse_long_string_map (struct sfm_reader *,
284 const struct sfm_extension_record *,
285 struct dictionary *);
286 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
287 const struct sfm_var_record *,
289 const struct sfm_value_label_record *);
290 static void parse_data_file_attributes (struct sfm_reader *,
291 const struct sfm_extension_record *,
292 struct dictionary *);
293 static void parse_variable_attributes (struct sfm_reader *,
294 const struct sfm_extension_record *,
295 struct dictionary *);
296 static void parse_long_string_value_labels (struct sfm_reader *,
297 const struct sfm_extension_record *,
298 struct dictionary *);
300 /* Frees the strings inside INFO: its creation date, creation time, and
product name. */
302 sfm_read_info_destroy (struct sfm_read_info *info)
306 free (info->creation_date);
307 free (info->creation_time);
308 free (info->product);
312 /* Opens the system file designated by file handle FH for reading. Reads the
313 system file's dictionary into *DICTP.
315 Ordinarily the reader attempts to automatically detect the character
316 encoding based on the file's contents. This isn't always possible,
317 especially for files written by old versions of SPSS or PSPP, so specifying
318 a nonnull ENCODING overrides the choice of character encoding.
320 If INFOP is non-null, then it receives additional info about the system file,
321 which the caller must eventually free with sfm_read_info_destroy() when it
322 is no longer needed. */
324 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
325 struct dictionary **dictp, struct sfm_read_info *infop)
327 struct sfm_reader *volatile r = NULL;
328 struct sfm_read_info info;
330 struct sfm_header_record header;
332 struct sfm_var_record *vars;
333 size_t n_vars, allocated_vars;
335 struct sfm_value_label_record *labels;
336 size_t n_labels, allocated_labels;
338 struct sfm_document_record *document;
340 struct sfm_extension_record *extensions[32];
342 struct dictionary *dict = NULL;
345 /* Create and initialize reader. */
346 r = pool_create_container (struct sfm_reader, pool);
352 r->opcode_idx = sizeof r->opcodes;
353 r->corruption_warning = false;
355 memset (&info, 0, sizeof info);
357 /* TRANSLATORS: this fragment will be interpolated into
358 messages in fh_lock() that identify types of files. */
359 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
363 r->file = fn_open (fh_get_file_name (fh), "rb");
366 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
367 fh_get_file_name (r->fh), strerror (errno));
371 if (setjmp (r->bail_out))
375 read_header (r, &info, &header);
378 n_vars = allocated_vars = 0;
381 n_labels = allocated_labels = 0;
385 memset (extensions, 0, sizeof extensions);
395 read_int (r); /* Skip filler. */
402 if (n_vars >= allocated_vars)
403 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
405 read_variable_record (r, &vars[n_vars++]);
409 if (n_labels >= allocated_labels)
410 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
412 read_value_label_record (r, &labels[n_labels++], n_vars);
416 /* A Type 4 record is always immediately after a type 3 record,
417 so the code for type 3 records reads the type 4 record too. */
418 sys_error (r, r->pos, _("Misplaced type 4 record."));
421 if (document != NULL)
422 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
423 document = read_document_record (r);
427 subtype = read_int (r);
428 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
431 _("Unrecognized record type 7, subtype %d. Please "
432 "send a copy of this file, and the syntax which "
433 "created it to %s."),
434 subtype, PACKAGE_BUGREPORT);
435 skip_extension_record (r, subtype);
437 else if (extensions[subtype] != NULL)
440 _("Record type 7, subtype %d found here has the same "
441 "type as the record found near offset 0x%llx. "
442 "Please send a copy of this file, and the syntax "
443 "which created it to %s."),
444 subtype, (long long int) extensions[subtype]->pos,
446 skip_extension_record (r, subtype);
449 extensions[subtype] = read_extension_record (r, subtype);
453 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
458 /* Now actually parse what we read.
460 First, figure out the correct character encoding, because this determines
461 how the rest of the header data is to be interpreted. */
462 dict = dict_create (encoding
464 : choose_encoding (r, &header, extensions[EXT_INTEGER],
465 extensions[EXT_ENCODING]));
466 r->encoding = dict_get_encoding (dict);
468 /* These records don't use variables at all. */
469 if (document != NULL)
470 parse_document (dict, document);
472 if (extensions[EXT_INTEGER] != NULL)
473 parse_machine_integer_info (r, extensions[EXT_INTEGER], &info);
475 if (extensions[EXT_FLOAT] != NULL)
476 parse_machine_float_info (r, extensions[EXT_FLOAT]);
478 if (extensions[EXT_FILE_ATTRS] != NULL)
479 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
481 parse_header (r, &header, &info, dict);
483 /* Parse the variable records, the basis of almost everything else. */
484 parse_variable_records (r, dict, vars, n_vars);
486 /* Parse value labels and the weight variable immediately after the variable
487 records. These records use indexes into vars[], so we must parse them
488 before those indexes become invalidated by very long string variables. */
489 for (i = 0; i < n_labels; i++)
490 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
491 if (header.weight_idx != 0)
493 struct variable *weight_var;
495 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
497 if (var_is_numeric (weight_var))
498 dict_set_weight (dict, weight_var);
500 sys_error (r, -1, _("Weighting variable must be numeric "
501 "(not string variable `%s')."),
502 var_get_name (weight_var));
505 if (extensions[EXT_DISPLAY] != NULL)
506 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
508 /* The following records use short names, so they need to be parsed before
509 parse_long_var_name_map() changes short names to long names. */
510 if (extensions[EXT_MRSETS] != NULL)
511 parse_mrsets (r, extensions[EXT_MRSETS], dict);
513 if (extensions[EXT_MRSETS2] != NULL)
514 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
516 if (extensions[EXT_LONG_STRINGS] != NULL)
517 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
519 /* Now rename variables to their long names. */
520 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
522 /* The following records use long names, so they need to follow renaming. */
523 if (extensions[EXT_VAR_ATTRS] != NULL)
524 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
526 if (extensions[EXT_LONG_LABELS] != NULL)
527 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
529 /* Warn if the actual amount of data per case differs from the
530 amount that the header claims. SPSS version 13 gets this
531 wrong when very long strings are involved, so don't warn in that case. */
533 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
534 && info.version_major != 13)
535 sys_warn (r, -1, _("File header claims %d variable positions but "
536 "%zu were read from file."),
537 header.nominal_case_size, n_vars);
539 /* Create an index of dictionary variable widths for
540 sfm_read_case to use. We cannot use the `struct variable's
541 from the dictionary we created, because the caller owns the
542 dictionary and may destroy or modify its variables. */
543 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
544 pool_register (r->pool, free, r->sfm_vars);
545 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
551 sfm_read_info_destroy (&info);
553 return casereader_create_sequential
555 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
556 &sys_file_casereader_class, r);
559 sfm_read_info_destroy (&info);
566 /* Closes a system file after we're done with it.
567 Returns true if an I/O error has occurred on READER, false otherwise. */
570 close_reader (struct sfm_reader *r)
579 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
581 msg (ME, _("Error closing system file `%s': %s."),
582 fh_get_file_name (r->fh), strerror (errno));
592 pool_destroy (r->pool);
597 /* Destroys READER. */
599 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
601 struct sfm_reader *r = r_;
605 /* Returns true if FILE is an SPSS system file, false otherwise. */
608 sfm_detect (FILE *file)
612 if (fread (magic, 4, 1, file) != 1)
616 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
619 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
620 except for the string fields in *INFO, which parse_header() will initialize
621 later once the file's encoding is known.

Also records in R the integer format, floating-point format, compression
flag, compression bias, and case count found in the header. */
623 read_header (struct sfm_reader *r, struct sfm_read_info *info,
624 struct sfm_header_record *header)
626 uint8_t raw_layout_code[4];
/* Magic number and eye-catcher. The magic must equal either the ASCII or
the EBCDIC signature. */
629 read_string (r, header->magic, sizeof header->magic);
630 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
632 if (strcmp (ASCII_MAGIC, header->magic)
633 && strcmp (EBCDIC_MAGIC, header->magic))
634 sys_error (r, 0, _("This is not an SPSS system file."));
636 /* Identify integer format. */
637 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
/* The layout code must be recognizable as the value 2 or 3, and only
MSB-first or LSB-first integer formats are accepted.
NOTE(review): integer_identify() presumably deduces the byte order from
the raw bytes; its definition is not visible here. */
638 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
640 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
642 || (r->integer_format != INTEGER_MSB_FIRST
643 && r->integer_format != INTEGER_LSB_FIRST))
644 sys_error (r, 64, _("This is not an SPSS system file."));
/* An out-of-range case size is recorded as -1, the value that
sfm_open_reader() excludes from its consistency warning. */
646 header->nominal_case_size = read_int (r);
647 if (header->nominal_case_size < 0
648 || header->nominal_case_size > INT_MAX / 16)
649 header->nominal_case_size = -1;
651 r->compressed = read_int (r) != 0;
653 header->weight_idx = read_int (r);
/* NOTE(review): the statement guarded by the case-count check below is not
visible here; presumably it maps implausibly large counts to -1, the
value sfm_open_reader() tests for -- confirm. */
655 r->case_cnt = read_int (r);
656 if ( r->case_cnt > INT_MAX / 2)
659 /* Identify floating-point format and obtain compression bias. */
660 read_bytes (r, raw_bias, sizeof raw_bias);
661 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
663 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
/* The raw bias was not recognized as 100.0 in any floating-point format.
Warn unless the field is all zeros. */
665 if (memcmp (raw_bias, zero_bias, 8))
666 sys_warn (r, r->pos - 8,
667 _("Compression bias is not the usual "
668 "value of 100, or system file uses unrecognized "
669 "floating-point format."));
672 /* Some software is known to write all-zeros to this
673 field. Such software also writes floating-point
674 numbers in the format that we expect by default
675 (it seems that all software most likely does, in
676 reality), so don't warn in this case. */
/* Fall back to IEEE double in the byte order already detected for
integers. */
679 if (r->integer_format == INTEGER_MSB_FIRST)
680 r->float_format = FLOAT_IEEE_DOUBLE_BE;
682 r->float_format = FLOAT_IEEE_DOUBLE_LE;
684 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
/* Remaining string fields, kept in the file's own encoding for now. */
686 read_string (r, header->creation_date, sizeof header->creation_date);
687 read_string (r, header->creation_time, sizeof header->creation_time);
688 read_string (r, header->file_label, sizeof header->file_label);
/* Copy the non-string details into INFO. */
691 info->integer_format = r->integer_format;
692 info->float_format = r->float_format;
693 info->compressed = r->compressed;
694 info->case_cnt = r->case_cnt;
697 /* Reads a variable (type 2) record from R into RECORD.

RECORD is zeroed first and then filled with the raw fields from the file.
The width, formats, name, label, and missing values are interpreted
later by parse_variable_records(). */
699 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
701 int has_variable_label;
703 memset (record, 0, sizeof *record);
/* Fixed-size fields, in file order. POS remembers where the record starts
so that later messages can refer to it. */
705 record->pos = r->pos;
706 record->width = read_int (r);
707 has_variable_label = read_int (r);
708 record->missing_value_code = read_int (r);
709 record->print_format = read_int (r);
710 record->write_format = read_int (r);
711 read_bytes (r, record->name, sizeof record->name);
713 if (has_variable_label == 1)
715 enum { MAX_LABEL_LEN = 255 };
716 size_t len, read_len;
720 /* Read up to MAX_LABEL_LEN bytes of label. */
721 read_len = MIN (MAX_LABEL_LEN, len);
722 record->label = pool_malloc (r->pool, read_len + 1);
723 read_string (r, record->label, read_len + 1);
725 /* Skip unread label bytes. */
726 skip_bytes (r, len - read_len);
728 /* Skip label padding up to multiple of 4 bytes. */
729 skip_bytes (r, ROUND_UP (len, 4) - len);
731 else if (has_variable_label != 0)
732 sys_error (r, record->pos,
733 _("Variable label indicator field is not 0 or 1."));
735 /* Validate the missing value code and read the raw missing values, 8 bytes
apiece. parse_variable_records() interprets them later. */
736 if (record->missing_value_code != 0)
738 int code = record->missing_value_code;
/* Width 0 takes the numeric check, which also allows the negative codes
-2 and -3. parse_variable_records() treats a negative code as
including a range. */
739 if (record->width == 0)
741 if (code < -3 || code > 3 || code == -1)
742 sys_error (r, record->pos,
743 _("Numeric missing value indicator field is not "
744 "-3, -2, 0, 1, 2, or 3."));
748 if (code < 1 || code > 3)
749 sys_error (r, record->pos,
750 _("String missing value indicator field is not "
754 read_bytes (r, record->missing, 8 * abs (code));
758 /* Reads value labels from R into RECORD. */
760 read_value_label_record (struct sfm_reader *r,
761 struct sfm_value_label_record *record,
766 /* Read type 3 record. */
767 record->pos = r->pos;
768 record->n_labels = read_int (r);
769 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
770 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
772 record->labels = pool_nmalloc (r->pool, record->n_labels,
773 sizeof *record->labels);
774 for (i = 0; i < record->n_labels; i++)
776 struct sfm_value_label *label = &record->labels[i];
777 unsigned char label_len;
780 read_bytes (r, label->value, sizeof label->value);
782 /* Read label length. */
783 read_bytes (r, &label_len, sizeof label_len);
784 padded_len = ROUND_UP (label_len + 1, 8);
786 /* Read label, padding. */
787 label->label = pool_malloc (r->pool, padded_len + 1);
788 read_bytes (r, label->label, padded_len - 1);
789 label->label[label_len] = '\0';
792 /* Read record type of type 4 record. */
793 if (read_int (r) != 4)
794 sys_error (r, r->pos - 4,
795 _("Variable index record (type 4) does not immediately "
796 "follow value label record (type 3) as it should."));
798 /* Read number of variables associated with value label from type 4 record. */
800 record->n_vars = read_int (r);
801 if (record->n_vars < 1 || record->n_vars > n_vars)
802 sys_error (r, r->pos - 4,
803 _("Number of variables associated with a value label (%zu) "
804 "is not between 1 and the number of variables (%zu)."),
805 record->n_vars, n_vars);
806 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
807 for (i = 0; i < record->n_vars; i++)
808 record->vars[i] = read_int (r);
811 /* Reads a document record from R and returns it.

The record and its text are allocated from R's pool. The text is stored
unconverted as N_LINES consecutive lines of DOC_LINE_LENGTH bytes each. */
812 static struct sfm_document_record *
813 read_document_record (struct sfm_reader *r)
815 struct sfm_document_record *record;
818 record = pool_malloc (r->pool, sizeof *record);
819 record->pos = r->pos;
/* Rejecting counts of INT_MAX / DOC_LINE_LENGTH or more keeps the
DOC_LINE_LENGTH * n_lines products below from exceeding INT_MAX. */
821 n_lines = read_int (r);
822 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
823 sys_error (r, record->pos,
824 _("Number of document lines (%d) "
825 "must be greater than 0 and less than %d."),
826 n_lines, INT_MAX / DOC_LINE_LENGTH);
828 record->n_lines = n_lines;
829 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
830 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
/* Reads the element size and element count that begin an extension (type 7)
record from R into RECORD, and saves in RECORD the file position at which
they start. Reports an error if SIZE * COUNT + 1 would overflow. SUBTYPE
is used only in the error message. */
836 read_extension_record_header (struct sfm_reader *r, int subtype,
837 struct sfm_extension_record *record)
839 record->pos = r->pos;
840 record->size = read_int (r);
841 record->count = read_int (r);
843 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
844 allows an extra byte for a null terminator, used by some
845 extension processing routines. */
846 if (record->size != 0
847 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
/* NOTE(review): unlike the other messages in this file, this one is not
wrapped in _() for translation -- confirm whether that is intended. */
848 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
851 /* Reads an extension record of the given SUBTYPE from R.

The record structure and its data are allocated from R's pool, and the
data is followed by an extra null byte. SUBTYPE is looked up in types[]
below. A mismatch against the expected element size or count produces a
warning, and an entry with both expectations zero marks a record that is
ignored.
NOTE(review): the return statements are not visible here, so what is
returned for mismatched, ignored, or unrecognized records is unconfirmed. */
852 static struct sfm_extension_record *
853 read_extension_record (struct sfm_reader *r, int subtype)
855 struct extension_record_type
/* Each entry is apparently { subtype, expected size, expected count }. A 0
size or count is not checked. */
862 static const struct extension_record_type types[] =
864 /* Implemented record types. */
865 { EXT_INTEGER, 4, 8 },
867 { EXT_MRSETS, 1, 0 },
868 { EXT_DISPLAY, 4, 0 },
869 { EXT_LONG_NAMES, 1, 0 },
870 { EXT_LONG_STRINGS, 1, 0 },
871 { EXT_NCASES, 8, 2 },
872 { EXT_FILE_ATTRS, 1, 0 },
873 { EXT_VAR_ATTRS, 1, 0 },
874 { EXT_MRSETS2, 1, 0 },
875 { EXT_ENCODING, 1, 0 },
876 { EXT_LONG_LABELS, 1, 0 },
878 /* Ignored record types. */
879 { EXT_VAR_SETS, 0, 0 },
881 { EXT_DATA_ENTRY, 0, 0 },
884 const struct extension_record_type *type;
885 struct sfm_extension_record *record;
888 record = pool_malloc (r->pool, sizeof *record);
889 read_extension_record_header (r, subtype, record);
/* read_extension_record_header() has already reported an error if this
product (plus one) would overflow. */
890 n_bytes = record->count * record->size;
892 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
893 if (subtype == type->subtype)
895 if (type->size > 0 && record->size != type->size)
896 sys_warn (r, record->pos,
897 _("Record type 7, subtype %d has bad size %zu "
898 "(expected %d)."), subtype, record->size, type->size);
899 else if (type->count > 0 && record->count != type->count)
900 sys_warn (r, record->pos,
901 _("Record type 7, subtype %d has bad count %zu "
902 "(expected %d)."), subtype, record->count, type->count);
903 else if (type->count == 0 && type->size == 0)
905 /* Ignore this record. */
/* Allocate one extra byte so that the contents are always
null-terminated. */
909 char *data = pool_malloc (r->pool, n_bytes + 1);
910 data[n_bytes] = '\0';
913 read_bytes (r, record->data, n_bytes);
920 sys_warn (r, record->pos,
921 _("Unrecognized record type 7, subtype %d. Please send a "
922 "copy of this file, and the syntax which created it to %s."),
923 subtype, PACKAGE_BUGREPORT);
926 skip_bytes (r, n_bytes);
/* Reads the header of an extension record of the given SUBTYPE from R, then
skips over the record's COUNT * SIZE bytes of data without storing them. */
931 skip_extension_record (struct sfm_reader *r, int subtype)
933 struct sfm_extension_record record;
935 read_extension_record_header (r, subtype, &record);
936 skip_bytes (r, record.count * record.size);
/* Converts the string fields of HEADER from DICT's encoding to UTF-8.
The file label is stored in DICT. The creation date, creation time, and
product name are stored in INFO; these are the three strings that
sfm_read_info_destroy() frees. */
940 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
941 struct sfm_read_info *info, struct dictionary *dict)
943 const char *dict_encoding = dict_get_encoding (dict);
944 struct substring product;
945 struct substring label;
947 /* Convert file label to UTF-8 and put it into DICT. */
948 label = recode_substring_pool ("UTF-8", dict_encoding,
949 ss_cstr (header->file_label), r->pool);
950 ss_trim (&label, ss_cstr (" "));
/* Terminate the trimmed text so that it can be passed as a C string. */
951 label.string[label.length] = '\0';
952 dict_set_label (dict, label.string);
954 /* Put creation date and time in UTF-8 into INFO. */
955 info->creation_date = recode_string ("UTF-8", dict_encoding,
956 header->creation_date, -1);
957 info->creation_time = recode_string ("UTF-8", dict_encoding,
958 header->creation_time, -1);
960 /* Put product name into INFO, dropping eye-catcher string if present. */
961 product = recode_substring_pool ("UTF-8", dict_encoding,
962 ss_cstr (header->eye_catcher), r->pool);
963 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
964 ss_trim (&product, ss_cstr (" "));
965 info->product = ss_xstrdup (product);
968 /* Reads a variable (type 2) record from R and adds the
969 corresponding variable to DICT.
970 Also skips past additional variable records for long string variables. */
973 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
974 struct sfm_var_record *var_recs, size_t n_var_recs)
976 const char *dict_encoding = dict_get_encoding (dict);
977 struct sfm_var_record *rec;
980 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
982 struct variable *var;
987 name = recode_string_pool ("UTF-8", dict_encoding,
988 rec->name, 8, r->pool);
989 name[strcspn (name, " ")] = '\0';
991 if (!dict_id_is_valid (dict, name, false)
992 || name[0] == '$' || name[0] == '#')
993 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
995 if (rec->width < 0 || rec->width > 255)
996 sys_error (r, rec->pos,
997 _("Bad width %d for variable %s."), rec->width, name);
999 var = rec->var = dict_create_var (dict, name, rec->width);
1001 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1003 /* Set the short name the same as the long name. */
1004 var_set_short_name (var, 0, name);
1006 /* Get variable label, if any. */
1011 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1012 rec->label, -1, r->pool);
1013 var_set_label (var, utf8_label, false);
1016 /* Set missing values. */
1017 if (rec->missing_value_code != 0)
1019 int width = var_get_width (var);
1020 struct missing_values mv;
1022 mv_init_pool (r->pool, &mv, width);
1023 if (var_is_numeric (var))
1025 bool has_range = rec->missing_value_code < 0;
1026 int n_discrete = (has_range
1027 ? rec->missing_value_code == -3
1028 : rec->missing_value_code);
1033 double low = parse_float (r, rec->missing, 0);
1034 double high = parse_float (r, rec->missing, 8);
1035 mv_add_range (&mv, low, high);
1039 for (i = 0; i < n_discrete; i++)
1041 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1049 value_init_pool (r->pool, &value, width);
1050 value_set_missing (&value, width);
1051 for (i = 0; i < rec->missing_value_code; i++)
1053 uint8_t *s = value_str_rw (&value, width);
1054 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1055 mv_add_str (&mv, s);
1058 var_set_missing_values (var, &mv);
1062 parse_format_spec (r, rec->pos + 12, rec->print_format,
1063 PRINT_FORMAT, var, &n_warnings);
1064 parse_format_spec (r, rec->pos + 16, rec->write_format,
1065 WRITE_FORMAT, var, &n_warnings);
1067 /* Account for values.
1068 Skip long string continuation records, if any. */
1069 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1070 for (i = 1; i < n_values; i++)
1071 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1072 sys_error (r, rec->pos, _("Missing string continuation record."));
1077 /* Translates the format spec from sysfile format to internal format. */
1080 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1081 enum which_format which, struct variable *v,
1084 const int max_warnings = 8;
1085 uint8_t raw_type = format >> 16;
1086 uint8_t w = format >> 8;
1095 ok = (fmt_from_io (raw_type, &f.type)
1096 && fmt_check_output (&f)
1097 && fmt_check_width_compat (&f, var_get_width (v)));
1102 if (which == PRINT_FORMAT)
1103 var_set_print_format (v, &f);
1105 var_set_write_format (v, &f);
1107 else if (format == 0)
1109 /* Actually observed in the wild. No point in warning about it. */
1111 else if (++*n_warnings <= max_warnings)
1113 if (which == PRINT_FORMAT)
1114 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1116 var_get_name (v), var_get_width (v), format);
1118 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1120 var_get_name (v), var_get_width (v), format);
1122 if (*n_warnings == max_warnings)
1123 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
/* Adds each of the N_LINES fixed-length document lines in RECORD to DICT.
Every line is first recoded from DICT's encoding to UTF-8 and stripped of
trailing spaces. */
1128 parse_document (struct dictionary *dict, struct sfm_document_record *record)
/* Step through the text one DOC_LINE_LENGTH-byte line at a time. */
1132 for (p = record->documents;
1133 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1134 p += DOC_LINE_LENGTH)
1136 struct substring line;
1138 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1139 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1140 ss_rtrim (&line, ss_cstr (" "));
/* Terminate the trimmed text so that it can be passed as a C string. */
1141 line.string[line.length] = '\0';
1143 dict_add_document_line (dict, line.string, false);
1149 /* Parses record type 7, subtype 3 (machine integer info).

Saves the version numbers from RECORD into INFO, then cross-checks the
record's floating-point and integer representation codes against the
formats already stored in R. A floating-point mismatch is reported as an
error, an integer mismatch only as a warning. */
1151 parse_machine_integer_info (struct sfm_reader *r,
1152 const struct sfm_extension_record *record,
1153 struct sfm_read_info *info)
1155 int float_representation, expected_float_format;
1156 int integer_representation, expected_integer_format;
1158 /* Save version info. */
1159 info->version_major = parse_int (r, record->data, 0);
1160 info->version_minor = parse_int (r, record->data, 4);
1161 info->version_revision = parse_int (r, record->data, 8);
1163 /* Check floating point format. */
1164 float_representation = parse_int (r, record->data, 16);
/* Expected code: 1 for either IEEE double format, 2 for Z long, 3 for
either VAX format. */
1165 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1166 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1167 expected_float_format = 1;
1168 else if (r->float_format == FLOAT_Z_LONG)
1169 expected_float_format = 2;
1170 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1171 expected_float_format = 3;
1174 if (float_representation != expected_float_format)
1175 sys_error (r, record->pos, _("Floating-point representation indicated by "
1176 "system file (%d) differs from expected (%d)."),
1177 float_representation, expected_float_format);
1179 /* Check integer format. */
1180 integer_representation = parse_int (r, record->data, 24);
/* Expected code: 1 for MSB-first, 2 for LSB-first. */
1181 if (r->integer_format == INTEGER_MSB_FIRST)
1182 expected_integer_format = 1;
1183 else if (r->integer_format == INTEGER_LSB_FIRST)
1184 expected_integer_format = 2;
1187 if (integer_representation != expected_integer_format)
1188 sys_warn (r, record->pos,
1189 _("Integer format indicated by system file (%d) "
1190 "differs from expected (%d)."),
1191 integer_representation, expected_integer_format);
/* choose_encoding picks the name of the character encoding to use for the
   dictionary.  From the visible statements the sources are consulted in this
   order: the data of the EXT_ENCODING record, then the code page stored at
   byte offset 7 * 4 of the EXT_INTEGER record as mapped by
   sys_get_encoding_from_codepage, then a comparison of HEADER->magic with
   EBCDIC_MAGIC, and finally locale_charset as the last resort.
   NOTE(review): the null checks on the two extension records, the code page
   values that are skipped, and the value returned for EBCDIC files are on
   lines missing from this listing; confirm them against the full source. */
1196 choose_encoding (struct sfm_reader *r,
1197 const struct sfm_header_record *header,
1198 const struct sfm_extension_record *ext_integer,
1199 const struct sfm_extension_record *ext_encoding)
1201 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1204 return ext_encoding->data;
1206 /* But EXT_INTEGER is better than nothing as a fallback. */
1209 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1210 const char *encoding;
1219 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1220 respectively. However, there are known to be many files in the wild
1221 with character code 2, yet have data which are clearly not ASCII.
1222 Therefore we ignore these values. */
1229 encoding = sys_get_encoding_from_codepage (codepage);
1230 if (encoding != NULL)
1236 /* If the file magic number is EBCDIC then its character data is too. */
1237 if (!strcmp (header->magic, EBCDIC_MAGIC))
1240 return locale_charset ();
/* parse_machine_float_info reads three doubles from RECORD->data, at byte
   offsets 0, 8 and 16, and compares them with the SYSMIS, HIGHEST and LOWEST
   constants respectively.  Each value that differs is reported with
   sys_warn; nothing visible here stores the values or aborts the read. */
1243 /* Parses record type 7, subtype 4. */
1245 parse_machine_float_info (struct sfm_reader *r,
1246 const struct sfm_extension_record *record)
1248 double sysmis = parse_float (r, record->data, 0);
1249 double highest = parse_float (r, record->data, 8);
1250 double lowest = parse_float (r, record->data, 16);
1252 if (sysmis != SYSMIS)
1253 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1256 if (highest != HIGHEST)
1257 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1258 highest, "HIGHEST");
1260 if (lowest != LOWEST)
1261 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
/* parse_mrsets reads multiple response set definitions from RECORD and adds
   the valid ones to DICT.  The record is opened as text without recoding;
   names and labels are converted individually with recode_string from
   R->encoding to UTF-8.  As far as the visible statements show, each
   definition is parsed as:
     - a set name terminated by an equals sign, which draws a warning when
       it does not begin with a dollar sign;
     - a type letter: C selects MRSET_MC; D selects MRSET_MD with
       MRSET_VARLABELS; E selects MRSET_MD with MRSET_COUNTEDVALUES and is
       followed by a label source token, where 11 turns on
       label_from_var_label and anything other than 1 or 11 is warned about;
     - for MRSET_MD sets only, a counted string holding the counted value;
     - a counted string holding the label, recoded only when it is nonempty;
     - variable names separated by spaces and ended by a new-line.
   Duplicate variable names and a mix of string and numeric variables are
   warned about, the set width is the minimum width over its variables, and
   a set that ends up with fewer than two variables is warned about and
   destroyed.  For MRSET_MD sets the counted value is stored either with
   strtod or as a space-padded string (the selecting condition is not
   visible; presumably it depends on whether the width is 0).  The text
   record is closed on the way out.
   NOTE(review): the loop structure, the error exits and the release of
   var_name after recode_string are on lines missing from this listing;
   confirm them against the full source. */
1265 /* Parses record type 7, subtype 7 or 19. */
1267 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1268 struct dictionary *dict)
1270 struct text_record *text;
1271 struct mrset *mrset;
1273 text = open_text_record (r, record, false);
1276 const char *counted = NULL;
1279 struct stringi_set var_names;
1280 size_t allocated_vars;
1284 mrset = xzalloc (sizeof *mrset);
1286 name = text_get_token (text, ss_cstr ("="), NULL);
1289 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1291 if (mrset->name[0] != '$')
1293 sys_warn (r, record->pos,
1294 _("`%s' does not begin with `$' at offset %zu "
1295 "in MRSETS record."), mrset->name, text_pos (text));
1299 if (text_match (text, 'C'))
1301 mrset->type = MRSET_MC;
1302 if (!text_match (text, ' '))
1304 sys_warn (r, record->pos,
1305 _("Missing space following `%c' at offset %zu "
1306 "in MRSETS record."), 'C', text_pos (text));
1310 else if (text_match (text, 'D'))
1312 mrset->type = MRSET_MD;
1313 mrset->cat_source = MRSET_VARLABELS;
1315 else if (text_match (text, 'E'))
1319 mrset->type = MRSET_MD;
1320 mrset->cat_source = MRSET_COUNTEDVALUES;
1321 if (!text_match (text, ' '))
1323 sys_warn (r, record->pos,
1324 _("Missing space following `%c' at offset %zu "
1325 "in MRSETS record."), 'E', text_pos (text));
1329 number = text_get_token (text, ss_cstr (" "), NULL);
1330 if (!strcmp (number, "11"))
1331 mrset->label_from_var_label = true;
1332 else if (strcmp (number, "1"))
1333 sys_warn (r, record->pos,
1334 _("Unexpected label source value `%s' following `E' "
1335 "at offset %zu in MRSETS record."),
1336 number, text_pos (text));
1340 sys_warn (r, record->pos,
1341 _("Missing `C', `D', or `E' at offset %zu "
1342 "in MRSETS record."),
1347 if (mrset->type == MRSET_MD)
1349 counted = text_parse_counted_string (r, text);
1350 if (counted == NULL)
1354 label = text_parse_counted_string (r, text);
1357 if (label[0] != '\0')
1358 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1360 stringi_set_init (&var_names);
1365 const char *raw_var_name;
1366 struct variable *var;
1369 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1370 if (raw_var_name == NULL)
1372 sys_warn (r, record->pos,
1373 _("Missing new-line parsing variable names "
1374 "at offset %zu in MRSETS record."),
1378 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1380 var = dict_lookup_var (dict, var_name);
1386 if (!stringi_set_insert (&var_names, var_name))
1388 sys_warn (r, record->pos,
1389 _("Duplicate variable name %s "
1390 "at offset %zu in MRSETS record."),
1391 var_name, text_pos (text));
1397 if (mrset->label == NULL && mrset->label_from_var_label
1398 && var_has_label (var))
1399 mrset->label = xstrdup (var_get_label (var));
1402 && var_get_type (var) != var_get_type (mrset->vars[0]))
1404 sys_warn (r, record->pos,
1405 _("MRSET %s contains both string and "
1406 "numeric variables."), name);
1409 width = MIN (width, var_get_width (var));
1411 if (mrset->n_vars >= allocated_vars)
1412 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1413 sizeof *mrset->vars);
1414 mrset->vars[mrset->n_vars++] = var;
1416 while (delimiter != '\n');
1418 if (mrset->n_vars < 2)
1420 sys_warn (r, record->pos,
1421 _("MRSET %s has only %zu variables."), mrset->name,
1423 mrset_destroy (mrset);
1427 if (mrset->type == MRSET_MD)
1429 mrset->width = width;
1430 value_init (&mrset->counted, width);
1432 mrset->counted.f = strtod (counted, NULL);
1434 value_copy_str_rpad (&mrset->counted, width,
1435 (const uint8_t *) counted, ' ');
1438 dict_add_mrset (dict, mrset);
1440 stringi_set_destroy (&var_names);
1442 mrset_destroy (mrset);
1443 close_text_record (r, text);
/* parse_display_parameters applies per-variable display settings from
   RECORD to the variables of DICT, in dictionary order.  RECORD->count must
   equal either three times or two times the number of variables: three
   means each variable carries measure, width and alignment, two means the
   width is absent.  Any other count is warned about.  Each field is read
   with parse_int from RECORD->data at a running offset.  A measure outside
   1 to 3 or an alignment outside 0 to 2 is warned about (the WARNED flag
   suggests the warning is issued once per record; confirm).  Otherwise
   measure 1 maps to MEASURE_NOMINAL and 2 to MEASURE_ORDINAL, alignment 0
   maps to ALIGN_LEFT and 1 to ALIGN_RIGHT; the mappings for the remaining
   values, the adjustment made for a zero measure on string variables, and
   the guard around var_set_display_width are on lines missing from this
   listing. */
1446 /* Read record type 7, subtype 11, which specifies how variables
1447 should be displayed in GUI environments. */
1449 parse_display_parameters (struct sfm_reader *r,
1450 const struct sfm_extension_record *record,
1451 struct dictionary *dict)
1453 bool includes_width;
1454 bool warned = false;
1459 n_vars = dict_get_var_cnt (dict);
1460 if (record->count == 3 * n_vars)
1461 includes_width = true;
1462 else if (record->count == 2 * n_vars)
1463 includes_width = false;
1466 sys_warn (r, record->pos,
1467 _("Extension 11 has bad count %zu (for %zu variables)."),
1468 record->count, n_vars);
1473 for (i = 0; i < n_vars; ++i)
1475 struct variable *v = dict_get_var (dict, i);
1476 int measure, width, align;
1478 measure = parse_int (r, record->data, ofs);
1483 width = parse_int (r, record->data, ofs);
1489 align = parse_int (r, record->data, ofs);
1492 /* SPSS 14 sometimes seems to set string variables' measure
1494 if (0 == measure && var_is_alpha (v))
1497 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1500 sys_warn (r, record->pos,
1501 _("Invalid variable display parameters for variable "
1502 "%zu (%s). Default parameters substituted."),
1503 i, var_get_name (v));
1508 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1509 : measure == 2 ? MEASURE_ORDINAL
1511 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1512 : align == 1 ? ALIGN_RIGHT
1515 /* Older versions (SPSS 9.0) sometimes set the display
1516 width to zero. This causes confusion in the GUI, so
1517 only set the width if it is nonzero. */
1519 var_set_display_width (v, width);
/* rename_var_and_save_short_names renames VAR in DICT to NEW_NAME while
   keeping its short names.  It first duplicates every short name of VAR
   (null entries stay null) into a temporary array, calls dict_rename_var,
   then writes each saved short name back with var_set_short_name and frees
   the duplicate.
   NOTE(review): the release of the short_names array itself is not visible
   in this listing; confirm that it is freed after the restore loop. */
1524 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1525 const char *new_name)
1527 size_t n_short_names;
1531 /* Renaming a variable may clear its short names, but we
1532 want to retain them, so we save them and re-set them
1534 n_short_names = var_get_short_name_cnt (var);
1535 short_names = xnmalloc (n_short_names, sizeof *short_names);
1536 for (i = 0; i < n_short_names; i++)
1538 const char *s = var_get_short_name (var, i);
1539 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1542 /* Set long name. */
1543 dict_rename_var (dict, var, new_name);
1545 /* Restore short names. */
1546 for (i = 0; i < n_short_names; i++)
1548 var_set_short_name (var, i, short_names[i]);
1549 free (short_names[i]);
/* parse_long_var_name_map works in two passes.  The first pass lowercases
   the name of every variable in DICT, going through
   rename_var_and_save_short_names so the short names survive.  The second
   pass opens RECORD as UTF-8 text and, for each variable and long name pair
   returned by read_variable_to_value_pair:
     - warns when dict_id_is_valid rejects the long name;
     - warns about a duplicate when the long name differs (ignoring case)
       from the first short name of the variable and already names a
       variable in DICT;
     - otherwise renames the variable to the long name.
   NOTE(review): the statements that follow each warning (presumably a
   continue) and the release of the xstrdup copy made in the first pass are
   on lines missing from this listing; confirm them. */
1554 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1555 to each short name. Modifies variable names in DICT accordingly. */
1557 parse_long_var_name_map (struct sfm_reader *r,
1558 const struct sfm_extension_record *record,
1559 struct dictionary *dict)
1561 struct text_record *text;
1562 struct variable *var;
1567 /* Convert variable names to lowercase. */
1570 for (i = 0; i < dict_get_var_cnt (dict); i++)
1572 struct variable *var = dict_get_var (dict, i);
1575 new_name = xstrdup (var_get_name (var));
1576 str_lowercase (new_name);
1578 rename_var_and_save_short_names (dict, var, new_name);
1586 /* Rename each of the variables, one by one. (In a correctly constructed
1587 system file, this cannot create any intermediate duplicate variable names,
1588 because all of the new variable names are longer than any of the old
1589 variable names and thus there cannot be any overlaps.) */
1590 text = open_text_record (r, record, true);
1591 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1593 /* Validate long name. */
1594 /* XXX need to reencode name to UTF-8 */
1595 if (!dict_id_is_valid (dict, long_name, false))
1597 sys_warn (r, record->pos,
1598 _("Long variable mapping from %s to invalid "
1599 "variable name `%s'."),
1600 var_get_name (var), long_name);
1604 /* Identify any duplicates. */
1605 if (strcasecmp (var_get_short_name (var, 0), long_name)
1606 && dict_lookup_var (dict, long_name) != NULL)
1608 sys_warn (r, record->pos,
1609 _("Duplicate long variable name `%s'."), long_name);
1613 rename_var_and_save_short_names (dict, var, long_name);
1615 close_text_record (r, text);
/* parse_long_string_map merges the dictionary segments of each very long
   string into a single variable.  For every variable and length pair read
   from RECORD:
     - the length text is converted with strtol; a value below 1 or above
       MAX_STRING is warned about;
     - sfm_width_to_segments gives the number of segments; a result of 1 is
       warned about, and a run that would extend past the end of DICT is a
       fatal error;
     - the first short name of each segment is copied onto the head
       variable, and each segment width (rounded up to a multiple of 8) must
       match sfm_segment_alloc_width rounded the same way, otherwise a fatal
       error is raised;
     - the trailing segments are deleted and the head variable takes the
       full width.
   After the loop the text record is closed and dict_compact_values is
   called.
   NOTE(review): strtol is given a null end pointer, so trailing garbage in
   the length text goes undetected; confirm whether that is acceptable. */
1618 /* Reads record type 7, subtype 14, which gives the real length
1619 of each very long string. Rearranges DICT accordingly. */
1621 parse_long_string_map (struct sfm_reader *r,
1622 const struct sfm_extension_record *record,
1623 struct dictionary *dict)
1625 struct text_record *text;
1626 struct variable *var;
1629 text = open_text_record (r, record, true);
1630 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1632 size_t idx = var_get_dict_index (var);
1638 length = strtol (length_s, NULL, 10);
1639 if (length < 1 || length > MAX_STRING)
1641 sys_warn (r, record->pos,
1642 _("%s listed as string of invalid length %s "
1643 "in very long string record."),
1644 var_get_name (var), length_s);
1648 /* Check segments. */
1649 segment_cnt = sfm_width_to_segments (length);
1650 if (segment_cnt == 1)
1652 sys_warn (r, record->pos,
1653 _("%s listed in very long string record with width %s, "
1654 "which requires only one segment."),
1655 var_get_name (var), length_s);
1658 if (idx + segment_cnt > dict_get_var_cnt (dict))
1659 sys_error (r, record->pos,
1660 _("Very long string %s overflows dictionary."),
1661 var_get_name (var));
1663 /* Get the short names from the segments and check their
1665 for (i = 0; i < segment_cnt; i++)
1667 struct variable *seg = dict_get_var (dict, idx + i);
1668 int alloc_width = sfm_segment_alloc_width (length, i);
1669 int width = var_get_width (seg);
1672 var_set_short_name (var, i, var_get_short_name (seg, 0));
1673 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1674 sys_error (r, record->pos,
1675 _("Very long string with width %ld has segment %d "
1676 "of width %d (expected %d)."),
1677 length, i, width, alloc_width);
1679 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1680 var_set_width (var, length);
1682 close_text_record (r, text);
1683 dict_compact_values (dict);
/* parse_value_labels attaches the labels of one value label RECORD to the
   variables it names.  Steps visible in this listing:
     1. every label text is recoded to UTF-8 from the dictionary encoding
        into pool storage owned by R;
     2. every variable index in RECORD is resolved with lookup_var_by_index;
     3. all resolved variables must share the type of the first one,
        otherwise a fatal error is raised;
     4. for each variable and each label a value of the variable width is
        initialised, filled either with parse_float or with a memcpy of
        WIDTH bytes, and passed to var_add_value_label; a rejected label is
        reported as a duplicate;
     5. the temporary pool allocations are released.
   A fatal error is also raised for long string variables; the width test
   that selects it is on a line missing from this listing.
   NOTE(review): the two pool_nmalloc calls pass their count and element
   size arguments in opposite orders.  The product is the same, but confirm
   the intended order against the pool_nmalloc prototype. */
1687 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1688 const struct sfm_var_record *var_recs, size_t n_var_recs,
1689 const struct sfm_value_label_record *record)
1691 struct variable **vars;
1695 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1696 for (i = 0; i < record->n_labels; i++)
1697 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1698 record->labels[i].label, -1,
1701 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1702 for (i = 0; i < record->n_vars; i++)
1703 vars[i] = lookup_var_by_index (r, record->pos,
1704 var_recs, n_var_recs, record->vars[i]);
1706 for (i = 1; i < record->n_vars; i++)
1707 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1708 sys_error (r, record->pos,
1709 _("Variables associated with value label are not all of "
1710 "identical type. Variable %s is %s, but variable "
1712 var_get_name (vars[0]),
1713 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1714 var_get_name (vars[i]),
1715 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1717 for (i = 0; i < record->n_vars; i++)
1719 struct variable *var = vars[i];
1723 width = var_get_width (var);
1725 sys_error (r, record->pos,
1726 _("Value labels may not be added to long string "
1727 "variables (e.g. %s) using records types 3 and 4."),
1728 var_get_name (var));
1730 for (j = 0; j < record->n_labels; j++)
1732 struct sfm_value_label *label = &record->labels[j];
1735 value_init (&value, width);
1737 value.f = parse_float (r, label->value, 0);
1739 memcpy (value_str_rw (&value, width), label->value, width);
1741 if (!var_add_value_label (var, &value, utf8_labels[j]))
1743 if (var_is_numeric (var))
1744 sys_warn (r, record->pos,
1745 _("Duplicate value label for %g on %s."),
1746 value.f, var_get_name (var));
1748 sys_warn (r, record->pos,
1749 _("Duplicate value label for `%.*s' on %s."),
1750 width, value_str (&value, width),
1751 var_get_name (var));
1754 value_destroy (&value, width);
1758 pool_free (r->pool, vars);
1759 for (i = 0; i < record->n_labels; i++)
1760 pool_free (r->pool, utf8_labels[i]);
1761 pool_free (r->pool, utf8_labels);
/* lookup_var_by_index maps a 1-based variable record index to a variable.
   An index below 1 or above N_VAR_RECS raises a fatal error at OFFSET.  The
   record at position IDX - 1 of VAR_RECS is then examined; a record whose
   var member is null is treated as a long string continuation and also
   raises a fatal error.  The return statement is on a line missing from
   this listing; presumably the var member of the record is returned. */
1764 static struct variable *
1765 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1766 const struct sfm_var_record *var_recs, size_t n_var_recs,
1769 const struct sfm_var_record *rec;
1771 if (idx < 1 || idx > n_var_recs)
1773 sys_error (r, offset,
1774 _("Variable index %d not in valid range 1...%zu."),
1779 rec = &var_recs[idx - 1];
1780 if (rec->var == NULL)
1782 sys_error (r, offset,
1783 _("Variable index %d refers to long string continuation."),
/* parse_attributes reads attributes from TEXT until text_match consumes a
   slash.  Each attribute is a key terminated by an opening parenthesis
   followed by one or more values, each terminated by a new-line.  A value
   wrapped in single quote characters has both quotes removed before it is
   added to the attribute; an unquoted value is added unchanged (the visible
   message text indicates a warning is issued for it).  A closing
   parenthesis after a value ends the value list.  The finished attribute is
   either added to ATTRS or destroyed; the test that chooses between the two
   is on a line missing from this listing, presumably whether ATTRS is
   null.
   NOTE(review): the null checks on the key and value tokens are likewise
   not visible here; confirm them against the full source. */
1791 /* Parses a set of custom attributes from TEXT into ATTRS.
1792 ATTRS may be a null pointer, in which case the attributes are
1793 read but discarded. */
1795 parse_attributes (struct sfm_reader *r, struct text_record *text,
1796 struct attrset *attrs)
1800 struct attribute *attr;
1804 /* Parse the key. */
1805 key = text_get_token (text, ss_cstr ("("), NULL);
1809 attr = attribute_create (key);
1810 for (index = 1; ; index++)
1812 /* Parse the value. */
1816 value = text_get_token (text, ss_cstr ("\n"), NULL);
1819 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1824 length = strlen (value);
1825 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1827 value[length - 1] = '\0';
1828 attribute_add_value (attr, value + 1);
1833 _("Attribute value %s[%d] is not quoted: %s."),
1835 attribute_add_value (attr, value);
1838 /* Was this the last value for this attribute? */
1839 if (text_match (text, ')'))
1843 attrset_add (attrs, attr);
1845 attribute_destroy (attr);
1847 while (!text_match (text, '/'));
/* parse_data_file_attributes opens RECORD as UTF-8 text, hands it to
   parse_attributes together with the attribute set returned by
   dict_get_attributes for DICT, and closes the text record afterwards. */
1850 /* Reads record type 7, subtype 17, which lists custom
1851 attributes on the data file. */
1853 parse_data_file_attributes (struct sfm_reader *r,
1854 const struct sfm_extension_record *record,
1855 struct dictionary *dict)
1857 struct text_record *text = open_text_record (r, record, true);
1858 parse_attributes (r, text, dict_get_attributes (dict));
1859 close_text_record (r, text);
/* parse_variable_attributes opens RECORD as UTF-8 text and repeatedly reads
   a variable name terminated by a colon.  For each name the attributes that
   follow are parsed into the attribute set of the variable; when the lookup
   left VAR null, parse_attributes receives a null set so the attributes are
   read and discarded.  The text record is closed when
   text_read_variable_name returns false. */
1862 /* Parses record type 7, subtype 18, which lists custom
1863 attributes on individual variables. */
1865 parse_variable_attributes (struct sfm_reader *r,
1866 const struct sfm_extension_record *record,
1867 struct dictionary *dict)
1869 struct text_record *text;
1870 struct variable *var;
1872 text = open_text_record (r, record, true);
1873 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1874 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1875 close_text_record (r, text);
/* check_overflow verifies that LENGTH bytes starting at OFS lie inside the
   payload of RECORD, whose size is RECORD->size * RECORD->count.  When
   LENGTH is at least the payload size, or OFS + LENGTH runs past it, a
   fatal error is raised at the file offset just after the record payload.
   NOTE(review): OFS + LENGTH is computed in size_t without a wraparound
   check; the LENGTH >= end test limits LENGTH but not OFS, so confirm that
   callers never pass an OFS close to SIZE_MAX. */
1879 check_overflow (struct sfm_reader *r,
1880 const struct sfm_extension_record *record,
1881 size_t ofs, size_t length)
1883 size_t end = record->size * record->count;
1884 if (length >= end || ofs + length > end)
1885 sys_error (r, record->pos + end,
1886 _("Long string value label record ends unexpectedly."));
/* parse_long_string_value_labels walks the binary payload of RECORD, which
   holds value labels for long string variables.  Every read is preceded by
   check_overflow.  As far as the visible statements show, each entry has:
     - a 4-byte variable name length, then the name (recoded to UTF-8 from
       the dictionary encoding), then a 4-byte width and a 4-byte label
       count;
     - for each label, a 4-byte value length, the value bytes, a 4-byte
       label length and the label bytes.
   The entry is ignored with a warning when the variable is unknown, is
   numeric, or has a width different from the recorded one.  A value whose
   length differs from the width is warned about; otherwise it is copied
   into a pool-allocated value and the recoded label is added with
   var_add_value_label, with duplicates warned about.
   NOTE(review): the lengths come from parse_int, so a negative value in the
   file would become a very large size_t or a negative offset adjustment;
   confirm that check_overflow rejects every such case.  The offset
   increments after the 4-byte reads and the enclosing loop are on lines
   missing from this listing. */
1890 parse_long_string_value_labels (struct sfm_reader *r,
1891 const struct sfm_extension_record *record,
1892 struct dictionary *dict)
1894 const char *dict_encoding = dict_get_encoding (dict);
1895 size_t end = record->size * record->count;
1902 struct variable *var;
1907 /* Parse variable name length. */
1908 check_overflow (r, record, ofs, 4);
1909 var_name_len = parse_int (r, record->data, ofs);
1912 /* Parse variable name, width, and number of labels. */
1913 check_overflow (r, record, ofs, var_name_len + 8);
1914 var_name = recode_string_pool ("UTF-8", dict_encoding,
1915 (const char *) record->data + ofs,
1916 var_name_len, r->pool);
1917 width = parse_int (r, record->data, ofs + var_name_len);
1918 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1919 ofs += var_name_len + 8;
1921 /* Look up 'var' and validate. */
1922 var = dict_lookup_var (dict, var_name);
1924 sys_warn (r, record->pos + ofs,
1925 _("Ignoring long string value record for "
1926 "unknown variable %s."), var_name);
1927 else if (var_is_numeric (var))
1929 sys_warn (r, record->pos + ofs,
1930 _("Ignoring long string value record for "
1931 "numeric variable %s."), var_name);
1934 else if (width != var_get_width (var))
1936 sys_warn (r, record->pos + ofs,
1937 _("Ignoring long string value record for variable %s "
1938 "because the record's width (%d) does not match the "
1939 "variable's width (%d)."),
1940 var_name, width, var_get_width (var));
1945 value_init_pool (r->pool, &value, width);
1946 for (i = 0; i < n_labels; i++)
1948 size_t value_length, label_length;
1949 bool skip = var == NULL;
1951 /* Parse value length. */
1952 check_overflow (r, record, ofs, 4);
1953 value_length = parse_int (r, record->data, ofs);
1957 check_overflow (r, record, ofs, value_length);
1960 if (value_length == width)
1961 memcpy (value_str_rw (&value, width),
1962 (const uint8_t *) record->data + ofs, width);
1965 sys_warn (r, record->pos + ofs,
1966 _("Ignoring long string value %zu for variable "
1967 "%s, with width %d, that has bad value "
1969 i, var_get_name (var), width, value_length);
1973 ofs += value_length;
1975 /* Parse label length. */
1976 check_overflow (r, record, ofs, 4);
1977 label_length = parse_int (r, record->data, ofs);
1981 check_overflow (r, record, ofs, label_length);
1986 label = recode_string_pool ("UTF-8", dict_encoding,
1987 (const char *) record->data + ofs,
1988 label_length, r->pool);
1989 if (!var_add_value_label (var, &value, label))
1990 sys_warn (r, record->pos + ofs,
1991 _("Duplicate value label for `%.*s' on %s."),
1992 width, value_str (&value, width),
1993 var_get_name (var));
1994 pool_free (r->pool, label);
1996 ofs += label_length;
/* Forward declarations for the file-local helpers used when reading case
   data.  The helpers returning bool report through their return value
   whether the read succeeded; see the comment on each definition.
   NOTE(review): the partial_record declaration continues on a line missing
   from this listing (no terminating semicolon is visible). */
2003 static void partial_record (struct sfm_reader *r)
2006 static void read_error (struct casereader *, const struct sfm_reader *);
2008 static bool read_case_number (struct sfm_reader *, double *);
2009 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2010 static int read_opcode (struct sfm_reader *);
2011 static bool read_compressed_number (struct sfm_reader *, double *);
2012 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2013 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2014 static bool skip_whole_strings (struct sfm_reader *, size_t);
/* sys_file_casereader_read builds one case from the prototype R->proto and
   fills it field by field.  setjmp is armed on R->bail_out first, so a
   sys_error raised by any helper returns control here, where the reader is
   forced into the error state.  For each entry of R->sfm_vars a width of 0
   selects a numeric read into the value; otherwise a string segment of
   segment_width bytes is read into the value at the entry offset and the
   padding, rounded down to a multiple of 8, is skipped.  After a failed
   read, read_error is called only when R->case_cnt is not -1.
   NOTE(review): the case pointer is declared volatile, presumably because
   it is assigned between setjmp and a possible longjmp; the cleanup and
   return statements are on lines missing from this listing. */
2016 /* Reads and returns one case from READER's file. Returns a null
2017 pointer if not successful. */
2018 static struct ccase *
2019 sys_file_casereader_read (struct casereader *reader, void *r_)
2021 struct sfm_reader *r = r_;
2022 struct ccase *volatile c;
2028 c = case_create (r->proto);
2029 if (setjmp (r->bail_out))
2031 casereader_force_error (reader);
2036 for (i = 0; i < r->sfm_var_cnt; i++)
2038 struct sfm_var *sv = &r->sfm_vars[i];
2039 union value *v = case_data_rw_idx (c, sv->case_index);
2041 if (sv->var_width == 0)
2043 if (!read_case_number (r, &v->f))
2048 uint8_t *s = value_str_rw (v, sv->var_width);
2049 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2051 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2060 if (r->case_cnt != -1)
2061 read_error (reader, r);
/* partial_record reports, through sys_error at the current position R->pos,
   that the file ends in the middle of a case.  Because it goes through
   sys_error, control leaves via the longjmp performed there. */
2066 /* Issues an error that R ends in a partial record. */
2068 partial_record (struct sfm_reader *r)
2070 sys_error (r, r->pos, _("File ends in partial case."));
/* read_error emits an error message naming the file handle of SFM and then
   forces casereader R into the error state.  Unlike sys_error it returns
   normally to its caller. */
2073 /* Issues an error that an unspecified error occurred SFM, and
2076 read_error (struct casereader *r, const struct sfm_reader *sfm)
2078 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2079 casereader_force_error (r);
/* read_case_number has two paths.  On one path it reads raw bytes into a
   local buffer with try_read_bytes and converts them from R->float_format
   to the native double format with float_convert; on the other it delegates
   to read_compressed_number.  The test that selects the path is on a line
   missing from this listing; presumably it is the compressed flag of R. */
2082 /* Reads a number from R and stores its value in *D.
2083 If R is compressed, reads a compressed number;
2084 otherwise, reads a number in the regular way.
2085 Returns true if successful, false if end of file is
2086 reached immediately. */
2088 read_case_number (struct sfm_reader *r, double *d)
2093 if (!try_read_bytes (r, number, sizeof number))
2095 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2099 return read_compressed_number (r, d);
/* read_case_string splits LENGTH into a part that is a multiple of 8 bytes
   and a remainder of LENGTH % 8 bytes.  The whole part is read straight
   into S with read_whole_strings.  For the remainder a full chunk is read
   into a local bounce buffer and only the first PARTIAL bytes are copied to
   S + WHOLE, so S is never written past LENGTH bytes.  The guards around
   the two reads are on lines missing from this listing. */
2102 /* Reads LENGTH string bytes from R into S.
2103 Always reads a multiple of 8 bytes; if LENGTH is not a
2104 multiple of 8, then extra bytes are read and discarded without
2106 Reads compressed strings if S is compressed.
2107 Returns true if successful, false if end of file is
2108 reached immediately. */
2110 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2112 size_t whole = ROUND_DOWN (length, 8);
2113 size_t partial = length % 8;
2117 if (!read_whole_strings (r, s, whole))
2124 if (!read_whole_strings (r, bounce, sizeof bounce))
2130 memcpy (s + whole, bounce, partial);
/* read_opcode asserts that R is compressed, refills the R->opcodes buffer
   with try_read_bytes once R->opcode_idx has reached the size of that
   buffer, and then takes the opcode at R->opcode_idx, advancing the index.
   The handling of a failed refill, the reset of the index and the return
   statement are on lines missing from this listing. */
2136 /* Reads and returns the next compression opcode from R. */
2138 read_opcode (struct sfm_reader *r)
2140 assert (r->compressed);
2144 if (r->opcode_idx >= sizeof r->opcodes)
2146 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2150 opcode = r->opcodes[r->opcode_idx++];
/* read_compressed_number fetches one opcode and produces a double from it
   in one of three visible ways: by reading a literal float from the file
   with read_float, by converting a run of space characters as if it were a
   number (which also sets R->corruption_warning and warns once about
   possible corruption), or by computing OPCODE - R->bias.  The switch and
   its case labels are on lines missing from this listing, so the opcode
   values that select each path cannot be confirmed here.
   NOTE(review): whitespace in this listing appears to have been collapsed,
   so the string literal passed to float_convert may not show its true
   length; confirm that it supplies as many bytes as float_convert reads. */
2157 /* Reads a compressed number from R and stores its value in D.
2158 Returns true if successful, false if end of file is
2159 reached immediately. */
2161 read_compressed_number (struct sfm_reader *r, double *d)
2163 int opcode = read_opcode (r);
2171 *d = read_float (r);
2175 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2176 if (!r->corruption_warning)
2178 r->corruption_warning = true;
2179 sys_warn (r, r->pos,
2180 _("Possible compressed data corruption: "
2181 "compressed spaces appear in numeric field."));
2190 *d = opcode - r->bias;
/* read_compressed_string fetches one opcode and fills the 8 bytes at DST in
   one of three visible ways: by reading 8 literal bytes from the file, by
   filling with spaces, or by encoding OPCODE - R->bias as a number in
   R->float_format.  The last path is the unexpected one: apart from a case
   that the inner comment says was seen in real files, it sets
   R->corruption_warning and warns once.  The switch, its case labels and
   the condition attached to the inner comment are on lines missing from
   this listing. */
2197 /* Reads a compressed 8-byte string segment from R and stores it
2199 Returns true if successful, false if end of file is
2200 reached immediately. */
2202 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2204 int opcode = read_opcode (r);
2212 read_bytes (r, dst, 8);
2216 memset (dst, ' ', 8);
2221 double value = opcode - r->bias;
2222 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2225 /* This has actually been seen "in the wild". The submitter of the
2226 file that showed that the contents decoded as spaces, but they
2227 were at the end of the field so it's possible that the null
2228 bytes just acted as null terminators. */
2230 else if (!r->corruption_warning)
2232 r->corruption_warning = true;
2233 sys_warn (r, r->pos,
2234 _("Possible compressed data corruption: "
2235 "string contains compressed integer (opcode %d)."),
/* read_whole_strings asserts that LENGTH is a multiple of 8 and then either
   reads LENGTH raw bytes into S with try_read_bytes or decodes the data in
   8-byte steps with read_compressed_string.  The test that selects between
   the two paths and the return statements of the second path are on lines
   missing from this listing. */
2245 /* Reads LENGTH string bytes from R into S.
2246 LENGTH must be a multiple of 8.
2247 Reads compressed strings if S is compressed.
2248 Returns true if successful, false if end of file is
2249 reached immediately. */
2251 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2253 assert (length % 8 == 0);
2255 return try_read_bytes (r, s, length);
2259 for (ofs = 0; ofs < length; ofs += 8)
2260 if (!read_compressed_string (r, s + ofs))
/* skip_whole_strings discards LENGTH bytes by reading them into a local
   1024-byte scratch buffer with read_whole_strings and returns that result.
   NOTE(review): the assertion requires LENGTH to be strictly less than the
   buffer size, so a LENGTH of exactly 1024 trips it even though the buffer
   could hold that many bytes and the comment below describes the limit as
   1024; confirm whether the comparison was meant to be inclusive. */
2270 /* Skips LENGTH string bytes from R.
2271 LENGTH must be a multiple of 8.
2272 (LENGTH is also limited to 1024, but that's only because the
2273 current caller never needs more than that many bytes.)
2274 Returns true if successful, false if end of file is
2275 reached immediately. */
2277 skip_whole_strings (struct sfm_reader *r, size_t length)
2279 uint8_t buffer[1024];
2280 assert (length < sizeof buffer);
2281 return read_whole_strings (r, buffer, length);
/* Shared state for the text record helpers.  MAX_TEXT_WARNINGS is the
   threshold that text_warn and close_text_record compare the n_warnings
   counter against.  The member list below belongs to the record type that
   open_text_record allocates; the struct header line is missing from this
   listing. */
2284 /* Helpers for reading records that contain structured text
2287 /* Maximum number of warnings to issue for a single text
2289 #define MAX_TEXT_WARNINGS 5
2294 struct substring buffer; /* Record contents. */
2295 off_t start; /* Starting offset in file. */
2296 size_t pos; /* Current position in buffer. */
2297 int n_warnings; /* Number of warnings issued or suppressed. */
2298 bool recoded; /* Recoded into UTF-8? */
/* open_text_record allocates a text_record from the pool of R and
   initialises it from RECORD.  The payload (RECORD->size * RECORD->count
   bytes of RECORD->data) is wrapped as a substring; when RECODE_TO_UTF8 is
   true it is recoded from R->encoding to UTF-8 into pool storage, and the
   alternative used when it is false is on a line missing from this listing.
   The start offset is taken from RECORD->pos, the warning counter starts at
   zero and the recoded flag records the choice.  The initialisation of pos
   and the return statement are not visible here. */
2301 static struct text_record *
2302 open_text_record (struct sfm_reader *r,
2303 const struct sfm_extension_record *record,
2304 bool recode_to_utf8)
2306 struct text_record *text;
2307 struct substring raw;
2309 text = pool_alloc (r->pool, sizeof *text);
2310 raw = ss_buffer (record->data, record->size * record->count);
2311 text->start = record->pos;
2312 text->buffer = (recode_to_utf8
2313 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2316 text->n_warnings = 0;
2317 text->recoded = recode_to_utf8;
/* close_text_record reports how many warnings were suppressed when the
   counter of TEXT exceeds MAX_TEXT_WARNINGS, then returns the buffer data
   to the pool of R.
   NOTE(review): the guard in front of pool_free and the release of TEXT
   itself are on lines missing from this listing; confirm that a buffer
   that was not recoded is not handed to pool_free. */
2322 /* Closes TEXT, frees its storage, and issues a final warning
2323 about suppressed warnings if necessary. */
2325 close_text_record (struct sfm_reader *r, struct text_record *text)
2327 if (text->n_warnings > MAX_TEXT_WARNINGS)
2328 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2329 text->n_warnings - MAX_TEXT_WARNINGS);
2331 pool_free (r->pool, ss_data (text->buffer));
/* read_variable_to_value_pair reads a short variable name terminated by an
   equals sign with text_read_short_name, then a value token terminated by a
   tab or a null byte.  Afterwards the position of TEXT is advanced past any
   further run of tab or null bytes.  The loop structure, the handling of a
   missing value and the return statements are on lines missing from this
   listing. */
2334 /* Reads a variable=value pair from TEXT.
2335 Looks up the variable in DICT and stores it into *VAR.
2336 Stores a null-terminated value into *VALUE. */
2338 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2339 struct text_record *text,
2340 struct variable **var, char **value)
2344 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2347 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2351 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2352 ss_buffer ("\t\0", 2));
/* text_read_variable_name takes the next token of TEXT ending at one of
   DELIMITERS, looks it up in DICT with dict_lookup_var and stores the
   result in *VAR.  A warning is issued through text_warn for an unknown
   variable.  The null checks and the return statements are on lines
   missing from this listing; the caller visible in this file tests *VAR
   for null after a true return. */
2360 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2361 struct text_record *text, struct substring delimiters,
2362 struct variable **var)
2366 name = text_get_token (text, delimiters, NULL);
2370 *var = dict_lookup_var (dict, name);
2374 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* text_read_short_name takes the next token of TEXT ending at one of
   DELIMITERS as a short variable name, looks it up in DICT and stores the
   result in *VAR.  A null token is handled first, and an unknown name is
   reported through text_warn.  The return statements are on lines missing
   from this listing. */
2381 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2382 struct text_record *text, struct substring delimiters,
2383 struct variable **var)
2385 char *short_name = text_get_token (text, delimiters, NULL);
2386 if (short_name == NULL)
2389 *var = dict_lookup_var (dict, short_name);
2391 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* text_warn counts every call in TEXT->n_warnings but forwards only the
   first MAX_TEXT_WARNINGS of them to sys_msg as warnings, positioned at the
   file offset TEXT->start + TEXT->pos.  The counter keeps increasing after
   the limit so that close_text_record can report how many were dropped. */
2396 /* Displays a warning for the current file position, limiting the
2397 number to MAX_TEXT_WARNINGS for TEXT. */
2399 text_warn (struct sfm_reader *r, struct text_record *text,
2400 const char *format, ...)
2402 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2406 va_start (args, format);
2407 sys_msg (r, text->start + text->pos, MW, format, args);
/* text_get_token extracts the next token of TEXT delimited by any byte of
   DELIMITERS using ss_tokenize, which also advances TEXT->pos, and returns
   a pointer to the start of the token inside the buffer of TEXT.  END is
   set to the byte just past the token; callers in this file treat the
   result as a null-terminated string, so that byte is presumably
   overwritten with a terminator after the delimiter has been stored
   through DELIMITER when requested.  Those statements and the return for a
   failed tokenize are on lines missing from this listing. */
2413 text_get_token (struct text_record *text, struct substring delimiters,
2416 struct substring token;
2419 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2422 end = &ss_data (token)[ss_length (token)];
2423 if (delimiter != NULL)
2426 return ss_data (token);
/* text_parse_counted_string first accumulates decimal digits at the current
   position of TEXT into a byte count N.  A warning is issued when no digit
   was consumed or the buffer ended, when the count is not followed by a
   space, when N bytes from the current position would run past the end of
   the buffer, and when the string is not followed by a space.  The string
   itself starts at the position reached after the first space.
   NOTE(review): N is accumulated without an overflow check and is then
   added to TEXT->pos, so a very long digit run could wrap either value;
   confirm whether the record size bounds this in practice.  The position
   updates and the return statements are on lines missing from this
   listing. */
2429 /* Reads an integer value expressed in decimal, then a space, then a string that
2430 consists of exactly as many bytes as specified by the integer, then a space,
2431 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2432 buffer (so the caller should not free the string). */
2434 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2442 while (text->pos < text->buffer.length)
2444 int c = text->buffer.string[text->pos];
2445 if (c < '0' || c > '9')
2447 n = (n * 10) + (c - '0');
2450 if (text->pos >= text->buffer.length || start == text->pos)
2452 sys_warn (r, text->start,
2453 _("Expecting digit at offset %zu in MRSETS record."),
2458 if (!text_match (text, ' '))
2460 sys_warn (r, text->start,
2461 _("Expecting space at offset %zu in MRSETS record."),
2466 if (text->pos + n > text->buffer.length)
2468 sys_warn (r, text->start,
2469 _("%zu-byte string starting at offset %zu "
2470 "exceeds record length %zu."),
2471 n, text->pos, text->buffer.length);
2475 s = &text->buffer.string[text->pos];
2478 sys_warn (r, text->start,
2479 _("Expecting space at offset %zu following %zu-byte string."),
/* text_match compares the byte of TEXT at its current position with C.
   Callers in this file use the result as a boolean and rely on the position
   moving past a matched byte; the statements that do so are on lines
   missing from this listing.
   NOTE(review): the visible comparison reads buffer.string[pos] without
   first checking pos against buffer.length; confirm that the buffer is
   always terminated or that callers never reach the end of the record. */
2489 text_match (struct text_record *text, char c)
2491 if (text->buffer.string[text->pos] == c)
/* text_pos is an accessor for the current offset inside TEXT; its body is
   on lines missing from this listing, so only the contract stated in the
   comment below can be relied on here. */
2500 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2501 inside the TEXT's string. */
2503 text_pos (const struct text_record *text)
/* sys_msg formats one diagnostic about the file being read.  The text
   starts with the file name of R->fh, followed by the offset in hexadecimal
   on one of two paths (the selecting test is on a line missing from this
   listing; callers in this file pass -1 when no offset applies), and then
   FORMAT expanded with ARGS.  The category and severity of the message are
   derived from CLASS.  The remaining message fields, the emission of the
   message and the release of the text buffer are not visible here. */
2510 /* Displays a corruption message. */
2512 sys_msg (struct sfm_reader *r, off_t offset,
2513 int class, const char *format, va_list args)
2518 ds_init_empty (&text);
2520 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2521 fh_get_file_name (r->fh), (long long int) offset);
2523 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2524 ds_put_vformat (&text, format, args);
2526 m.category = msg_class_to_category (class);
2527 m.severity = msg_class_to_severity (class);
2533 m.text = ds_cstr (&text);
/* sys_warn is the variadic front end for warnings: it collects its
   arguments with va_start and forwards them to sys_msg with class MW.  The
   matching va_end is on a line missing from this listing. */
2538 /* Displays a warning for offset OFFSET in the file. */
2540 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2544 va_start (args, format);
2545 sys_msg (r, offset, MW, format, args);
/* sys_error is the variadic front end for fatal errors: it forwards its
   arguments to sys_msg with class ME and then leaves through longjmp to
   R->bail_out with value 1, so it never returns to its caller.  A matching
   setjmp must therefore be active on R->bail_out whenever this can be
   reached.  The statement that marks R as failed is on a line missing from
   this listing. */
2549 /* Displays an error for the current file position,
2550 marks it as in an error state,
2551 and aborts reading it using longjmp. */
2553 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2557 va_start (args, format);
2558 sys_msg (r, offset, ME, format, args);
2562 longjmp (r->bail_out, 1);
2565 /* Reads BYTE_CNT bytes into BUF.
2566 Returns true if exactly BYTE_CNT bytes are successfully read.
2567 Aborts if an I/O error or a partial read occurs.
2568 If EOF_IS_OK, then an immediate end-of-file causes false to be
2569 returned; otherwise, immediate end-of-file causes an abort
2572 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2573 void *buf, size_t byte_cnt)
/* Read from R's file and advance R's recorded position by however many
   bytes actually arrived, even when the read falls short. */
2575 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2576 r->pos += bytes_read;
/* Full read.  (The statement for this branch is not visible in this
   excerpt.) */
2577 if (bytes_read == byte_cnt)
/* Short read with the stream's error indicator set: report the message
   for errno and abort through sys_error. */
2579 else if (ferror (r->file))
2580 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
/* Short read without a stream error, i.e. end of file.  Abort unless
   end-of-file is acceptable to the caller and no bytes at all were read;
   a partial read always aborts. */
2581 else if (!eof_is_ok || bytes_read != 0)
2582 sys_error (r, r->pos, _("Unexpected end of file."));
2587 /* Reads BYTE_CNT bytes into BUF.
2588 Aborts upon I/O error or if end-of-file is encountered.
   Implemented as read_bytes_internal with EOF_IS_OK false, so even an
   immediate end-of-file aborts. */
2590 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2592 read_bytes_internal (r, false, buf, byte_cnt);
2595 /* Reads BYTE_CNT bytes into BUF.
2596 Returns true if exactly BYTE_CNT bytes are successfully read.
2597 Returns false if an immediate end-of-file is encountered.
2598 Aborts if an I/O error or a partial read occurs.
   Implemented as read_bytes_internal with EOF_IS_OK true; its result is
   returned unchanged. */
2600 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2602 return read_bytes_internal (r, true, buf, byte_cnt);
2605 /* Reads a 32-bit signed integer from R and returns its value in
   host format. */
2608 read_int (struct sfm_reader *r)
/* Fetch sizeof integer raw bytes from the file; read_bytes aborts on I/O
   error or end-of-file.  (INTEGER's declaration is not visible in this
   excerpt.) */
2611 read_bytes (r, integer, sizeof integer);
/* Decode the raw bytes according to the integer format recorded in R. */
2612 return integer_get (r->integer_format, integer, sizeof integer);
2615 /* Reads a 64-bit floating-point number from R and returns its
2616 value in host format.
   The raw bytes are fetched with read_bytes, which aborts on I/O error or
   end-of-file, and are then decoded according to the floating-point format
   recorded in R (r->float_format).
   NOTE(review): NUMBER's declaration is not visible in this excerpt. */
2618 read_float (struct sfm_reader *r)
2621 read_bytes (r, number, sizeof number);
2622 return float_get_double (r->float_format, number);
/* Decodes the 4 bytes that start OFS bytes into the in-memory buffer DATA
   as an integer in R's integer format (r->integer_format) and returns the
   result.  DATA is only read. */
2626 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
/* The cast to a byte pointer makes OFS a byte offset. */
2628 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
/* Decodes the floating-point number that starts OFS bytes into the
   in-memory buffer DATA, using R's floating-point format (r->float_format),
   and returns the result.  DATA is only read.
   NOTE(review): the number of bytes consumed is not explicit in this call;
   presumably it is implied by the format -- confirm. */
2632 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
/* The cast to a byte pointer makes OFS a byte offset. */
2634 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2637 /* Reads exactly SIZE - 1 bytes into BUFFER
2638 and stores a null byte into BUFFER[SIZE - 1].
   BUFFER must therefore have room for SIZE bytes, and SIZE must be at
   least 1: SIZE - 1 is computed in size_t and would wrap around for 0.
   The read aborts (through read_bytes) on I/O error or end-of-file.
   NOTE(review): a guard on SIZE may exist on a line not visible in this
   excerpt. */
2640 read_string (struct sfm_reader *r, char *buffer, size_t size)
2643 read_bytes (r, buffer, size - 1);
/* Terminate the string just past the bytes read. */
2644 buffer[size - 1] = '\0';
2647 /* Skips BYTES bytes forward in R.
   Skipping is done by reading into a scratch buffer and discarding the
   data, at most sizeof buffer bytes per read, so the usual read_bytes
   aborts on I/O error or end-of-file apply.
   NOTE(review): the buffer declaration, the loop header and the statement
   that reduces BYTES are not visible in this excerpt. */
2649 skip_bytes (struct sfm_reader *r, size_t bytes)
/* Never read more than the scratch buffer holds or than remains to skip. */
2654 size_t chunk = MIN (sizeof buffer, bytes);
2655 read_bytes (r, buffer, chunk);
2660 static const struct casereader_class sys_file_casereader_class =
2662 sys_file_casereader_read,
2663 sys_file_casereader_destroy,