1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/assertion.h>
29 #include <libpspp/message.h>
30 #include <libpspp/compiler.h>
31 #include <libpspp/misc.h>
32 #include <libpspp/pool.h>
33 #include <libpspp/str.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/array.h>
37 #include <data/case.h>
38 #include <data/casereader-provider.h>
39 #include <data/casereader.h>
40 #include <data/dictionary.h>
41 #include <data/file-handle-def.h>
42 #include <data/file-name.h>
43 #include <data/format.h>
44 #include <data/missing-values.h>
45 #include <data/short-names.h>
46 #include <data/value-labels.h>
47 #include <data/variable.h>
48 #include <data/value.h>
53 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 struct fh_lock *lock; /* Mutual exclusion for file handle. */
71 FILE *file; /* File stream. */
72 bool error; /* I/O or corruption error? */
73 size_t value_cnt; /* Number of "union value"s in struct case. */
76 enum integer_format integer_format; /* On-disk integer format. */
77 enum float_format float_format; /* On-disk floating point format. */
78 int oct_cnt; /* Number of 8-byte units per case. */
79 struct sfm_var *sfm_vars; /* Variables. */
80 size_t sfm_var_cnt; /* Number of variables. */
81 casenumber case_cnt; /* Number of cases */
82 bool has_long_var_names; /* File has a long variable name map */
85 bool compressed; /* File is compressed? */
86 double bias; /* Compression bias, usually 100.0. */
87 uint8_t opcodes[8]; /* Current block of opcodes. */
88 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 static const struct casereader_class sys_file_casereader_class;
93 static bool close_reader (struct sfm_reader *);
95 static struct variable **make_var_by_value_idx (struct sfm_reader *,
97 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
101 static void sys_warn (struct sfm_reader *, const char *, ...)
102 PRINTF_FORMAT (2, 3);
104 static void sys_error (struct sfm_reader *, const char *, ...)
108 static void read_bytes (struct sfm_reader *, void *, size_t);
109 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
110 static int read_int (struct sfm_reader *);
111 static double read_float (struct sfm_reader *);
112 static void read_string (struct sfm_reader *, char *, size_t);
113 static void skip_bytes (struct sfm_reader *, size_t);
115 static struct variable_to_value_map *open_variable_to_value_map (
116 struct sfm_reader *, size_t size);
117 static void close_variable_to_value_map (struct sfm_reader *r,
118 struct variable_to_value_map *);
119 static bool read_variable_to_value_map (struct sfm_reader *,
121 struct variable_to_value_map *,
122 struct variable **var, char **value,
125 static bool close_reader (struct sfm_reader *r);
127 /* Dictionary reader. */
135 static void read_header (struct sfm_reader *, struct dictionary *,
136 int *weight_idx, int *claimed_oct_cnt,
137 struct sfm_read_info *);
138 static void read_variable_record (struct sfm_reader *, struct dictionary *,
139 int *format_warning_cnt);
140 static void parse_format_spec (struct sfm_reader *, unsigned int,
141 enum which_format, struct variable *,
142 int *format_warning_cnt);
143 static void setup_weight (struct sfm_reader *, int weight_idx,
144 struct variable **var_by_value_idx,
145 struct dictionary *);
146 static void read_documents (struct sfm_reader *, struct dictionary *);
147 static void read_value_labels (struct sfm_reader *, struct dictionary *,
148 struct variable **var_by_value_idx);
150 static void read_extension_record (struct sfm_reader *, struct dictionary *,
151 struct sfm_read_info *);
152 static void read_machine_integer_info (struct sfm_reader *,
153 size_t size, size_t count,
154 struct sfm_read_info *);
155 static void read_machine_float_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *volatile info)
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 struct sfm_read_info local_info;
179 int format_warning_cnt = 0;
184 *dict = dict_create ();
186 /* Create and initialize reader. */
187 r = pool_create_container (struct sfm_reader, pool);
193 r->has_long_var_names = false;
194 r->opcode_idx = sizeof r->opcodes;
196 /* TRANSLATORS: this fragment will be interpolated into
197 messages in fh_lock() that identify types of files. */
198 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
202 r->file = fn_open (fh_get_file_name (fh), "rb");
205 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
206 fh_get_file_name (r->fh), strerror (errno));
210 /* Initialize info. */
213 memset (info, 0, sizeof *info);
215 if (setjmp (r->bail_out))
220 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
222 /* Read all the variable definition records. */
223 rec_type = read_int (r);
224 while (rec_type == 2)
226 read_variable_record (r, *dict, &format_warning_cnt);
227 rec_type = read_int (r);
230 /* Figure out the case format. */
231 var_by_value_idx = make_var_by_value_idx (r, *dict);
232 setup_weight (r, weight_idx, var_by_value_idx, *dict);
234 /* Read all the rest of the dictionary records. */
235 while (rec_type != 999)
240 read_value_labels (r, *dict, var_by_value_idx);
244 sys_error (r, _("Misplaced type 4 record."));
247 read_documents (r, *dict);
251 read_extension_record (r, *dict, info);
255 sys_error (r, _("Unrecognized record type %d."), rec_type);
257 rec_type = read_int (r);
261 if ( ! r->has_long_var_names )
264 for (i = 0; i < dict_get_var_cnt (*dict); i++)
266 struct variable *var = dict_get_var (*dict, i);
267 char short_name[SHORT_NAME_LEN + 1];
268 char long_name[SHORT_NAME_LEN + 1];
270 strcpy (short_name, var_get_name (var));
272 strcpy (long_name, short_name);
273 str_lowercase (long_name);
275 /* Set long name. Renaming a variable may clear the short
276 name, but we want to retain it, so re-set it
278 dict_rename_var (*dict, var, long_name);
279 var_set_short_name (var, 0, short_name);
282 r->has_long_var_names = true;
285 /* Read record 999 data, which is just filler. */
288 /* Warn if the actual amount of data per case differs from the
289 amount that the header claims. SPSS version 13 gets this
290 wrong when very long strings are involved, so don't warn in
292 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
293 && info->version_major != 13)
294 sys_warn (r, _("File header claims %d variable positions but "
295 "%d were read from file."),
296 claimed_oct_cnt, r->oct_cnt);
298 /* Create an index of dictionary variable widths for
299 sfm_read_case to use. We cannot use the `struct variable's
300 from the dictionary we created, because the caller owns the
301 dictionary and may destroy or modify its variables. */
302 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
303 pool_register (r->pool, free, r->sfm_vars);
305 pool_free (r->pool, var_by_value_idx);
306 r->value_cnt = dict_get_next_value_idx (*dict);
307 return casereader_create_sequential
309 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
310 &sys_file_casereader_class, r);
314 dict_destroy (*dict);
319 /* Closes a system file after we're done with it.
320 Returns true if an I/O error has occurred on READER, false
323 close_reader (struct sfm_reader *r)
332 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
334 msg (ME, _("Error closing system file \"%s\": %s."),
335 fh_get_file_name (r->fh), strerror (errno));
345 pool_destroy (r->pool);
350 /* Destroys READER. */
352 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
354 struct sfm_reader *r = r_;
358 /* Returns true if FILE is an SPSS system file,
361 sfm_detect (FILE *file)
365 if (fread (rec_type, 4, 1, file) != 1)
369 return !strcmp ("$FL2", rec_type);
372 /* Reads the global header of the system file.
373 Sets DICT's file label to the system file's label.
374 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
375 or to the value index of the weight variable otherwise.
376 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
377 per case that the file claims to have (although it is not
379 Initializes INFO with header information. */
381 read_header (struct sfm_reader *r, struct dictionary *dict,
382 int *weight_idx, int *claimed_oct_cnt,
383 struct sfm_read_info *info)
386 char eye_catcher[61];
387 uint8_t raw_layout_code[4];
389 char creation_date[10];
390 char creation_time[9];
392 struct substring file_label_ss;
393 struct substring product;
395 read_string (r, rec_type, sizeof rec_type);
396 read_string (r, eye_catcher, sizeof eye_catcher);
398 if (strcmp ("$FL2", rec_type) != 0)
399 sys_error (r, _("This is not an SPSS system file."));
401 /* Identify integer format. */
402 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
403 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
405 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
407 || (r->integer_format != INTEGER_MSB_FIRST
408 && r->integer_format != INTEGER_LSB_FIRST))
409 sys_error (r, _("This is not an SPSS system file."));
411 *claimed_oct_cnt = read_int (r);
412 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
413 *claimed_oct_cnt = -1;
415 r->compressed = read_int (r) != 0;
417 *weight_idx = read_int (r);
419 r->case_cnt = read_int (r);
420 if ( r->case_cnt > INT_MAX / 2)
424 /* Identify floating-point format and obtain compression bias. */
425 read_bytes (r, raw_bias, sizeof raw_bias);
426 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
428 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
430 if (memcmp (raw_bias, zero_bias, 8))
431 sys_warn (r, _("Compression bias is not the usual "
432 "value of 100, or system file uses unrecognized "
433 "floating-point format."));
436 /* Some software is known to write all-zeros to this
437 field. Such software also writes floating-point
438 numbers in the format that we expect by default
439 (it seems that all software most likely does, in
440 reality), so don't warn in this case. */
443 if (r->integer_format == INTEGER_MSB_FIRST)
444 r->float_format = FLOAT_IEEE_DOUBLE_BE;
446 r->float_format = FLOAT_IEEE_DOUBLE_LE;
448 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
450 read_string (r, creation_date, sizeof creation_date);
451 read_string (r, creation_time, sizeof creation_time);
452 read_string (r, file_label, sizeof file_label);
455 file_label_ss = ss_cstr (file_label);
456 ss_trim (&file_label_ss, ss_cstr (" "));
457 if (!ss_is_empty (file_label_ss))
459 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
460 dict_set_label (dict, ss_data (file_label_ss));
463 strcpy (info->creation_date, creation_date);
464 strcpy (info->creation_time, creation_time);
465 info->integer_format = r->integer_format;
466 info->float_format = r->float_format;
467 info->compressed = r->compressed;
468 info->case_cnt = r->case_cnt;
470 product = ss_cstr (eye_catcher);
471 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
472 ss_trim (&product, ss_cstr (" "));
473 str_copy_buf_trunc (info->product, sizeof info->product,
474 ss_data (product), ss_length (product));
477 /* Reads a variable (type 2) record from R and adds the
478 corresponding variable to DICT.
479 Also skips past additional variable records for long string
482 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
483 int *format_warning_cnt)
486 int has_variable_label;
487 int missing_value_code;
492 struct variable *var;
495 width = read_int (r);
496 has_variable_label = read_int (r);
497 missing_value_code = read_int (r);
498 print_format = read_int (r);
499 write_format = read_int (r);
500 read_string (r, name, sizeof name);
501 name[strcspn (name, " ")] = '\0';
503 /* Check variable name. */
504 if (name[0] == '$' || name[0] == '#')
505 sys_error (r, "Variable name begins with invalid character `%c'.",
507 if (!var_is_plausible_name (name, false))
508 sys_error (r, _("Invalid variable name `%s'."), name);
510 /* Create variable. */
511 if (width < 0 || width > 255)
512 sys_error (r, _("Bad variable width %d."), width);
513 var = dict_create_var (dict, name, width);
516 _("Duplicate variable name `%s' within system file."),
519 /* Set the short name the same as the long name. */
520 var_set_short_name (var, 0, var_get_name (var));
522 /* Get variable label, if any. */
523 if (has_variable_label != 0 && has_variable_label != 1)
524 sys_error (r, _("Variable label indicator field is not 0 or 1."));
525 if (has_variable_label == 1)
531 if (len >= sizeof label)
532 sys_error (r, _("Variable %s has label of invalid length %zu."),
534 read_string (r, label, len + 1);
535 var_set_label (var, label);
537 skip_bytes (r, ROUND_UP (len, 4) - len);
540 /* Set missing values. */
541 if (missing_value_code != 0)
543 struct missing_values mv;
546 mv_init (&mv, var_get_width (var));
547 if (var_is_numeric (var))
549 if (missing_value_code < -3 || missing_value_code > 3
550 || missing_value_code == -1)
551 sys_error (r, _("Numeric missing value indicator field is not "
552 "-3, -2, 0, 1, 2, or 3."));
553 if (missing_value_code < 0)
555 double low = read_float (r);
556 double high = read_float (r);
557 mv_add_range (&mv, low, high);
558 missing_value_code = -missing_value_code - 2;
560 for (i = 0; i < missing_value_code; i++)
561 mv_add_num (&mv, read_float (r));
565 if (missing_value_code < 1 || missing_value_code > 3)
566 sys_error (r, _("String missing value indicator field is not "
568 if (var_is_long_string (var))
569 sys_warn (r, _("Ignoring missing values on long string variable "
570 "%s, which PSPP does not yet support."), name);
571 for (i = 0; i < missing_value_code; i++)
574 read_string (r, string, sizeof string);
575 mv_add_str (&mv, string);
578 if (!var_is_long_string (var))
579 var_set_missing_values (var, &mv);
583 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
584 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
586 /* Account for values.
587 Skip long string continuation records, if any. */
588 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
594 for (i = 1; i < nv; i++)
596 /* Check for record type 2 and width -1. */
597 if (read_int (r) != 2 || read_int (r) != -1)
598 sys_error (r, _("Missing string continuation record."));
600 /* Skip and ignore remaining continuation data. */
601 has_variable_label = read_int (r);
602 missing_value_code = read_int (r);
603 print_format = read_int (r);
604 write_format = read_int (r);
605 read_string (r, name, sizeof name);
607 /* Variable label fields on continuation records have
608 been spotted in system files created by "SPSS Power
609 Macintosh Release 6.1". */
610 if (has_variable_label)
611 skip_bytes (r, ROUND_UP (read_int (r), 4));
616 /* Translates the format spec from sysfile format to internal
619 parse_format_spec (struct sfm_reader *r, unsigned int s,
620 enum which_format which, struct variable *v,
621 int *format_warning_cnt)
623 const int max_format_warnings = 8;
625 uint8_t raw_type = s >> 16;
631 if (!fmt_from_io (raw_type, &f.type))
632 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
637 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
642 if (which == PRINT_FORMAT)
643 var_set_print_format (v, &f);
645 var_set_write_format (v, &f);
647 else if (++*format_warning_cnt <= max_format_warnings) /* Increment the caller's counter, not the pointer to it. */
649 char fmt_string[FMT_STRING_LEN_MAX + 1];
650 sys_warn (r, _("%s variable %s has invalid %s format %s."),
651 var_is_numeric (v) ? _("Numeric") : _("String"),
653 which == PRINT_FORMAT ? _("print") : _("write"),
654 fmt_to_string (&f, fmt_string));
656 if (*format_warning_cnt == max_format_warnings)
657 sys_warn (r, _("Suppressing further invalid format warnings."));
661 /* Sets the weighting variable in DICT to the variable
662 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
665 setup_weight (struct sfm_reader *r, int weight_idx,
666 struct variable **var_by_value_idx, struct dictionary *dict)
670 struct variable *weight_var
671 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
672 if (var_is_numeric (weight_var))
673 dict_set_weight (dict, weight_var);
675 sys_error (r, _("Weighting variable must be numeric."));
679 /* Reads a document record, type 6, from system file R, and sets up
680 the documents and n_documents fields in the associated
683 read_documents (struct sfm_reader *r, struct dictionary *dict)
688 if (dict_get_documents (dict) != NULL)
689 sys_error (r, _("Multiple type 6 (document) records."));
691 line_cnt = read_int (r);
693 sys_error (r, _("Number of document lines (%d) "
694 "must be greater than 0."), line_cnt);
696 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
697 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
698 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
699 dict_set_documents (dict, documents);
701 sys_error (r, _("Document line contains null byte."));
702 pool_free (r->pool, documents);
705 /* Read a type 7 extension record. */
707 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
708 struct sfm_read_info *info)
710 int subtype = read_int (r);
711 size_t size = read_int (r);
712 size_t count = read_int (r);
713 size_t bytes = size * count;
715 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
716 allows an extra byte for a null terminator, used by some
717 extension processing routines. */
718 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
719 sys_error (r, _("Record type 7 subtype %d too large."), subtype); /* Marked with _() so it is translatable like the other diagnostics. */
724 read_machine_integer_info (r, size, count, info);
728 read_machine_float_info (r, size, count);
732 /* Variable sets information. We don't use these yet.
733 They only apply to GUIs; see VARSETS on the APPLY
734 DICTIONARY command in SPSS documentation. */
738 /* DATE variable information. We don't use it yet, but we
743 /* Used by the MRSETS command. */
747 /* Used by the SPSS Data Entry software. */
751 read_display_parameters (r, size, count, dict);
755 read_long_var_name_map (r, size, count, dict);
759 read_long_string_map (r, size, count, dict);
763 /* New in SPSS v14? Unknown purpose. */
767 /* Text field that defines variable attributes. New in
772 /* New in SPSS 16. Contains a single string that describes
773 the character encoding, e.g. "windows-1252". */
777 /* New in SPSS 16. Encodes value labels for long string
779 sys_warn (r, _("Ignoring value labels for long string variables, "
780 "which PSPP does not yet support."));
784 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"),
785 subtype, PACKAGE_BUGREPORT);
789 skip_bytes (r, bytes);
792 /* Read record type 7, subtype 3. */
794 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
795 struct sfm_read_info *info)
797 int version_major = read_int (r);
798 int version_minor = read_int (r);
799 int version_revision = read_int (r);
800 int machine_code UNUSED = read_int (r);
801 int float_representation = read_int (r);
802 int compression_code UNUSED = read_int (r);
803 int integer_representation = read_int (r);
804 int character_code UNUSED = read_int (r);
806 int expected_float_format;
807 int expected_integer_format;
809 if (size != 4 || count != 8)
810 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
814 /* Save version info. */
815 info->version_major = version_major;
816 info->version_minor = version_minor;
817 info->version_revision = version_revision;
819 /* Check floating point format. */
820 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
821 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
822 expected_float_format = 1;
823 else if (r->float_format == FLOAT_Z_LONG)
824 expected_float_format = 2;
825 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
826 expected_float_format = 3;
829 if (float_representation != expected_float_format)
830 sys_error (r, _("Floating-point representation indicated by "
831 "system file (%d) differs from expected (%d)."),
832 float_representation, expected_float_format); /* Report the code read from the file, not the internal float_format enum. */
834 /* Check integer format. */
835 if (r->integer_format == INTEGER_MSB_FIRST)
836 expected_integer_format = 1;
837 else if (r->integer_format == INTEGER_LSB_FIRST)
838 expected_integer_format = 2;
841 if (integer_representation != expected_integer_format)
843 static const char *const endian[] = {N_("little-endian"), N_("big-endian")};
844 sys_warn (r, _("Integer format indicated by system file (%s) "
845 "differs from expected (%s)."),
846 gettext (endian[integer_representation == 1]),
847 gettext (endian[expected_integer_format == 1]));
851 /* Read record type 7, subtype 4. */
853 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
855 double sysmis = read_float (r);
856 double highest = read_float (r);
857 double lowest = read_float (r);
859 if (size != 8 || count != 3)
860 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
863 if (sysmis != SYSMIS)
864 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
865 if (highest != HIGHEST)
866 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
867 if (lowest != LOWEST)
868 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
871 /* Read record type 7, subtype 11, which specifies how variables
872 should be displayed in GUI environments. */
874 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
875 struct dictionary *dict)
884 sys_warn (r, _("Bad size %zu on extension 11."), size);
885 skip_bytes (r, size * count);
889 n_vars = dict_get_var_cnt (dict);
890 if (count == 3 * n_vars)
891 includes_width = true;
892 else if (count == 2 * n_vars)
893 includes_width = false;
896 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
898 skip_bytes (r, size * count);
902 for (i = 0; i < n_vars; ++i)
904 struct variable *v = dict_get_var (dict, i);
905 int measure = read_int (r);
906 int width = includes_width ? read_int (r) : 0;
907 int align = read_int (r);
909 /* SPSS 14 sometimes seems to set string variables' measure
911 if (0 == measure && var_is_alpha (v))
914 if (measure < 1 || measure > 3 || align < 0 || align > 2)
917 sys_warn (r, _("Invalid variable display parameters "
918 "for variable %zu (%s). "
919 "Default parameters substituted."),
920 i, var_get_name (v));
925 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
926 : measure == 2 ? MEASURE_ORDINAL
928 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
929 : align == 1 ? ALIGN_RIGHT
932 /* Older versions (SPSS 9.0) sometimes set the display
933 width to zero. This causes confusion in the GUI, so
934 only set the width if it is nonzero. */
936 var_set_display_width (v, width);
940 /* Reads record type 7, subtype 13, which gives the long name
941 that corresponds to each short name. Modifies variable names
942 in DICT accordingly. */
944 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
945 struct dictionary *dict)
947 struct variable_to_value_map *map;
948 struct variable *var;
952 map = open_variable_to_value_map (r, size * count);
953 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
957 size_t short_name_cnt;
960 /* Validate long name. */
961 if (!var_is_valid_name (long_name, false))
963 sys_warn (r, _("Long variable mapping from %s to invalid "
964 "variable name `%s'."),
965 var_get_name (var), long_name);
969 /* Identify any duplicates. */
970 if (strcasecmp (var_get_short_name (var, 0), long_name)
971 && dict_lookup_var (dict, long_name) != NULL)
973 sys_warn (r, _("Duplicate long variable name `%s' "
974 "within system file."), long_name);
978 /* Renaming a variable may clear its short names, but we
979 want to retain them, so we save them and re-set them
981 short_name_cnt = var_get_short_name_cnt (var);
982 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
983 for (i = 0; i < short_name_cnt; i++)
985 const char *s = var_get_short_name (var, i);
986 short_names[i] = s != NULL ? xstrdup (s) : NULL;
990 dict_rename_var (dict, var, long_name);
992 /* Restore short names. */
993 for (i = 0; i < short_name_cnt; i++)
995 var_set_short_name (var, i, short_names[i]);
996 free (short_names[i]);
1000 close_variable_to_value_map (r, map);
1001 r->has_long_var_names = true;
1004 /* Reads record type 7, subtype 14, which gives the real length
1005 of each very long string. Rearranges DICT accordingly. */
1007 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1008 struct dictionary *dict)
1010 struct variable_to_value_map *map;
1011 struct variable *var;
1013 int warning_cnt = 0;
1015 map = open_variable_to_value_map (r, size * count);
1016 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
1019 size_t idx = var_get_dict_index (var);
1025 length = strtol (length_s, NULL, 10);
1026 if (length < 1 || length > MAX_STRING)
1028 sys_warn (r, _("%s listed as string of invalid length %s "
1029 "in very long string record."),
1030 var_get_name (var), length_s);
1034 /* Check segments. */
1035 segment_cnt = sfm_width_to_segments (length);
1036 if (segment_cnt == 1)
1038 sys_warn (r, _("%s listed in very long string record with width %s, "
1039 "which requires only one segment."),
1040 var_get_name (var), length_s);
1043 if (idx + segment_cnt > dict_get_var_cnt (dict))
1044 sys_error (r, _("Very long string %s overflows dictionary."),
1045 var_get_name (var));
1047 /* Get the short names from the segments and check their
1049 for (i = 0; i < segment_cnt; i++)
1051 struct variable *seg = dict_get_var (dict, idx + i);
1052 int alloc_width = sfm_segment_alloc_width (length, i);
1053 int width = var_get_width (seg);
1056 var_set_short_name (var, i, var_get_short_name (seg, 0));
1057 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1058 sys_error (r, _("Very long string with width %ld has segment %d "
1059 "of width %d (expected %d)"),
1060 length, i, width, alloc_width);
1062 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1063 var_set_width (var, length);
1065 close_variable_to_value_map (r, map);
1066 dict_compact_values (dict);
1069 /* Reads value labels from system file R and inserts them into the
1070 associated dictionary DICT. */
1072 read_value_labels (struct sfm_reader *r,
1073 struct dictionary *dict, struct variable **var_by_value_idx)
1075 struct pool *subpool;
1079 char raw_value[8]; /* Value as uninterpreted bytes. */
1080 union value value; /* Value. */
1081 char *label; /* Null-terminated label string. */
1084 struct label *labels = NULL;
1085 int label_cnt; /* Number of labels. */
1087 struct variable **var = NULL; /* Associated variables. */
1088 int var_cnt; /* Number of associated variables. */
1092 subpool = pool_create_subpool (r->pool);
1094 /* Read the type 3 record and record its contents. We can't do
1095 much with the data yet because we don't know whether it is
1096 of numeric or string type. */
1098 /* Read number of labels. */
1099 label_cnt = read_int (r);
1101 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1103 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1108 /* Read each value/label tuple into labels[]. */
1109 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1110 for (i = 0; i < label_cnt; i++)
1112 struct label *label = labels + i;
1113 unsigned char label_len;
1117 read_bytes (r, label->raw_value, sizeof label->raw_value);
1119 /* Read label length. */
1120 read_bytes (r, &label_len, sizeof label_len);
1121 padded_len = ROUND_UP (label_len + 1, 8);
1123 /* Read label, padding. */
1124 label->label = pool_alloc (subpool, padded_len + 1);
1125 read_bytes (r, label->label, padded_len - 1);
1126 label->label[label_len] = 0;
1129 /* Now, read the type 4 record that has the list of variables
1130 to which the value labels are to be applied. */
1132 /* Read record type of type 4 record. */
1133 if (read_int (r) != 4)
1134 sys_error (r, _("Variable index record (type 4) does not immediately "
1135 "follow value label record (type 3) as it should."));
1137 /* Read number of variables associated with value label from type 4
1139 var_cnt = read_int (r);
1140 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1141 sys_error (r, _("Number of variables associated with a value label (%d) "
1142 "is not between 1 and the number of variables (%zu)."),
1143 var_cnt, dict_get_var_cnt (dict));
1145 /* Read the list of variables. */
1146 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1147 for (i = 0; i < var_cnt; i++)
1149 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1150 if (var_is_long_string (var[i]))
1151 sys_error (r, _("Value labels are not allowed on long string "
1152 "variables (%s)."), var_get_name (var[i]));
1155 /* Type check the variables. */
1156 for (i = 1; i < var_cnt; i++)
1157 if (var_get_type (var[i]) != var_get_type (var[0]))
1158 sys_error (r, _("Variables associated with value label are not all of "
1159 "identical type. Variable %s is %s, but variable "
1161 var_get_name (var[0]),
1162 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1163 var_get_name (var[i]),
1164 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1166 /* Fill in labels[].value, now that we know the desired type. */
1167 for (i = 0; i < label_cnt; i++)
1169 struct label *label = labels + i;
1171 if (var_is_alpha (var[0]))
1172 buf_copy_rpad (label->value.s, sizeof label->value.s,
1173 label->raw_value, sizeof label->raw_value);
1175 label->value.f = float_get_double (r->float_format, label->raw_value);
1178 /* Assign the `value_label's to each variable. */
1179 for (i = 0; i < var_cnt; i++)
1181 struct variable *v = var[i];
1184 /* Add each label to the variable. */
1185 for (j = 0; j < label_cnt; j++)
1187 struct label *label = &labels[j];
1188 if (!var_add_value_label (v, &label->value, label->label))
1190 if (var_is_numeric (var[0]))
1191 sys_warn (r, _("Duplicate value label for %g on %s."),
1192 label->value.f, var_get_name (v));
1194 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1195 var_get_width (v), label->value.s,
1201 pool_destroy (subpool);
1206 static void partial_record (struct sfm_reader *r)
1209 static void read_error (struct casereader *, const struct sfm_reader *);
1211 static bool read_case_number (struct sfm_reader *, double *);
1212 static bool read_case_string (struct sfm_reader *, char *, size_t);
1213 static int read_opcode (struct sfm_reader *);
1214 static bool read_compressed_number (struct sfm_reader *, double *);
1215 static bool read_compressed_string (struct sfm_reader *, char *);
1216 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1217 static bool skip_whole_strings (struct sfm_reader *, size_t);
1219 /* Reads one case from READER's file into C. Returns true only
1222 sys_file_casereader_read (struct casereader *reader, void *r_,
1225 struct sfm_reader *r = r_;
1231 case_create (c, r->value_cnt);
1232 if (setjmp (r->bail_out))
1234 casereader_force_error (reader);
1239 for (i = 0; i < r->sfm_var_cnt; i++)
1241 struct sfm_var *sv = &r->sfm_vars[i];
1242 union value *v = case_data_rw_idx (c, sv->case_index);
1246 if (!read_case_number (r, &v->f))
1251 if (!read_case_string (r, v->s + sv->offset, sv->width))
1253 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1263 if (r->case_cnt != -1)
1264 read_error (reader, r);
/* Reports, through sys_error(), that R's file stops in the
   middle of a case.  Does not return to the caller. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1275 /* Issues an error that an unspecified error occurred SFM, and
1278 read_error (struct casereader *r, const struct sfm_reader *sfm)
1280 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1281 casereader_force_error (r);
1284 /* Reads a number from R and stores its value in *D.
1285 If R is compressed, reads a compressed number;
1286 otherwise, reads a number in the regular way.
1287 Returns true if successful, false if end of file is
1288 reached immediately. */
1290 read_case_number (struct sfm_reader *r, double *d)
1295 if (!try_read_bytes (r, number, sizeof number))
1297 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1301 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.
   The file stores strings in 8-byte segments, so a multiple of 8
   bytes is always consumed; when LENGTH is not a multiple of 8
   the surplus bytes of the last segment are read into a scratch
   buffer and dropped rather than written to S.
   Reads compressed strings if R is compressed.
   Returns true on success, false if end of file is reached
   immediately.  End of file after some whole segments were
   already read is reported as a partial record. */
static bool
read_case_string (struct sfm_reader *r, char *s, size_t length)
{
  size_t tail_len = length % 8;
  size_t body_len = length - tail_len;
  char segment[8];

  if (body_len != 0 && !read_whole_strings (r, s, body_len))
    return false;

  if (tail_len == 0)
    return true;

  if (!read_whole_strings (r, segment, sizeof segment))
    {
      if (body_len != 0)
        partial_record (r);
      return false;
    }

  memcpy (s + body_len, segment, tail_len);
  return true;
}
1338 /* Reads and returns the next compression opcode from R. */
1340 read_opcode (struct sfm_reader *r)
1342 assert (r->compressed);
1346 if (r->opcode_idx >= sizeof r->opcodes)
1348 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1352 opcode = r->opcodes[r->opcode_idx++];
1359 /* Reads a compressed number from R and stores its value in D.
1360 Returns true if successful, false if end of file is
1361 reached immediately. */
1363 read_compressed_number (struct sfm_reader *r, double *d)
1365 int opcode = read_opcode (r);
1373 *d = read_float (r);
1377 sys_error (r, _("Compressed data is corrupt."));
1384 *d = opcode - r->bias;
/* Reads a compressed 8-byte string segment from R and stores it
   in DST.
   Opcode 253 means the 8 bytes follow literally in the file and
   254 means 8 spaces; every other opcode except -1 and 252 is
   reported as corruption.
   Returns true on success, false if read_opcode() reports end of
   file or the opcode is 252. */
static bool
read_compressed_string (struct sfm_reader *r, char *dst)
{
  int opcode = read_opcode (r);

  if (opcode == -1 || opcode == 252)
    return false;
  else if (opcode == 253)
    read_bytes (r, dst, 8);
  else if (opcode == 254)
    memset (dst, ' ', 8);
  else
    sys_error (r, _("Compressed data is corrupt."));

  return true;
}
1419 /* Reads LENGTH string bytes from R into S.
1420 LENGTH must be a multiple of 8.
1421 Reads compressed strings if S is compressed.
1422 Returns true if successful, false if end of file is
1423 reached immediately. */
1425 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1427 assert (length % 8 == 0);
1429 return try_read_bytes (r, s, length);
1433 for (ofs = 0; ofs < length; ofs += 8)
1434 if (!read_compressed_string (r, s + ofs))
/* Skips LENGTH string bytes from R by reading them into a
   scratch buffer that is then discarded.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, the size of the scratch
   buffer, but that's only because the current caller never needs
   more than that many bytes.)
   Returns true if successful, false if end of file is
   reached immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  char buffer[1024];

  /* A LENGTH of exactly sizeof buffer still fits, so the bound
     is inclusive. */
  assert (length <= sizeof buffer);
  return read_whole_strings (r, buffer, length);
}
1458 /* Creates and returns a table that can be used for translating a value
1459 index into a case to a "struct variable *" for DICT. Multiple
1460 system file fields reference variables this way.
1462 This table must be created before processing the very long
1463 string extension record, because that record causes some
1464 values to be deleted from the case and the dictionary to be
1466 static struct variable **
1467 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1469 struct variable **var_by_value_idx;
1473 var_by_value_idx = pool_nmalloc (r->pool,
1474 r->oct_cnt, sizeof *var_by_value_idx);
1475 for (i = 0; i < dict_get_var_cnt (dict); i++)
1477 struct variable *v = dict_get_var (dict, i);
1478 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1481 var_by_value_idx[value_idx++] = v;
1482 for (j = 1; j < nv; j++)
1483 var_by_value_idx[value_idx++] = NULL;
1485 assert (value_idx == r->oct_cnt);
1487 return var_by_value_idx;
1490 /* Returns the "struct variable" corresponding to the given
1491 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
1493 static struct variable *
1494 lookup_var_by_value_idx (struct sfm_reader *r,
1495 struct variable **var_by_value_idx, int value_idx)
1497 struct variable *var;
1499 if (value_idx < 1 || value_idx > r->oct_cnt)
1500 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1501 value_idx, r->oct_cnt);
1503 var = var_by_value_idx[value_idx - 1];
1505 sys_error (r, _("Variable index %d refers to long string "
/* Returns the variable in D whose first short name equals
   SHORT_NAME, ignoring case, or a null pointer if there is
   none. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  struct variable *candidate;
  size_t n_vars;
  size_t idx;

  /* Fast path: the variable's full name is often the same as
     its short name. */
  candidate = dict_lookup_var (d, short_name);
  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
    return candidate;

  /* Slow path: check every variable in the dictionary. */
  n_vars = dict_get_var_cnt (d);
  for (idx = 0; idx < n_vars; idx++)
    {
      candidate = dict_get_var (d, idx);
      if (strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
        return candidate;
    }

  return NULL;
}
1538 /* Helpers for reading records that contain "variable=value"
1542 struct variable_to_value_map
1544 struct substring buffer; /* Record contents. */
1545 size_t pos; /* Current position in buffer. */
1548 /* Reads SIZE bytes into a "variable=value" map for R,
1549 and returns the map. */
1550 static struct variable_to_value_map *
1551 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1553 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1554 char *buffer = pool_malloc (r->pool, size + 1);
1555 read_bytes (r, buffer, size);
1556 map->buffer = ss_buffer (buffer, size);
1561 /* Closes MAP and frees its storage.
1562 Not really needed, because the pool will free the map anyway,
1563 but can be used to free it earlier. */
1565 close_variable_to_value_map (struct sfm_reader *r,
1566 struct variable_to_value_map *map)
1568 pool_free (r->pool, ss_data (map->buffer));
1571 /* Reads the next variable=value pair from MAP.
1572 Looks up the variable in DICT and stores it into *VAR.
1573 Stores a null-terminated value into *VALUE. */
1575 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1576 struct variable_to_value_map *map,
1577 struct variable **var, char **value,
1580 int max_warnings = 5;
1584 struct substring short_name_ss, value_ss;
1586 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1587 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1590 if (*warning_cnt > max_warnings)
1591 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1592 *warning_cnt - max_warnings);
1596 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1597 ss_buffer ("\t\0", 2));
1599 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1600 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1603 if (++*warning_cnt <= max_warnings)
1604 sys_warn (r, _("Variable map refers to unknown variable %s."),
1605 ss_data (short_name_ss));
1609 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1610 *value = ss_data (value_ss);
1618 /* Displays a corruption message. */
1620 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1625 ds_init_empty (&text);
1626 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1627 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1628 ds_put_vformat (&text, format, args);
1630 m.category = msg_class_to_category (class);
1631 m.severity = msg_class_to_severity (class);
1632 m.where.file_name = NULL;
1633 m.where.line_number = 0;
1634 m.text = ds_cstr (&text);
1639 /* Displays a warning for the current file position. */
1641 sys_warn (struct sfm_reader *r, const char *format, ...)
1645 va_start (args, format);
1646 sys_msg (r, MW, format, args);
1650 /* Displays an error for the current file position,
1651 marks it as in an error state,
1652 and aborts reading it using longjmp. */
1654 sys_error (struct sfm_reader *r, const char *format, ...)
1658 va_start (args, format);
1659 sys_msg (r, ME, format, args);
1663 longjmp (r->bail_out, 1);
1666 /* Reads BYTE_CNT bytes into BUF.
1667 Returns true if exactly BYTE_CNT bytes are successfully read.
1668 Aborts if an I/O error or a partial read occurs.
1669 If EOF_IS_OK, then an immediate end-of-file causes false to be
1670 returned; otherwise, immediate end-of-file causes an abort
1673 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1674 void *buf, size_t byte_cnt)
1676 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1677 if (bytes_read == byte_cnt)
1679 else if (ferror (r->file))
1680 sys_error (r, _("System error: %s."), strerror (errno));
1681 else if (!eof_is_ok || bytes_read != 0)
1682 sys_error (r, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   Aborts through sys_error() on an I/O error or if end of file
   is encountered at any point, so the result of the underlying
   read needs no checking. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  (void) read_bytes_internal (r, false, buf, byte_cnt);
}
/* Attempts to read BYTE_CNT bytes from R into BUF.
   Returns true if exactly BYTE_CNT bytes are successfully read,
   false if end of file is encountered before any byte is read.
   Aborts through sys_error() if an I/O error or a partial read
   occurs. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  bool got_all = read_bytes_internal (r, true, buf, byte_cnt);

  return got_all;
}
1705 /* Reads a 32-bit signed integer from R and returns its value in
1708 read_int (struct sfm_reader *r)
1711 read_bytes (r, integer, sizeof integer);
1712 return integer_get (r->integer_format, integer, sizeof integer);
1715 /* Reads a 64-bit floating-point number from R and returns its
1716 value in host format. */
1718 read_float (struct sfm_reader *r)
1721 read_bytes (r, number, sizeof number);
1722 return float_get_double (r->float_format, number);
/* Fills BUFFER, which has room for SIZE bytes, with a
   null-terminated string: exactly SIZE - 1 bytes are read from R
   and a null byte is stored into BUFFER[SIZE - 1].
   SIZE must be at least 1. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t payload_len;

  assert (size > 0);
  payload_len = size - 1;
  read_bytes (r, buffer, payload_len);
  buffer[payload_len] = '\0';
}
/* Skips BYTES bytes forward in R by reading them, at most 1024
   at a time, into a scratch buffer that is discarded.
   Aborts through sys_error() if the file ends first. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t remaining;

  for (remaining = bytes; remaining > 0; )
    {
      size_t chunk = MIN (sizeof scratch, remaining);

      read_bytes (r, scratch, chunk);
      remaining -= chunk;
    }
}
1748 static const struct casereader_class sys_file_casereader_class =
1750 sys_file_casereader_read,
1751 sys_file_casereader_destroy,