1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/assertion.h>
29 #include <libpspp/message.h>
30 #include <libpspp/compiler.h>
31 #include <libpspp/misc.h>
32 #include <libpspp/pool.h>
33 #include <libpspp/str.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/array.h>
37 #include <data/case.h>
38 #include <data/casereader-provider.h>
39 #include <data/casereader.h>
40 #include <data/dictionary.h>
41 #include <data/file-handle-def.h>
42 #include <data/file-name.h>
43 #include <data/format.h>
44 #include <data/missing-values.h>
45 #include <data/short-names.h>
46 #include <data/value-labels.h>
47 #include <data/variable.h>
48 #include <data/value.h>
53 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 struct fh_lock *lock; /* Mutual exclusion for file handle. */
71 FILE *file; /* File stream. */
72 bool error; /* I/O or corruption error? */
73 size_t value_cnt; /* Number of "union value"s in struct case. */
76 enum integer_format integer_format; /* On-disk integer format. */
77 enum float_format float_format; /* On-disk floating point format. */
78 int oct_cnt; /* Number of 8-byte units per case. */
79 struct sfm_var *sfm_vars; /* Variables. */
80 size_t sfm_var_cnt; /* Number of variables. */
81 casenumber case_cnt; /* Number of cases */
82 bool has_long_var_names; /* File has a long variable name map */
85 bool compressed; /* File is compressed? */
86 double bias; /* Compression bias, usually 100.0. */
87 uint8_t opcodes[8]; /* Current block of opcodes. */
88 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
89 bool corruption_warning; /* Warned about possible corruption? */
92 static const struct casereader_class sys_file_casereader_class;
94 static bool close_reader (struct sfm_reader *);
96 static struct variable **make_var_by_value_idx (struct sfm_reader *,
98 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
102 static void sys_warn (struct sfm_reader *, const char *, ...)
103 PRINTF_FORMAT (2, 3);
105 static void sys_error (struct sfm_reader *, const char *, ...)
109 static void read_bytes (struct sfm_reader *, void *, size_t);
110 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
111 static int read_int (struct sfm_reader *);
112 static double read_float (struct sfm_reader *);
113 static void read_string (struct sfm_reader *, char *, size_t);
114 static void skip_bytes (struct sfm_reader *, size_t);
116 static struct variable_to_value_map *open_variable_to_value_map (
117 struct sfm_reader *, size_t size);
118 static void close_variable_to_value_map (struct sfm_reader *r,
119 struct variable_to_value_map *);
120 static bool read_variable_to_value_map (struct sfm_reader *,
122 struct variable_to_value_map *,
123 struct variable **var, char **value,
126 static bool close_reader (struct sfm_reader *r);
128 /* Dictionary reader. */
136 static void read_header (struct sfm_reader *, struct dictionary *,
137 int *weight_idx, int *claimed_oct_cnt,
138 struct sfm_read_info *);
139 static void read_variable_record (struct sfm_reader *, struct dictionary *,
140 int *format_warning_cnt);
141 static void parse_format_spec (struct sfm_reader *, unsigned int,
142 enum which_format, struct variable *,
143 int *format_warning_cnt);
144 static void setup_weight (struct sfm_reader *, int weight_idx,
145 struct variable **var_by_value_idx,
146 struct dictionary *);
147 static void read_documents (struct sfm_reader *, struct dictionary *);
148 static void read_value_labels (struct sfm_reader *, struct dictionary *,
149 struct variable **var_by_value_idx);
151 static void read_extension_record (struct sfm_reader *, struct dictionary *,
152 struct sfm_read_info *);
153 static void read_machine_integer_info (struct sfm_reader *,
154 size_t size, size_t count,
155 struct sfm_read_info *);
156 static void read_machine_float_info (struct sfm_reader *,
157 size_t size, size_t count);
158 static void read_display_parameters (struct sfm_reader *,
159 size_t size, size_t count,
160 struct dictionary *);
161 static void read_long_var_name_map (struct sfm_reader *,
162 size_t size, size_t count,
163 struct dictionary *);
164 static void read_long_string_map (struct sfm_reader *,
165 size_t size, size_t count,
166 struct dictionary *);
169 /* Opens the system file designated by file handle FH for
170 reading. Reads the system file's dictionary into *DICT.
171 If INFO is non-null, then it receives additional info about the
174 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
175 struct sfm_read_info *volatile info)
177 struct sfm_reader *volatile r = NULL;
178 struct variable **var_by_value_idx;
179 struct sfm_read_info local_info;
180 int format_warning_cnt = 0;
185 *dict = dict_create ();
187 /* Create and initialize reader. */
188 r = pool_create_container (struct sfm_reader, pool);
194 r->has_long_var_names = false;
195 r->opcode_idx = sizeof r->opcodes;
196 r->corruption_warning = false;
198 /* TRANSLATORS: this fragment will be interpolated into
199 messages in fh_lock() that identify types of files. */
200 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
204 r->file = fn_open (fh_get_file_name (fh), "rb");
207 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
208 fh_get_file_name (r->fh), strerror (errno));
212 /* Initialize info. */
215 memset (info, 0, sizeof *info);
217 if (setjmp (r->bail_out))
222 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
224 /* Read all the variable definition records. */
225 rec_type = read_int (r);
226 while (rec_type == 2)
228 read_variable_record (r, *dict, &format_warning_cnt);
229 rec_type = read_int (r);
232 /* Figure out the case format. */
233 var_by_value_idx = make_var_by_value_idx (r, *dict);
234 setup_weight (r, weight_idx, var_by_value_idx, *dict);
236 /* Read all the rest of the dictionary records. */
237 while (rec_type != 999)
242 read_value_labels (r, *dict, var_by_value_idx);
246 sys_error (r, _("Misplaced type 4 record."));
249 read_documents (r, *dict);
253 read_extension_record (r, *dict, info);
257 sys_error (r, _("Unrecognized record type %d."), rec_type);
259 rec_type = read_int (r);
263 if ( ! r->has_long_var_names )
266 for (i = 0; i < dict_get_var_cnt (*dict); i++)
268 struct variable *var = dict_get_var (*dict, i);
269 char short_name[SHORT_NAME_LEN + 1];
270 char long_name[SHORT_NAME_LEN + 1];
272 strcpy (short_name, var_get_name (var));
274 strcpy (long_name, short_name);
275 str_lowercase (long_name);
277 /* Set long name. Renaming a variable may clear the short
278 name, but we want to retain it, so re-set it
280 dict_rename_var (*dict, var, long_name);
281 var_set_short_name (var, 0, short_name);
284 r->has_long_var_names = true;
287 /* Read record 999 data, which is just filler. */
290 /* Warn if the actual amount of data per case differs from the
291 amount that the header claims. SPSS version 13 gets this
292 wrong when very long strings are involved, so don't warn in
294 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
295 && info->version_major != 13)
296 sys_warn (r, _("File header claims %d variable positions but "
297 "%d were read from file."),
298 claimed_oct_cnt, r->oct_cnt);
300 /* Create an index of dictionary variable widths for
301 sfm_read_case to use. We cannot use the `struct variable's
302 from the dictionary we created, because the caller owns the
303 dictionary and may destroy or modify its variables. */
304 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
305 pool_register (r->pool, free, r->sfm_vars);
307 pool_free (r->pool, var_by_value_idx);
308 r->value_cnt = dict_get_next_value_idx (*dict);
309 return casereader_create_sequential
311 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
312 &sys_file_casereader_class, r);
316 dict_destroy (*dict);
321 /* Closes a system file after we're done with it.
322 Returns true if an I/O error has occurred on READER, false
325 close_reader (struct sfm_reader *r)
334 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
336 msg (ME, _("Error closing system file \"%s\": %s."),
337 fh_get_file_name (r->fh), strerror (errno));
347 pool_destroy (r->pool);
352 /* Destroys READER. */
354 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
356 struct sfm_reader *r = r_;
/* Returns true if FILE is an SPSS system file,
   false otherwise.

   Consumes the first four bytes at FILE's current position and
   compares them with the "$FL2" signature.  A stream that cannot
   supply four bytes is reported as not being a system file. */
bool
sfm_detect (FILE *file)
{
  static const char magic[] = "$FL2";
  char signature[sizeof magic];

  if (fread (signature, sizeof magic - 1, 1, file) != 1)
    return false;

  /* Terminate what was read so that it can be compared as a string. */
  signature[sizeof magic - 1] = '\0';

  return strcmp (magic, signature) == 0;
}
374 /* Reads the global header of the system file.
375 Sets DICT's file label to the system file's label.
376 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
377 or to the value index of the weight variable otherwise.
378 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
379 per case that the file claims to have (although it is not
381 Initializes INFO with header information. */
383 read_header (struct sfm_reader *r, struct dictionary *dict,
384 int *weight_idx, int *claimed_oct_cnt,
385 struct sfm_read_info *info)
388 char eye_catcher[61];
389 uint8_t raw_layout_code[4];
391 char creation_date[10];
392 char creation_time[9];
394 struct substring file_label_ss;
395 struct substring product;
397 read_string (r, rec_type, sizeof rec_type);
398 read_string (r, eye_catcher, sizeof eye_catcher);
400 if (strcmp ("$FL2", rec_type) != 0)
401 sys_error (r, _("This is not an SPSS system file."));
403 /* Identify integer format. */
404 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
405 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
407 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
409 || (r->integer_format != INTEGER_MSB_FIRST
410 && r->integer_format != INTEGER_LSB_FIRST))
411 sys_error (r, _("This is not an SPSS system file."));
413 *claimed_oct_cnt = read_int (r);
414 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
415 *claimed_oct_cnt = -1;
417 r->compressed = read_int (r) != 0;
419 *weight_idx = read_int (r);
421 r->case_cnt = read_int (r);
422 if ( r->case_cnt > INT_MAX / 2)
426 /* Identify floating-point format and obtain compression bias. */
427 read_bytes (r, raw_bias, sizeof raw_bias);
428 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
430 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
432 if (memcmp (raw_bias, zero_bias, 8))
433 sys_warn (r, _("Compression bias is not the usual "
434 "value of 100, or system file uses unrecognized "
435 "floating-point format."));
438 /* Some software is known to write all-zeros to this
439 field. Such software also writes floating-point
440 numbers in the format that we expect by default
441 (it seems that all software most likely does, in
442 reality), so don't warn in this case. */
445 if (r->integer_format == INTEGER_MSB_FIRST)
446 r->float_format = FLOAT_IEEE_DOUBLE_BE;
448 r->float_format = FLOAT_IEEE_DOUBLE_LE;
450 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
452 read_string (r, creation_date, sizeof creation_date);
453 read_string (r, creation_time, sizeof creation_time);
454 read_string (r, file_label, sizeof file_label);
457 file_label_ss = ss_cstr (file_label);
458 ss_trim (&file_label_ss, ss_cstr (" "));
459 if (!ss_is_empty (file_label_ss))
461 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
462 dict_set_label (dict, ss_data (file_label_ss));
465 strcpy (info->creation_date, creation_date);
466 strcpy (info->creation_time, creation_time);
467 info->integer_format = r->integer_format;
468 info->float_format = r->float_format;
469 info->compressed = r->compressed;
470 info->case_cnt = r->case_cnt;
472 product = ss_cstr (eye_catcher);
473 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
474 ss_trim (&product, ss_cstr (" "));
475 str_copy_buf_trunc (info->product, sizeof info->product,
476 ss_data (product), ss_length (product));
479 /* Reads a variable (type 2) record from R and adds the
480 corresponding variable to DICT.
481 Also skips past additional variable records for long string
484 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
485 int *format_warning_cnt)
488 int has_variable_label;
489 int missing_value_code;
494 struct variable *var;
497 width = read_int (r);
498 has_variable_label = read_int (r);
499 missing_value_code = read_int (r);
500 print_format = read_int (r);
501 write_format = read_int (r);
502 read_string (r, name, sizeof name);
503 name[strcspn (name, " ")] = '\0';
505 /* Check variable name. */
506 if (name[0] == '$' || name[0] == '#')
507 sys_error (r, "Variable name begins with invalid character `%c'.",
509 if (!var_is_plausible_name (name, false))
510 sys_error (r, _("Invalid variable name `%s'."), name);
512 /* Create variable. */
513 if (width < 0 || width > 255)
514 sys_error (r, _("Bad variable width %d."), width);
515 var = dict_create_var (dict, name, width);
518 _("Duplicate variable name `%s' within system file."),
521 /* Set the short name the same as the long name. */
522 var_set_short_name (var, 0, var_get_name (var));
524 /* Get variable label, if any. */
525 if (has_variable_label != 0 && has_variable_label != 1)
526 sys_error (r, _("Variable label indicator field is not 0 or 1."));
527 if (has_variable_label == 1)
533 if (len >= sizeof label)
534 sys_error (r, _("Variable %s has label of invalid length %zu."),
536 read_string (r, label, len + 1);
537 var_set_label (var, label);
539 skip_bytes (r, ROUND_UP (len, 4) - len);
542 /* Set missing values. */
543 if (missing_value_code != 0)
545 struct missing_values mv;
548 mv_init (&mv, var_get_width (var));
549 if (var_is_numeric (var))
551 if (missing_value_code < -3 || missing_value_code > 3
552 || missing_value_code == -1)
553 sys_error (r, _("Numeric missing value indicator field is not "
554 "-3, -2, 0, 1, 2, or 3."));
555 if (missing_value_code < 0)
557 double low = read_float (r);
558 double high = read_float (r);
559 mv_add_range (&mv, low, high);
560 missing_value_code = -missing_value_code - 2;
562 for (i = 0; i < missing_value_code; i++)
563 mv_add_num (&mv, read_float (r));
567 if (missing_value_code < 1 || missing_value_code > 3)
568 sys_error (r, _("String missing value indicator field is not "
570 if (var_is_long_string (var))
571 sys_warn (r, _("Ignoring missing values on long string variable "
572 "%s, which PSPP does not yet support."), name);
573 for (i = 0; i < missing_value_code; i++)
576 read_string (r, string, sizeof string);
577 mv_add_str (&mv, string);
580 if (!var_is_long_string (var))
581 var_set_missing_values (var, &mv);
585 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
586 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
588 /* Account for values.
589 Skip long string continuation records, if any. */
590 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
596 for (i = 1; i < nv; i++)
598 /* Check for record type 2 and width -1. */
599 if (read_int (r) != 2 || read_int (r) != -1)
600 sys_error (r, _("Missing string continuation record."));
602 /* Skip and ignore remaining continuation data. */
603 has_variable_label = read_int (r);
604 missing_value_code = read_int (r);
605 print_format = read_int (r);
606 write_format = read_int (r);
607 read_string (r, name, sizeof name);
609 /* Variable label fields on continuation records have
610 been spotted in system files created by "SPSS Power
611 Macintosh Release 6.1". */
612 if (has_variable_label)
613 skip_bytes (r, ROUND_UP (read_int (r), 4));
618 /* Translates the format spec from sysfile format to internal
621 parse_format_spec (struct sfm_reader *r, unsigned int s,
622 enum which_format which, struct variable *v,
623 int *format_warning_cnt)
625 const int max_format_warnings = 8;
627 uint8_t raw_type = s >> 16;
633 if (!fmt_from_io (raw_type, &f.type))
634 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
639 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
644 if (which == PRINT_FORMAT)
645 var_set_print_format (v, &f);
647 var_set_write_format (v, &f);
649 else if (*++format_warning_cnt <= max_format_warnings)
651 char fmt_string[FMT_STRING_LEN_MAX + 1];
652 sys_warn (r, _("%s variable %s has invalid %s format %s."),
653 var_is_numeric (v) ? _("Numeric") : _("String"),
655 which == PRINT_FORMAT ? _("print") : _("write"),
656 fmt_to_string (&f, fmt_string));
658 if (*format_warning_cnt == max_format_warnings)
659 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable that corresponds to WEIGHT_IDX, as resolved
   through VAR_BY_VALUE_IDX, the weighting variable of DICT.

   A WEIGHT_IDX of zero means that the file is unweighted, and
   DICT is left untouched.  A non-numeric weighting variable is
   reported through sys_error(). */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *candidate;

  /* Nothing to do for an unweighted file. */
  if (weight_idx == 0)
    return;

  candidate = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (!var_is_numeric (candidate))
    sys_error (r, _("Weighting variable must be numeric."));
  else
    dict_set_weight (dict, candidate);
}
681 /* Reads a document record, type 6, from system file R, and sets up
682 the documents and n_documents fields in the associated
685 read_documents (struct sfm_reader *r, struct dictionary *dict)
690 if (dict_get_documents (dict) != NULL)
691 sys_error (r, _("Multiple type 6 (document) records."));
693 line_cnt = read_int (r);
695 sys_error (r, _("Number of document lines (%d) "
696 "must be greater than 0."), line_cnt);
698 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
699 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
700 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
701 dict_set_documents (dict, documents);
703 sys_error (r, _("Document line contains null byte."));
704 pool_free (r->pool, documents);
707 /* Read a type 7 extension record. */
709 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
710 struct sfm_read_info *info)
712 int subtype = read_int (r);
713 size_t size = read_int (r);
714 size_t count = read_int (r);
715 size_t bytes = size * count;
717 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
718 allows an extra byte for a null terminator, used by some
719 extension processing routines. */
720 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
721 sys_error (r, "Record type 7 subtype %d too large.", subtype);
726 read_machine_integer_info (r, size, count, info);
730 read_machine_float_info (r, size, count);
734 /* Variable sets information. We don't use these yet.
735 They only apply to GUIs; see VARSETS on the APPLY
736 DICTIONARY command in SPSS documentation. */
740 /* DATE variable information. We don't use it yet, but we
745 /* Used by the MRSETS command. */
749 /* Used by the SPSS Data Entry software. */
753 read_display_parameters (r, size, count, dict);
757 read_long_var_name_map (r, size, count, dict);
761 read_long_string_map (r, size, count, dict);
765 /* New in SPSS v14? Unknown purpose. */
769 /* Text field that defines variable attributes. New in
774 /* New in SPSS 16. Contains a single string that describes
775 the character encoding, e.g. "windows-1252". */
779 /* New in SPSS 16. Encodes value labels for long string
781 sys_warn (r, _("Ignoring value labels for long string variables, "
782 "which PSPP does not yet support."));
786 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"),
787 subtype, PACKAGE_BUGREPORT);
791 skip_bytes (r, bytes);
794 /* Read record type 7, subtype 3. */
796 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
797 struct sfm_read_info *info)
799 int version_major = read_int (r);
800 int version_minor = read_int (r);
801 int version_revision = read_int (r);
802 int machine_code UNUSED = read_int (r);
803 int float_representation = read_int (r);
804 int compression_code UNUSED = read_int (r);
805 int integer_representation = read_int (r);
806 int character_code UNUSED = read_int (r);
808 int expected_float_format;
809 int expected_integer_format;
811 if (size != 4 || count != 8)
812 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
816 /* Save version info. */
817 info->version_major = version_major;
818 info->version_minor = version_minor;
819 info->version_revision = version_revision;
821 /* Check floating point format. */
822 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
823 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
824 expected_float_format = 1;
825 else if (r->float_format == FLOAT_Z_LONG)
826 expected_float_format = 2;
827 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
828 expected_float_format = 3;
831 if (float_representation != expected_float_format)
832 sys_error (r, _("Floating-point representation indicated by "
833 "system file (%d) differs from expected (%d)."),
834 r->float_format, expected_float_format);
836 /* Check integer format. */
837 if (r->integer_format == INTEGER_MSB_FIRST)
838 expected_integer_format = 1;
839 else if (r->integer_format == INTEGER_LSB_FIRST)
840 expected_integer_format = 2;
843 if (integer_representation != expected_integer_format)
845 static const char *const endian[] = {N_("little-endian"), N_("big-endian")};
846 sys_warn (r, _("Integer format indicated by system file (%s) "
847 "differs from expected (%s)."),
848 gettext (endian[integer_representation == 1]),
849 gettext (endian[expected_integer_format == 1]));
853 /* Read record type 7, subtype 4. */
855 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
857 double sysmis = read_float (r);
858 double highest = read_float (r);
859 double lowest = read_float (r);
861 if (size != 8 || count != 3)
862 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
865 if (sysmis != SYSMIS)
866 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
867 if (highest != HIGHEST)
868 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
869 if (lowest != LOWEST)
870 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
873 /* Read record type 7, subtype 11, which specifies how variables
874 should be displayed in GUI environments. */
876 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
877 struct dictionary *dict)
886 sys_warn (r, _("Bad size %zu on extension 11."), size);
887 skip_bytes (r, size * count);
891 n_vars = dict_get_var_cnt (dict);
892 if (count == 3 * n_vars)
893 includes_width = true;
894 else if (count == 2 * n_vars)
895 includes_width = false;
898 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
900 skip_bytes (r, size * count);
904 for (i = 0; i < n_vars; ++i)
906 struct variable *v = dict_get_var (dict, i);
907 int measure = read_int (r);
908 int width = includes_width ? read_int (r) : 0;
909 int align = read_int (r);
911 /* SPSS 14 sometimes seems to set string variables' measure
913 if (0 == measure && var_is_alpha (v))
916 if (measure < 1 || measure > 3 || align < 0 || align > 2)
919 sys_warn (r, _("Invalid variable display parameters "
920 "for variable %zu (%s). "
921 "Default parameters substituted."),
922 i, var_get_name (v));
927 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
928 : measure == 2 ? MEASURE_ORDINAL
930 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
931 : align == 1 ? ALIGN_RIGHT
934 /* Older versions (SPSS 9.0) sometimes set the display
935 width to zero. This causes confusion in the GUI, so
936 only set the width if it is nonzero. */
938 var_set_display_width (v, width);
942 /* Reads record type 7, subtype 13, which gives the long name
943 that corresponds to each short name. Modifies variable names
944 in DICT accordingly. */
946 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
947 struct dictionary *dict)
949 struct variable_to_value_map *map;
950 struct variable *var;
954 map = open_variable_to_value_map (r, size * count);
955 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
959 size_t short_name_cnt;
962 /* Validate long name. */
963 if (!var_is_valid_name (long_name, false))
965 sys_warn (r, _("Long variable mapping from %s to invalid "
966 "variable name `%s'."),
967 var_get_name (var), long_name);
971 /* Identify any duplicates. */
972 if (strcasecmp (var_get_short_name (var, 0), long_name)
973 && dict_lookup_var (dict, long_name) != NULL)
975 sys_warn (r, _("Duplicate long variable name `%s' "
976 "within system file."), long_name);
980 /* Renaming a variable may clear its short names, but we
981 want to retain them, so we save them and re-set them
983 short_name_cnt = var_get_short_name_cnt (var);
984 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
985 for (i = 0; i < short_name_cnt; i++)
987 const char *s = var_get_short_name (var, i);
988 short_names[i] = s != NULL ? xstrdup (s) : NULL;
992 dict_rename_var (dict, var, long_name);
994 /* Restore short names. */
995 for (i = 0; i < short_name_cnt; i++)
997 var_set_short_name (var, i, short_names[i]);
998 free (short_names[i]);
1002 close_variable_to_value_map (r, map);
1003 r->has_long_var_names = true;
1006 /* Reads record type 7, subtype 14, which gives the real length
1007 of each very long string. Rearranges DICT accordingly. */
1009 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1010 struct dictionary *dict)
1012 struct variable_to_value_map *map;
1013 struct variable *var;
1015 int warning_cnt = 0;
1017 map = open_variable_to_value_map (r, size * count);
1018 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
1021 size_t idx = var_get_dict_index (var);
1027 length = strtol (length_s, NULL, 10);
1028 if (length < 1 || length > MAX_STRING)
1030 sys_warn (r, _("%s listed as string of invalid length %s "
1031 "in very length string record."),
1032 var_get_name (var), length_s);
1036 /* Check segments. */
1037 segment_cnt = sfm_width_to_segments (length);
1038 if (segment_cnt == 1)
1040 sys_warn (r, _("%s listed in very long string record with width %s, "
1041 "which requires only one segment."),
1042 var_get_name (var), length_s);
1045 if (idx + segment_cnt > dict_get_var_cnt (dict))
1046 sys_error (r, _("Very long string %s overflows dictionary."),
1047 var_get_name (var));
1049 /* Get the short names from the segments and check their
1051 for (i = 0; i < segment_cnt; i++)
1053 struct variable *seg = dict_get_var (dict, idx + i);
1054 int alloc_width = sfm_segment_alloc_width (length, i);
1055 int width = var_get_width (seg);
1058 var_set_short_name (var, i, var_get_short_name (seg, 0));
1059 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1060 sys_error (r, _("Very long string with width %ld has segment %d "
1061 "of width %d (expected %d)"),
1062 length, i, width, alloc_width);
1064 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1065 var_set_width (var, length);
1067 close_variable_to_value_map (r, map);
1068 dict_compact_values (dict);
/* Reads value labels from sysfile R and inserts them into the
   associated dictionary DICT. */
1074 read_value_labels (struct sfm_reader *r,
1075 struct dictionary *dict, struct variable **var_by_value_idx)
1077 struct pool *subpool;
1081 char raw_value[8]; /* Value as uninterpreted bytes. */
1082 union value value; /* Value. */
1083 char *label; /* Null-terminated label string. */
1086 struct label *labels = NULL;
1087 int label_cnt; /* Number of labels. */
1089 struct variable **var = NULL; /* Associated variables. */
1090 int var_cnt; /* Number of associated variables. */
1094 subpool = pool_create_subpool (r->pool);
1096 /* Read the type 3 record and record its contents. We can't do
1097 much with the data yet because we don't know whether it is
1098 of numeric or string type. */
1100 /* Read number of labels. */
1101 label_cnt = read_int (r);
1103 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1105 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1110 /* Read each value/label tuple into labels[]. */
1111 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1112 for (i = 0; i < label_cnt; i++)
1114 struct label *label = labels + i;
1115 unsigned char label_len;
1119 read_bytes (r, label->raw_value, sizeof label->raw_value);
1121 /* Read label length. */
1122 read_bytes (r, &label_len, sizeof label_len);
1123 padded_len = ROUND_UP (label_len + 1, 8);
1125 /* Read label, padding. */
1126 label->label = pool_alloc (subpool, padded_len + 1);
1127 read_bytes (r, label->label, padded_len - 1);
1128 label->label[label_len] = 0;
1131 /* Now, read the type 4 record that has the list of variables
1132 to which the value labels are to be applied. */
1134 /* Read record type of type 4 record. */
1135 if (read_int (r) != 4)
1136 sys_error (r, _("Variable index record (type 4) does not immediately "
1137 "follow value label record (type 3) as it should."));
1139 /* Read number of variables associated with value label from type 4
1141 var_cnt = read_int (r);
1142 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1143 sys_error (r, _("Number of variables associated with a value label (%d) "
1144 "is not between 1 and the number of variables (%zu)."),
1145 var_cnt, dict_get_var_cnt (dict));
1147 /* Read the list of variables. */
1148 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1149 for (i = 0; i < var_cnt; i++)
1151 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1152 if (var_is_long_string (var[i]))
1153 sys_error (r, _("Value labels are not allowed on long string "
1154 "variables (%s)."), var_get_name (var[i]));
1157 /* Type check the variables. */
1158 for (i = 1; i < var_cnt; i++)
1159 if (var_get_type (var[i]) != var_get_type (var[0]))
1160 sys_error (r, _("Variables associated with value label are not all of "
1161 "identical type. Variable %s is %s, but variable "
1163 var_get_name (var[0]),
1164 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1165 var_get_name (var[i]),
1166 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1168 /* Fill in labels[].value, now that we know the desired type. */
1169 for (i = 0; i < label_cnt; i++)
1171 struct label *label = labels + i;
1173 if (var_is_alpha (var[0]))
1174 buf_copy_rpad (label->value.s, sizeof label->value.s,
1175 label->raw_value, sizeof label->raw_value);
1177 label->value.f = float_get_double (r->float_format, label->raw_value);
1180 /* Assign the `value_label's to each variable. */
1181 for (i = 0; i < var_cnt; i++)
1183 struct variable *v = var[i];
1186 /* Add each label to the variable. */
1187 for (j = 0; j < label_cnt; j++)
1189 struct label *label = &labels[j];
1190 if (!var_add_value_label (v, &label->value, label->label))
1192 if (var_is_numeric (var[0]))
1193 sys_warn (r, _("Duplicate value label for %g on %s."),
1194 label->value.f, var_get_name (v));
1196 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1197 var_get_width (v), label->value.s,
1203 pool_destroy (subpool);
1208 static void partial_record (struct sfm_reader *r)
1211 static void read_error (struct casereader *, const struct sfm_reader *);
1213 static bool read_case_number (struct sfm_reader *, double *);
1214 static bool read_case_string (struct sfm_reader *, char *, size_t);
1215 static int read_opcode (struct sfm_reader *);
1216 static bool read_compressed_number (struct sfm_reader *, double *);
1217 static bool read_compressed_string (struct sfm_reader *, char *);
1218 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1219 static bool skip_whole_strings (struct sfm_reader *, size_t);
1221 /* Reads one case from READER's file into C. Returns true only
1224 sys_file_casereader_read (struct casereader *reader, void *r_,
1227 struct sfm_reader *r = r_;
1233 case_create (c, r->value_cnt);
1234 if (setjmp (r->bail_out))
1236 casereader_force_error (reader);
1241 for (i = 0; i < r->sfm_var_cnt; i++)
1243 struct sfm_var *sv = &r->sfm_vars[i];
1244 union value *v = case_data_rw_idx (c, sv->case_index);
1248 if (!read_case_number (r, &v->f))
1253 if (!read_case_string (r, v->s + sv->offset, sv->width))
1255 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1265 if (r->case_cnt != -1)
1266 read_error (reader, r);
/* Reports, through sys_error(), that R's file ends in the middle
   of a case. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1277 /* Issues an error that an unspecified error occurred SFM, and
1280 read_error (struct casereader *r, const struct sfm_reader *sfm)
1282 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1283 casereader_force_error (r);
1286 /* Reads a number from R and stores its value in *D.
1287 If R is compressed, reads a compressed number;
1288 otherwise, reads a number in the regular way.
1289 Returns true if successful, false if end of file is
1290 reached immediately. */
1292 read_case_number (struct sfm_reader *r, double *d)
1297 if (!try_read_bytes (r, number, sizeof number))
1299 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1303 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.
   The file always stores strings in 8-byte segments, so when
   LENGTH is not a multiple of 8 a full final segment is read and
   only its first LENGTH % 8 bytes are copied into S.
   Reads compressed strings if R is compressed.
   Returns true if successful, false if end of file is reached
   immediately. */
static bool
read_case_string (struct sfm_reader *r, char *s, size_t length)
{
  size_t tail_len = length % 8;
  size_t body_len = length - tail_len;  /* LENGTH rounded down to 8. */

  if (body_len != 0 && !read_whole_strings (r, s, body_len))
    return false;

  if (tail_len != 0)
    {
      char last_segment[8];

      if (!read_whole_strings (r, last_segment, sizeof last_segment))
        {
          /* End of file after part of the string is a partial
             record, not a clean end of file. */
          if (body_len != 0)
            partial_record (r);
          return false;
        }
      memcpy (s + body_len, last_segment, tail_len);
    }

  return true;
}
1340 /* Reads and returns the next compression opcode from R. */
1342 read_opcode (struct sfm_reader *r)
1344 assert (r->compressed);
1348 if (r->opcode_idx >= sizeof r->opcodes)
1350 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1354 opcode = r->opcodes[r->opcode_idx++];
1361 /* Reads a compressed number from R and stores its value in D.
1362 Returns true if successful, false if end of file is
1363 reached immediately. */
1365 read_compressed_number (struct sfm_reader *r, double *d)
1367 int opcode = read_opcode (r);
1375 *d = read_float (r);
1379 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1380 if (!r->corruption_warning)
1382 r->corruption_warning = true;
1383 sys_warn (r, _("Possible compressed data corruption: "
1384 "compressed spaces appear in numeric field."));
1393 *d = opcode - r->bias;
1400 /* Reads a compressed 8-byte string segment from R and stores it
1402 Returns true if successful, false if end of file is
1403 reached immediately. */
1405 read_compressed_string (struct sfm_reader *r, char *dst)
1407 int opcode = read_opcode (r);
1415 read_bytes (r, dst, 8);
1419 memset (dst, ' ', 8);
1424 double value = opcode - r->bias;
1425 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1428 /* This has actually been seen "in the wild". The submitter of the
1429 file that showed that the contents decoded as spaces, but they
1430 were at the end of the field so it's possible that the null
1431 bytes just acted as null terminators. */
1433 else if (!r->corruption_warning)
1435 r->corruption_warning = true;
1436 sys_warn (r, _("Possible compressed data corruption: "
1437 "string contains compressed integer (opcode %d)"),
1447 /* Reads LENGTH string bytes from R into S.
1448 LENGTH must be a multiple of 8.
1449 Reads compressed strings if S is compressed.
1450 Returns true if successful, false if end of file is
1451 reached immediately. */
1453 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1455 assert (length % 8 == 0);
1457 return try_read_bytes (r, s, length);
1461 for (ofs = 0; ofs < length; ofs += 8)
1462 if (!read_compressed_string (r, s + ofs))
/* Reads and discards LENGTH string bytes from R.
   LENGTH must be a multiple of 8 and less than the size of the
   1024-byte scratch buffer; the limit exists only because the
   current caller never needs more than that.
   Returns true if successful, false if end of file is reached
   immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  char scratch[1024];

  assert (length < sizeof scratch);
  return read_whole_strings (r, scratch, length);
}
1486 /* Creates and returns a table that can be used for translating a value
1487 index into a case to a "struct variable *" for DICT. Multiple
1488 system file fields reference variables this way.
1490 This table must be created before processing the very long
1491 string extension record, because that record causes some
1492 values to be deleted from the case and the dictionary to be
1494 static struct variable **
1495 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1497 struct variable **var_by_value_idx;
1501 var_by_value_idx = pool_nmalloc (r->pool,
1502 r->oct_cnt, sizeof *var_by_value_idx);
1503 for (i = 0; i < dict_get_var_cnt (dict); i++)
1505 struct variable *v = dict_get_var (dict, i);
1506 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1509 var_by_value_idx[value_idx++] = v;
1510 for (j = 1; j < nv; j++)
1511 var_by_value_idx[value_idx++] = NULL;
1513 assert (value_idx == r->oct_cnt);
1515 return var_by_value_idx;
1518 /* Returns the "struct variable" corresponding to the given
1519 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
1521 static struct variable *
1522 lookup_var_by_value_idx (struct sfm_reader *r,
1523 struct variable **var_by_value_idx, int value_idx)
1525 struct variable *var;
1527 if (value_idx < 1 || value_idx > r->oct_cnt)
1528 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1529 value_idx, r->oct_cnt);
1531 var = var_by_value_idx[value_idx - 1];
1533 sys_error (r, _("Variable index %d refers to long string "
/* Returns the variable in D whose first short name equals
   SHORT_NAME, ignoring case, or a null pointer if there is
   none. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  struct variable *candidate;
  size_t n_vars;
  size_t idx;

  /* Fast path: look up by full name, which often succeeds. */
  candidate = dict_lookup_var (d, short_name);
  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
    return candidate;

  /* Fallback: scan the whole dictionary in order. */
  n_vars = dict_get_var_cnt (d);
  idx = 0;
  while (idx < n_vars)
    {
      candidate = dict_get_var (d, idx);
      if (strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
        return candidate;
      idx++;
    }

  return NULL;
}
1566 /* Helpers for reading records that contain "variable=value"
1570 struct variable_to_value_map
1572 struct substring buffer; /* Record contents. */
1573 size_t pos; /* Current position in buffer. */
1576 /* Reads SIZE bytes into a "variable=value" map for R,
1577 and returns the map. */
1578 static struct variable_to_value_map *
1579 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1581 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1582 char *buffer = pool_malloc (r->pool, size + 1);
1583 read_bytes (r, buffer, size);
1584 map->buffer = ss_buffer (buffer, size);
1589 /* Closes MAP and frees its storage.
1590 Not really needed, because the pool will free the map anyway,
1591 but can be used to free it earlier. */
1593 close_variable_to_value_map (struct sfm_reader *r,
1594 struct variable_to_value_map *map)
1596 pool_free (r->pool, ss_data (map->buffer));
1599 /* Reads the next variable=value pair from MAP.
1600 Looks up the variable in DICT and stores it into *VAR.
1601 Stores a null-terminated value into *VALUE. */
1603 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1604 struct variable_to_value_map *map,
1605 struct variable **var, char **value,
1608 int max_warnings = 5;
1612 struct substring short_name_ss, value_ss;
1614 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1615 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1618 if (*warning_cnt > max_warnings)
1619 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1620 *warning_cnt - max_warnings);
1624 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1625 ss_buffer ("\t\0", 2));
1627 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1628 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1631 if (++*warning_cnt <= max_warnings)
1632 sys_warn (r, _("Variable map refers to unknown variable %s."),
1633 ss_data (short_name_ss));
1637 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1638 *value = ss_data (value_ss);
1646 /* Displays a corruption message. */
1648 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1653 ds_init_empty (&text);
1654 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1655 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1656 ds_put_vformat (&text, format, args);
1658 m.category = msg_class_to_category (class);
1659 m.severity = msg_class_to_severity (class);
1660 m.where.file_name = NULL;
1661 m.where.line_number = 0;
1662 m.text = ds_cstr (&text);
1667 /* Displays a warning for the current file position. */
1669 sys_warn (struct sfm_reader *r, const char *format, ...)
1673 va_start (args, format);
1674 sys_msg (r, MW, format, args);
1678 /* Displays an error for the current file position,
1679 marks it as in an error state,
1680 and aborts reading it using longjmp. */
1682 sys_error (struct sfm_reader *r, const char *format, ...)
1686 va_start (args, format);
1687 sys_msg (r, ME, format, args);
1691 longjmp (r->bail_out, 1);
1694 /* Reads BYTE_CNT bytes into BUF.
1695 Returns true if exactly BYTE_CNT bytes are successfully read.
1696 Aborts if an I/O error or a partial read occurs.
1697 If EOF_IS_OK, then an immediate end-of-file causes false to be
1698 returned; otherwise, immediate end-of-file causes an abort
1701 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1702 void *buf, size_t byte_cnt)
1704 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1705 if (bytes_read == byte_cnt)
1707 else if (ferror (r->file))
1708 sys_error (r, _("System error: %s."), strerror (errno));
1709 else if (!eof_is_ok || bytes_read != 0)
1710 sys_error (r, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   Aborts upon I/O error or if end-of-file is encountered, even
   immediately. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = false;

  read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
/* Attempts to read BYTE_CNT bytes from R into BUF.
   Returns true if exactly BYTE_CNT bytes are successfully read,
   false if an immediate end-of-file is encountered.
   Aborts if an I/O error or a partial read occurs. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;

  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
1733 /* Reads a 32-bit signed integer from R and returns its value in
1736 read_int (struct sfm_reader *r)
1739 read_bytes (r, integer, sizeof integer);
1740 return integer_get (r->integer_format, integer, sizeof integer);
1743 /* Reads a 64-bit floating-point number from R and returns its
1744 value in host format. */
1746 read_float (struct sfm_reader *r)
1749 read_bytes (r, number, sizeof number);
1750 return float_get_double (r->float_format, number);
/* Reads exactly SIZE - 1 bytes from R into BUFFER
   and stores a null byte into BUFFER[SIZE - 1].
   SIZE must be positive. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t text_len;

  assert (size > 0);
  text_len = size - 1;
  read_bytes (r, buffer, text_len);
  buffer[text_len] = '\0';
}
/* Skips BYTES bytes forward in R by reading them, up to 1024 at
   a time, into a scratch buffer and discarding them. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t left;
  size_t chunk;

  for (left = bytes; left > 0; left -= chunk)
    {
      chunk = MIN (sizeof scratch, left);
      read_bytes (r, scratch, chunk);
    }
}
1776 static const struct casereader_class sys_file_casereader_class =
1778 sys_file_casereader_read,
1779 sys_file_casereader_destroy,