1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/assertion.h>
29 #include <libpspp/message.h>
30 #include <libpspp/compiler.h>
31 #include <libpspp/misc.h>
32 #include <libpspp/pool.h>
33 #include <libpspp/str.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/array.h>
37 #include <data/case.h>
38 #include <data/casereader-provider.h>
39 #include <data/casereader.h>
40 #include <data/dictionary.h>
41 #include <data/file-handle-def.h>
42 #include <data/file-name.h>
43 #include <data/format.h>
44 #include <data/missing-values.h>
45 #include <data/short-names.h>
46 #include <data/value-labels.h>
47 #include <data/variable.h>
48 #include <data/value.h>
53 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 struct fh_lock *lock; /* Mutual exclusion for file handle. */
71 FILE *file; /* File stream. */
72 bool error; /* I/O or corruption error? */
73 size_t value_cnt; /* Number of "union value"s in struct case. */
76 enum integer_format integer_format; /* On-disk integer format. */
77 enum float_format float_format; /* On-disk floating point format. */
78 int oct_cnt; /* Number of 8-byte units per case. */
79 struct sfm_var *sfm_vars; /* Variables. */
80 size_t sfm_var_cnt; /* Number of variables. */
81 casenumber case_cnt; /* Number of cases */
82 bool has_long_var_names; /* File has a long variable name map */
85 bool compressed; /* File is compressed? */
86 double bias; /* Compression bias, usually 100.0. */
87 uint8_t opcodes[8]; /* Current block of opcodes. */
88 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 static const struct casereader_class sys_file_casereader_class;
93 static bool close_reader (struct sfm_reader *);
95 static struct variable **make_var_by_value_idx (struct sfm_reader *,
97 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
101 static void sys_warn (struct sfm_reader *, const char *, ...)
102 PRINTF_FORMAT (2, 3);
104 static void sys_error (struct sfm_reader *, const char *, ...)
108 static void read_bytes (struct sfm_reader *, void *, size_t);
109 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
110 static int read_int (struct sfm_reader *);
111 static double read_float (struct sfm_reader *);
112 static void read_string (struct sfm_reader *, char *, size_t);
113 static void skip_bytes (struct sfm_reader *, size_t);
115 static struct variable_to_value_map *open_variable_to_value_map (
116 struct sfm_reader *, size_t size);
117 static void close_variable_to_value_map (struct sfm_reader *r,
118 struct variable_to_value_map *);
119 static bool read_variable_to_value_map (struct sfm_reader *,
121 struct variable_to_value_map *,
122 struct variable **var, char **value,
125 static bool close_reader (struct sfm_reader *r);
127 /* Dictionary reader. */
135 static void read_header (struct sfm_reader *, struct dictionary *,
136 int *weight_idx, int *claimed_oct_cnt,
137 struct sfm_read_info *);
138 static void read_variable_record (struct sfm_reader *, struct dictionary *,
139 int *format_warning_cnt);
140 static void parse_format_spec (struct sfm_reader *, unsigned int,
141 enum which_format, struct variable *,
142 int *format_warning_cnt);
143 static void setup_weight (struct sfm_reader *, int weight_idx,
144 struct variable **var_by_value_idx,
145 struct dictionary *);
146 static void read_documents (struct sfm_reader *, struct dictionary *);
147 static void read_value_labels (struct sfm_reader *, struct dictionary *,
148 struct variable **var_by_value_idx);
150 static void read_extension_record (struct sfm_reader *, struct dictionary *,
151 struct sfm_read_info *);
152 static void read_machine_integer_info (struct sfm_reader *,
153 size_t size, size_t count,
154 struct sfm_read_info *);
155 static void read_machine_float_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *volatile info)
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 struct sfm_read_info local_info;
179 int format_warning_cnt = 0;
184 *dict = dict_create ();
186 /* Create and initialize reader. */
187 r = pool_create_container (struct sfm_reader, pool);
193 r->has_long_var_names = false;
194 r->opcode_idx = sizeof r->opcodes;
196 /* TRANSLATORS: this fragment will be interpolated into
197 messages in fh_lock() that identify types of files. */
198 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
202 r->file = fn_open (fh_get_file_name (fh), "rb");
205 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
206 fh_get_file_name (r->fh), strerror (errno));
210 /* Initialize info. */
213 memset (info, 0, sizeof *info);
215 if (setjmp (r->bail_out))
220 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
222 /* Read all the variable definition records. */
223 rec_type = read_int (r);
224 while (rec_type == 2)
226 read_variable_record (r, *dict, &format_warning_cnt);
227 rec_type = read_int (r);
230 /* Figure out the case format. */
231 var_by_value_idx = make_var_by_value_idx (r, *dict);
232 setup_weight (r, weight_idx, var_by_value_idx, *dict);
234 /* Read all the rest of the dictionary records. */
235 while (rec_type != 999)
240 read_value_labels (r, *dict, var_by_value_idx);
244 sys_error (r, _("Misplaced type 4 record."));
247 read_documents (r, *dict);
251 read_extension_record (r, *dict, info);
255 sys_error (r, _("Unrecognized record type %d."), rec_type);
257 rec_type = read_int (r);
261 if ( ! r->has_long_var_names )
264 for (i = 0; i < dict_get_var_cnt (*dict); i++)
266 struct variable *var = dict_get_var (*dict, i);
267 char short_name[SHORT_NAME_LEN + 1];
268 char long_name[SHORT_NAME_LEN + 1];
270 strcpy (short_name, var_get_name (var));
272 strcpy (long_name, short_name);
273 str_lowercase (long_name);
275 /* Set long name. Renaming a variable may clear the short
276 name, but we want to retain it, so re-set it
278 dict_rename_var (*dict, var, long_name);
279 var_set_short_name (var, 0, short_name);
282 r->has_long_var_names = true;
285 /* Read record 999 data, which is just filler. */
288 /* Warn if the actual amount of data per case differs from the
289 amount that the header claims. SPSS version 13 gets this
290 wrong when very long strings are involved, so don't warn in
292 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
293 && info->version_major != 13)
294 sys_warn (r, _("File header claims %d variable positions but "
295 "%d were read from file."),
296 claimed_oct_cnt, r->oct_cnt);
298 /* Create an index of dictionary variable widths for
299 sfm_read_case to use. We cannot use the `struct variable's
300 from the dictionary we created, because the caller owns the
301 dictionary and may destroy or modify its variables. */
302 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
303 pool_register (r->pool, free, r->sfm_vars);
305 pool_free (r->pool, var_by_value_idx);
306 r->value_cnt = dict_get_next_value_idx (*dict);
307 return casereader_create_sequential
309 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
310 &sys_file_casereader_class, r);
314 dict_destroy (*dict);
319 /* Closes a system file after we're done with it.
320 Returns true if an I/O error has occurred on READER, false
323 close_reader (struct sfm_reader *r)
332 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
334 msg (ME, _("Error closing system file \"%s\": %s."),
335 fh_get_file_name (r->fh), strerror (errno));
345 pool_destroy (r->pool);
350 /* Destroys READER. */
352 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
354 struct sfm_reader *r = r_;
358 /* Returns true if FILE is an SPSS system file,
361 sfm_detect (FILE *file)
365 if (fread (rec_type, 4, 1, file) != 1)
369 return !strcmp ("$FL2", rec_type);
372 /* Reads the global header of the system file.
373 Sets DICT's file label to the system file's label.
374 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
375 or to the value index of the weight variable otherwise.
376 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
377 per case that the file claims to have (although it is not
379 Initializes INFO with header information. */
381 read_header (struct sfm_reader *r, struct dictionary *dict,
382 int *weight_idx, int *claimed_oct_cnt,
383 struct sfm_read_info *info)
386 char eye_catcher[61];
387 uint8_t raw_layout_code[4];
389 char creation_date[10];
390 char creation_time[9];
392 struct substring file_label_ss;
393 struct substring product;
395 read_string (r, rec_type, sizeof rec_type);
396 read_string (r, eye_catcher, sizeof eye_catcher);
398 if (strcmp ("$FL2", rec_type) != 0)
399 sys_error (r, _("This is not an SPSS system file."));
401 /* Identify integer format. */
402 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
403 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
405 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
407 || (r->integer_format != INTEGER_MSB_FIRST
408 && r->integer_format != INTEGER_LSB_FIRST))
409 sys_error (r, _("This is not an SPSS system file."));
411 *claimed_oct_cnt = read_int (r);
412 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
413 *claimed_oct_cnt = -1;
415 r->compressed = read_int (r) != 0;
417 *weight_idx = read_int (r);
419 r->case_cnt = read_int (r);
420 if ( r->case_cnt > INT_MAX / 2)
424 /* Identify floating-point format and obtain compression bias. */
425 read_bytes (r, raw_bias, sizeof raw_bias);
426 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
428 sys_warn (r, _("Compression bias (%g) is not the usual "
429 "value of 100, or system file uses unrecognized "
430 "floating-point format."),
432 if (r->integer_format == INTEGER_MSB_FIRST)
433 r->float_format = FLOAT_IEEE_DOUBLE_BE;
435 r->float_format = FLOAT_IEEE_DOUBLE_LE;
437 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
439 read_string (r, creation_date, sizeof creation_date);
440 read_string (r, creation_time, sizeof creation_time);
441 read_string (r, file_label, sizeof file_label);
444 file_label_ss = ss_cstr (file_label);
445 ss_trim (&file_label_ss, ss_cstr (" "));
446 if (!ss_is_empty (file_label_ss))
448 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
449 dict_set_label (dict, ss_data (file_label_ss));
452 strcpy (info->creation_date, creation_date);
453 strcpy (info->creation_time, creation_time);
454 info->integer_format = r->integer_format;
455 info->float_format = r->float_format;
456 info->compressed = r->compressed;
457 info->case_cnt = r->case_cnt;
459 product = ss_cstr (eye_catcher);
460 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
461 ss_trim (&product, ss_cstr (" "));
462 str_copy_buf_trunc (info->product, sizeof info->product,
463 ss_data (product), ss_length (product));
466 /* Reads a variable (type 2) record from R and adds the
467 corresponding variable to DICT.
468 Also skips past additional variable records for long string
471 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
472 int *format_warning_cnt)
/* Reads the fixed fields of one type 2 record (width, label flag,
   missing-value code, print and write formats, name), creates the
   variable in DICT, then applies the label, missing values and
   formats.  *FORMAT_WARNING_CNT is handed on to parse_format_spec(). */
475 int has_variable_label;
476 int missing_value_code;
481 struct variable *var;
484 width = read_int (r);
485 has_variable_label = read_int (r);
486 missing_value_code = read_int (r);
487 print_format = read_int (r);
488 write_format = read_int (r);
489 read_string (r, name, sizeof name);
490 name[strcspn (name, " ")] = '\0';
492 /* Check variable name. */
493 if (name[0] == '$' || name[0] == '#')
/* Marked with _() like every other diagnostic in this function so
   that the message is picked up for translation. */
494 sys_error (r, _("Variable name begins with invalid character `%c'."),
496 if (!var_is_plausible_name (name, false))
497 sys_error (r, _("Invalid variable name `%s'."), name);
499 /* Create variable. */
500 if (width < 0 || width > 255)
501 sys_error (r, _("Bad variable width %d."), width);
502 var = dict_create_var (dict, name, width);
505 _("Duplicate variable name `%s' within system file."),
508 /* Set the short name the same as the long name. */
509 var_set_short_name (var, 0, var_get_name (var));
511 /* Get variable label, if any. */
512 if (has_variable_label != 0 && has_variable_label != 1)
513 sys_error (r, _("Variable label indicator field is not 0 or 1."));
514 if (has_variable_label == 1)
520 if (len >= sizeof label)
521 sys_error (r, _("Variable %s has label of invalid length %zu."),
523 read_string (r, label, len + 1);
524 var_set_label (var, label);
526 skip_bytes (r, ROUND_UP (len, 4) - len);
529 /* Set missing values. */
530 if (missing_value_code != 0)
532 struct missing_values mv;
535 mv_init (&mv, var_get_width (var));
536 if (var_is_numeric (var))
538 if (missing_value_code < -3 || missing_value_code > 3
539 || missing_value_code == -1)
540 sys_error (r, _("Numeric missing value indicator field is not "
541 "-3, -2, 0, 1, 2, or 3."));
542 if (missing_value_code < 0)
544 double low = read_float (r);
545 double high = read_float (r);
546 mv_add_range (&mv, low, high);
/* A negative code means "range plus (-code - 2) discrete values". */
547 missing_value_code = -missing_value_code - 2;
549 for (i = 0; i < missing_value_code; i++)
550 mv_add_num (&mv, read_float (r));
554 if (missing_value_code < 1 || missing_value_code > 3)
555 sys_error (r, _("String missing value indicator field is not "
557 if (var_is_long_string (var))
558 sys_warn (r, _("Ignoring missing values on long string variable "
559 "%s, which PSPP does not yet support."), name);
560 for (i = 0; i < missing_value_code; i++)
563 read_string (r, string, sizeof string);
564 mv_add_str (&mv, string);
567 if (!var_is_long_string (var))
568 var_set_missing_values (var, &mv);
572 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
573 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
575 /* Account for values.
576 Skip long string continuation records, if any. */
577 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
583 for (i = 1; i < nv; i++)
585 /* Check for record type 2 and width -1. */
586 if (read_int (r) != 2 || read_int (r) != -1)
587 sys_error (r, _("Missing string continuation record."));
589 /* Skip and ignore remaining continuation data. */
590 has_variable_label = read_int (r);
591 missing_value_code = read_int (r);
592 print_format = read_int (r);
593 write_format = read_int (r);
594 read_string (r, name, sizeof name);
596 /* Variable label fields on continuation records have
597 been spotted in system files created by "SPSS Power
598 Macintosh Release 6.1". */
599 if (has_variable_label)
600 skip_bytes (r, ROUND_UP (read_int (r), 4));
605 /* Translates the format spec from sysfile format to internal
608 parse_format_spec (struct sfm_reader *r, unsigned int s,
609 enum which_format which, struct variable *v,
610 int *format_warning_cnt)
/* S is the packed on-disk format word; bits 16-23 hold the raw format
   type, which fmt_from_io() maps to an internal type.  WHICH selects
   whether V's print or write format is set.  *FORMAT_WARNING_CNT is
   the caller's running count of invalid-format warnings; only the
   first MAX_FORMAT_WARNINGS are reported. */
612 const int max_format_warnings = 8;
614 uint8_t raw_type = s >> 16;
620 if (!fmt_from_io (raw_type, &f.type))
621 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
626 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
631 if (which == PRINT_FORMAT)
632 var_set_print_format (v, &f);
634 var_set_write_format (v, &f);
/* Increment the caller's counter, not the pointer.  The previous
   `*++format_warning_cnt' advanced the pointer itself, so the count
   never grew and the comparisons read past the caller's int. */
636 else if (++*format_warning_cnt <= max_format_warnings)
638 char fmt_string[FMT_STRING_LEN_MAX + 1];
639 sys_warn (r, _("%s variable %s has invalid %s format %s."),
640 var_is_numeric (v) ? _("Numeric") : _("String"),
642 which == PRINT_FORMAT ? _("print") : _("write"),
643 fmt_to_string (&f, fmt_string));
/* On the last permitted warning, tell the user the rest are hidden. */
645 if (*format_warning_cnt == max_format_warnings)
646 sys_warn (r, _("Suppressing further invalid format warnings."));
650 /* Sets the weighting variable in DICT to the variable
651 corresponding to the given 1-based WEIGHT_IDX, if WEIGHT_IDX is
654 setup_weight (struct sfm_reader *r, int weight_idx,
655 struct variable **var_by_value_idx, struct dictionary *dict)
659 struct variable *weight_var
660 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
661 if (var_is_numeric (weight_var))
662 dict_set_weight (dict, weight_var);
664 sys_error (r, _("Weighting variable must be numeric."));
668 /* Reads a document record, type 6, from system file R, and sets up
669 the documents and n_documents fields in the associated
672 read_documents (struct sfm_reader *r, struct dictionary *dict)
677 if (dict_get_documents (dict) != NULL)
678 sys_error (r, _("Multiple type 6 (document) records."));
680 line_cnt = read_int (r);
682 sys_error (r, _("Number of document lines (%d) "
683 "must be greater than 0."), line_cnt);
685 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
686 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
687 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
688 dict_set_documents (dict, documents);
690 sys_error (r, _("Document line contains null byte."));
691 pool_free (r->pool, documents);
694 /* Read a type 7 extension record.
   Reads the subtype, element SIZE and element COUNT from R, rejects
   records whose total size (plus one byte for a null terminator)
   would overflow, and passes recognized subtypes to their dedicated
   readers along with DICT or INFO as needed. */
696 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
697 struct sfm_read_info *info)
699 int subtype = read_int (r);
700 size_t size = read_int (r);
701 size_t count = read_int (r);
702 size_t bytes = size * count;
704 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
705 allows an extra byte for a null terminator, used by some
706 extension processing routines. */
707 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
/* Marked with _() like the other diagnostics so it can be translated. */
708 sys_error (r, _("Record type 7 subtype %d too large."), subtype);
713 read_machine_integer_info (r, size, count, info);
717 read_machine_float_info (r, size, count);
721 /* Variable sets information. We don't use these yet.
722 They only apply to GUIs; see VARSETS on the APPLY
723 DICTIONARY command in SPSS documentation. */
727 /* DATE variable information. We don't use it yet, but we
732 /* Unknown purpose. */
736 read_display_parameters (r, size, count, dict);
740 read_long_var_name_map (r, size, count, dict);
744 read_long_string_map (r, size, count, dict);
748 /* New in SPSS v14? Unknown purpose. */
752 /* Text field that defines variable attributes. New in
757 /* New in SPSS 16. Contains a single string that describes
758 the character encoding, e.g. "windows-1252". */
762 /* New in SPSS 16. Encodes value labels for long string
764 sys_warn (r, _("Ignoring value labels for long string variables, "
765 "which PSPP does not yet support."));
769 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
773 skip_bytes (r, bytes);
776 /* Read record type 7, subtype 3.
   Reads eight integers from R, stores the three version numbers in
   INFO, and compares the floating-point and integer representation
   codes from the file with the formats already recorded in R.  A
   floating-point mismatch is reported with sys_error(), an integer
   mismatch with sys_warn().  SIZE and COUNT must be 4 and 8. */
778 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
779 struct sfm_read_info *info)
781 int version_major = read_int (r);
782 int version_minor = read_int (r);
783 int version_revision = read_int (r);
784 int machine_code UNUSED = read_int (r);
785 int float_representation = read_int (r);
786 int compression_code UNUSED = read_int (r);
787 int integer_representation = read_int (r);
788 int character_code UNUSED = read_int (r);
790 int expected_float_format;
791 int expected_integer_format;
793 if (size != 4 || count != 8)
794 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
798 /* Save version info. */
799 info->version_major = version_major;
800 info->version_minor = version_minor;
801 info->version_revision = version_revision;
803 /* Check floating point format. */
804 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
805 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
806 expected_float_format = 1;
807 else if (r->float_format == FLOAT_Z_LONG)
808 expected_float_format = 2;
809 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
810 expected_float_format = 3;
/* Report the code read from the file (FLOAT_REPRESENTATION), not R's
   internal enum float_format, which is a different numbering and
   would make the "indicated by system file" figure misleading. */
813 if (float_representation != expected_float_format)
814 sys_error (r, _("Floating-point representation indicated by "
815 "system file (%d) differs from expected (%d)."),
816 float_representation, expected_float_format);
818 /* Check integer format. */
819 if (r->integer_format == INTEGER_MSB_FIRST)
820 expected_integer_format = 1;
821 else if (r->integer_format == INTEGER_LSB_FIRST)
822 expected_integer_format = 2;
825 if (integer_representation != expected_integer_format)
/* Code 1 means MSB-first, so `== 1' indexes "big-endian". */
827 static const char *const endian[] = {N_("little-endian"), N_("big-endian")};
828 sys_warn (r, _("Integer format indicated by system file (%s) "
829 "differs from expected (%s)."),
830 gettext (endian[integer_representation == 1]),
831 gettext (endian[expected_integer_format == 1]));
835 /* Read record type 7, subtype 4. */
837 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
839 double sysmis = read_float (r);
840 double highest = read_float (r);
841 double lowest = read_float (r);
843 if (size != 8 || count != 3)
844 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
847 if (sysmis != SYSMIS)
848 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
849 if (highest != HIGHEST)
850 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
851 if (lowest != LOWEST)
852 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
855 /* Read record type 7, subtype 11, which specifies how variables
856 should be displayed in GUI environments. */
858 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
859 struct dictionary *dict)
868 sys_warn (r, _("Bad size %zu on extension 11."), size);
869 skip_bytes (r, size * count);
873 n_vars = dict_get_var_cnt (dict);
874 if (count == 3 * n_vars)
875 includes_width = true;
876 else if (count == 2 * n_vars)
877 includes_width = false;
880 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
882 skip_bytes (r, size * count);
886 for (i = 0; i < n_vars; ++i)
888 struct variable *v = dict_get_var (dict, i);
889 int measure = read_int (r);
890 int width = includes_width ? read_int (r) : 0;
891 int align = read_int (r);
893 /* SPSS 14 sometimes seems to set string variables' measure
895 if (0 == measure && var_is_alpha (v))
898 if (measure < 1 || measure > 3 || align < 0 || align > 2)
901 sys_warn (r, _("Invalid variable display parameters "
902 "for variable %zu (%s). "
903 "Default parameters substituted."),
904 i, var_get_name (v));
909 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
910 : measure == 2 ? MEASURE_ORDINAL
912 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
913 : align == 1 ? ALIGN_RIGHT
916 /* Older versions (SPSS 9.0) sometimes set the display
917 width to zero. This causes confusion in the GUI, so
918 only set the width if it is nonzero. */
920 var_set_display_width (v, width);
924 /* Reads record type 7, subtype 13, which gives the long name
925 that corresponds to each short name. Modifies variable names
926 in DICT accordingly. */
928 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
929 struct dictionary *dict)
931 struct variable_to_value_map *map;
932 struct variable *var;
936 map = open_variable_to_value_map (r, size * count);
937 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
941 size_t short_name_cnt;
944 /* Validate long name. */
945 if (!var_is_valid_name (long_name, false))
947 sys_warn (r, _("Long variable mapping from %s to invalid "
948 "variable name `%s'."),
949 var_get_name (var), long_name);
953 /* Identify any duplicates. */
954 if (strcasecmp (var_get_short_name (var, 0), long_name)
955 && dict_lookup_var (dict, long_name) != NULL)
957 sys_warn (r, _("Duplicate long variable name `%s' "
958 "within system file."), long_name);
962 /* Renaming a variable may clear its short names, but we
963 want to retain them, so we save them and re-set them
965 short_name_cnt = var_get_short_name_cnt (var);
966 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
967 for (i = 0; i < short_name_cnt; i++)
969 const char *s = var_get_short_name (var, i);
970 short_names[i] = s != NULL ? xstrdup (s) : NULL;
974 dict_rename_var (dict, var, long_name);
976 /* Restore short names. */
977 for (i = 0; i < short_name_cnt; i++)
979 var_set_short_name (var, i, short_names[i]);
980 free (short_names[i]);
984 close_variable_to_value_map (r, map);
985 r->has_long_var_names = true;
988 /* Reads record type 7, subtype 14, which gives the real length
989 of each very long string. Rearranges DICT accordingly. */
991 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
992 struct dictionary *dict)
994 struct variable_to_value_map *map;
995 struct variable *var;
999 map = open_variable_to_value_map (r, size * count);
1000 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
1003 size_t idx = var_get_dict_index (var);
1009 length = strtol (length_s, NULL, 10);
1010 if (length < 1 || length > MAX_STRING)
1012 sys_warn (r, _("%s listed as string of invalid length %s "
1013 "in very length string record."),
1014 var_get_name (var), length_s);
1018 /* Check segments. */
1019 segment_cnt = sfm_width_to_segments (length);
1020 if (segment_cnt == 1)
1022 sys_warn (r, _("%s listed in very long string record with width %s, "
1023 "which requires only one segment."),
1024 var_get_name (var), length_s);
1027 if (idx + segment_cnt > dict_get_var_cnt (dict))
1028 sys_error (r, _("Very long string %s overflows dictionary."),
1029 var_get_name (var));
1031 /* Get the short names from the segments and check their
1033 for (i = 0; i < segment_cnt; i++)
1035 struct variable *seg = dict_get_var (dict, idx + i);
1036 int alloc_width = sfm_segment_alloc_width (length, i);
1037 int width = var_get_width (seg);
1040 var_set_short_name (var, i, var_get_short_name (seg, 0));
1041 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1042 sys_error (r, _("Very long string with width %ld has segment %d "
1043 "of width %d (expected %d)"),
1044 length, i, width, alloc_width);
1046 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1047 var_set_width (var, length);
1049 close_variable_to_value_map (r, map);
1050 dict_compact_values (dict);
1053 /* Reads value labels from sysfile R and inserts them into the
1054 associated dictionary. */
1056 read_value_labels (struct sfm_reader *r,
1057 struct dictionary *dict, struct variable **var_by_value_idx)
1059 struct pool *subpool;
1063 char raw_value[8]; /* Value as uninterpreted bytes. */
1064 union value value; /* Value. */
1065 char *label; /* Null-terminated label string. */
1068 struct label *labels = NULL;
1069 int label_cnt; /* Number of labels. */
1071 struct variable **var = NULL; /* Associated variables. */
1072 int var_cnt; /* Number of associated variables. */
1076 subpool = pool_create_subpool (r->pool);
1078 /* Read the type 3 record and record its contents. We can't do
1079 much with the data yet because we don't know whether it is
1080 of numeric or string type. */
1082 /* Read number of labels. */
1083 label_cnt = read_int (r);
1085 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1087 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1092 /* Read each value/label tuple into labels[]. */
1093 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1094 for (i = 0; i < label_cnt; i++)
1096 struct label *label = labels + i;
1097 unsigned char label_len;
1101 read_bytes (r, label->raw_value, sizeof label->raw_value);
1103 /* Read label length. */
1104 read_bytes (r, &label_len, sizeof label_len);
1105 padded_len = ROUND_UP (label_len + 1, 8);
1107 /* Read label, padding. */
1108 label->label = pool_alloc (subpool, padded_len + 1);
1109 read_bytes (r, label->label, padded_len - 1);
1110 label->label[label_len] = 0;
1113 /* Now, read the type 4 record that has the list of variables
1114 to which the value labels are to be applied. */
1116 /* Read record type of type 4 record. */
1117 if (read_int (r) != 4)
1118 sys_error (r, _("Variable index record (type 4) does not immediately "
1119 "follow value label record (type 3) as it should."));
1121 /* Read number of variables associated with value label from type 4
1123 var_cnt = read_int (r);
1124 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1125 sys_error (r, _("Number of variables associated with a value label (%d) "
1126 "is not between 1 and the number of variables (%zu)."),
1127 var_cnt, dict_get_var_cnt (dict));
1129 /* Read the list of variables. */
1130 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1131 for (i = 0; i < var_cnt; i++)
1133 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1134 if (var_is_long_string (var[i]))
1135 sys_error (r, _("Value labels are not allowed on long string "
1136 "variables (%s)."), var_get_name (var[i]));
1139 /* Type check the variables. */
1140 for (i = 1; i < var_cnt; i++)
1141 if (var_get_type (var[i]) != var_get_type (var[0]))
1142 sys_error (r, _("Variables associated with value label are not all of "
1143 "identical type. Variable %s is %s, but variable "
1145 var_get_name (var[0]),
1146 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1147 var_get_name (var[i]),
1148 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1150 /* Fill in labels[].value, now that we know the desired type. */
1151 for (i = 0; i < label_cnt; i++)
1153 struct label *label = labels + i;
1155 if (var_is_alpha (var[0]))
1156 buf_copy_rpad (label->value.s, sizeof label->value.s,
1157 label->raw_value, sizeof label->raw_value);
1159 label->value.f = float_get_double (r->float_format, label->raw_value);
1162 /* Assign the `value_label's to each variable. */
1163 for (i = 0; i < var_cnt; i++)
1165 struct variable *v = var[i];
1168 /* Add each label to the variable. */
1169 for (j = 0; j < label_cnt; j++)
1171 struct label *label = &labels[j];
1172 if (!var_add_value_label (v, &label->value, label->label))
1174 if (var_is_numeric (var[0]))
1175 sys_warn (r, _("Duplicate value label for %g on %s."),
1176 label->value.f, var_get_name (v));
1178 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1179 var_get_width (v), label->value.s,
1185 pool_destroy (subpool);
1190 static void partial_record (struct sfm_reader *r)
1193 static void read_error (struct casereader *, const struct sfm_reader *);
1195 static bool read_case_number (struct sfm_reader *, double *);
1196 static bool read_case_string (struct sfm_reader *, char *, size_t);
1197 static int read_opcode (struct sfm_reader *);
1198 static bool read_compressed_number (struct sfm_reader *, double *);
1199 static bool read_compressed_string (struct sfm_reader *, char *);
1200 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1201 static bool skip_whole_strings (struct sfm_reader *, size_t);
1203 /* Reads one case from READER's file into C. Returns true only
1206 sys_file_casereader_read (struct casereader *reader, void *r_,
1209 struct sfm_reader *r = r_; /* R_ is used as the system file reader state. */
/* Set up C for r->value_cnt values. */
1215 case_create (c, r->value_cnt);
/* sys_error() longjmp()s back to this point with a nonzero value when
   reading fails; in that case READER is put into an error state. */
1216 if (setjmp (r->bail_out))
1218 casereader_force_error (reader);
/* Read the data for each system file variable into its place in C. */
1223 for (i = 0; i < r->sfm_var_cnt; i++)
1225 struct sfm_var *sv = &r->sfm_vars[i];
1226 union value *v = case_data_rw_idx (c, sv->case_index);
/* A number is read directly into the value's numeric member. */
1230 if (!read_case_number (r, &v->f))
/* A string is read at this variable's offset within the value, and
   then its padding, rounded down to a multiple of 8 bytes, is
   skipped. */
1235 if (!read_case_string (r, v->s + sv->offset, sv->width))
1237 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
/* A read error is reported only when the case count is not -1.
   NOTE(review): presumably -1 means that the number of cases is
   unknown -- confirm. */
1247 if (r->case_cnt != -1)
1248 read_error (reader, r);
1252 /* Issues an error that R ends in a partial record.
   The error is reported with sys_error(), which leaves via longjmp()
   to R's bail_out target instead of returning here. */
1254 partial_record (struct sfm_reader *r)
1256 sys_error (r, _("File ends in partial case."));
1259 /* Issues an error that an unspecified error occurred while reading SFM,
and forces casereader R into an error state. */
1262 read_error (struct casereader *r, const struct sfm_reader *sfm)
/* The message identifies the file by the name of SFM's file handle. */
1264 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
/* Put casereader R into an error state. */
1265 casereader_force_error (r);
1268 /* Reads a number from R and stores its value in *D.
1269 If R is compressed, reads a compressed number;
1270 otherwise, reads a number in the regular way.
1271 Returns true if successful, false if end of file is
1272 reached immediately. */
1274 read_case_number (struct sfm_reader *r, double *d)
/* Regular way: read the raw bytes of one number, then convert them
   from the file's floating-point format to the native double
   format. */
1279 if (!try_read_bytes (r, number, sizeof number))
1281 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
/* Compressed way: the value depends on the next compression opcode. */
1285 return read_compressed_number (r, d);
1288 /* Reads LENGTH string bytes from R into S.
1289 Always reads a multiple of 8 bytes; if LENGTH is not a
1290 multiple of 8, then extra bytes are read and discarded without
being stored in S.
1292 Reads compressed strings if R is compressed.
1293 Returns true if successful, false if end of file is
1294 reached immediately. */
1296 read_case_string (struct sfm_reader *r, char *s, size_t length)
1298 size_t whole = ROUND_DOWN (length, 8); /* Bytes in complete 8-byte units. */
1299 size_t partial = length % 8; /* Leftover bytes, fewer than 8. */
/* The complete 8-byte units are read directly into S. */
1303 if (!read_whole_strings (r, s, whole))
/* The final unit is read in full into a bounce buffer, because only
   its first PARTIAL bytes belong in S. */
1310 if (!read_whole_strings (r, bounce, sizeof bounce))
1316 memcpy (s + whole, bounce, partial);
1322 /* Reads and returns the next compression opcode from R.
   Opcodes are fetched from the file sizeof r->opcodes bytes at a
   time and then handed out one by one. */
1324 read_opcode (struct sfm_reader *r)
1326 assert (r->compressed); /* Must only be called for a compressed file. */
/* All of the buffered opcodes have been used, so fetch the next
   group from the file. */
1330 if (r->opcode_idx >= sizeof r->opcodes)
1332 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
/* Take the next buffered opcode and advance past it. */
1336 opcode = r->opcodes[r->opcode_idx++];
1343 /* Reads a compressed number from R and stores its value in D.
1344 Returns true if successful, false if end of file is
1345 reached immediately.
   NOTE(review): the opcode values that select each of the branches
   below are not visible here -- confirm against the file format. */
1347 read_compressed_number (struct sfm_reader *r, double *d)
1349 int opcode = read_opcode (r);
/* The number is stored in the file as a full floating-point value. */
1357 *d = read_float (r);
/* This opcode is not valid where a number is expected. */
1361 sys_error (r, _("Compressed data is corrupt."));
/* The opcode itself encodes the number, offset by R's bias. */
1368 *d = opcode - r->bias;
1375 /* Reads a compressed 8-byte string segment from R and stores it
in DST.
1377 Returns true if successful, false if end of file is
1378 reached immediately.
   NOTE(review): the opcode values that select each of the branches
   below are not visible here -- confirm against the file format. */
1380 read_compressed_string (struct sfm_reader *r, char *dst)
1382 switch (read_opcode (r))
/* The segment's 8 bytes follow literally in the file. */
1389 read_bytes (r, dst, 8);
/* The opcode alone stands for a segment of 8 spaces. */
1393 memset (dst, ' ', 8);
/* This opcode is not valid where a string segment is expected. */
1397 sys_error (r, _("Compressed data is corrupt."));
1403 /* Reads LENGTH string bytes from R into S.
1404 LENGTH must be a multiple of 8.
1405 Reads compressed strings if R is compressed.
1406 Returns true if successful, false if end of file is
1407 reached immediately. */
1409 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1411 assert (length % 8 == 0);
/* Uncompressed data is read with a single request. */
1413 return try_read_bytes (r, s, length);
/* Compressed data is read one 8-byte segment at a time. */
1417 for (ofs = 0; ofs < length; ofs += 8)
1418 if (!read_compressed_string (r, s + ofs))
1428 /* Skips LENGTH string bytes from R.
1429 LENGTH must be a multiple of 8.
1430 (LENGTH is also limited to 1024, but that's only because the
1431 current caller never needs more than that many bytes.)
1432 Returns true if successful, false if end of file is
1433 reached immediately. */
1435 skip_whole_strings (struct sfm_reader *r, size_t length)
/* NOTE(review): this requires LENGTH to be strictly less than the
   size of BUFFER, whereas the comment above speaks of a limit of
   1024 -- confirm which bound is intended. */
1438 assert (length < sizeof buffer);
/* Skipping is done by reading the bytes into BUFFER, whose contents
   are not used afterward. */
1439 return read_whole_strings (r, buffer, length);
1442 /* Creates and returns a table that can be used for translating a value
1443 index into a case to a "struct variable *" for DICT. Multiple
1444 system file fields reference variables this way.
1446 This table must be created before processing the very long
1447 string extension record, because that record causes some
1448 values to be deleted from the case and the dictionary to be
changed to match. */
1450 static struct variable **
1451 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1453 struct variable **var_by_value_idx;
/* The table has r->oct_cnt entries and is allocated from R's pool. */
1457 var_by_value_idx = pool_nmalloc (r->pool,
1458 r->oct_cnt, sizeof *var_by_value_idx);
1459 for (i = 0; i < dict_get_var_cnt (dict); i++)
1461 struct variable *v = dict_get_var (dict, i);
/* A numeric variable takes up 1 entry.  A string variable takes up
   1 entry per 8 bytes of width, rounded up. */
1462 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* The variable's first entry points to it; any further entries for
   the same variable are null. */
1465 var_by_value_idx[value_idx++] = v;
1466 for (j = 1; j < nv; j++)
1467 var_by_value_idx[value_idx++] = NULL;
/* The dictionary's variables must account for every entry. */
1469 assert (value_idx == r->oct_cnt);
1471 return var_by_value_idx;
1474 /* Returns the "struct variable" corresponding to the given
1475 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
is in the range 1...R->oct_cnt, reporting a file error if it is not. */
1477 static struct variable *
1478 lookup_var_by_value_idx (struct sfm_reader *r,
1479 struct variable **var_by_value_idx, int value_idx)
1481 struct variable *var;
/* Reject an index that falls outside the table. */
1483 if (value_idx < 1 || value_idx > r->oct_cnt)
1484 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1485 value_idx, r->oct_cnt);
/* VALUE_IDX counts from 1, but the table is indexed from 0. */
1487 var = var_by_value_idx[value_idx - 1];
/* NOTE(review): presumably reported when the entry is null, which
   make_var_by_value_idx() uses for the second and later entries of a
   string variable -- the guarding condition is not visible here. */
1489 sys_error (r, _("Variable index %d refers to long string "
1496 /* Returns the variable in D with the given SHORT_NAME,
1497 or a null pointer if there is none.
   Short names are compared without regard to case. */
1498 static struct variable *
1499 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1501 struct variable *var;
1505 /* First try looking up by full name. This often succeeds.
   The result is accepted only if its short name also matches
   SHORT_NAME. */
1506 var = dict_lookup_var (d, short_name);
1507 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
1510 /* Iterate through the whole dictionary as a fallback. */
1511 var_cnt = dict_get_var_cnt (d);
1512 for (i = 0; i < var_cnt; i++)
1514 var = dict_get_var (d, i);
1515 if (!strcasecmp (var_get_short_name (var, 0), short_name))
1522 /* Helpers for reading records that contain "variable=value"
pairs. */
/* State for stepping through the contents of one such record. */
1526 struct variable_to_value_map
1528 struct substring buffer; /* Record contents. */
1529 size_t pos; /* Current position in buffer. */
1532 /* Reads SIZE bytes into a "variable=value" map for R,
1533 and returns the map.
   Both the map and its buffer are allocated from R's pool. */
1534 static struct variable_to_value_map *
1535 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1537 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated.  NOTE(review): presumably
   this leaves room for read_variable_to_value_map() to store a null
   terminator just past the last byte of the record -- confirm. */
1538 char *buffer = pool_malloc (r->pool, size + 1);
1539 read_bytes (r, buffer, size);
1540 map->buffer = ss_buffer (buffer, size);
1545 /* Closes MAP and frees its storage.
1546 Not really needed, because the pool will free the map anyway,
1547 but can be used to free it earlier. */
1549 close_variable_to_value_map (struct sfm_reader *r,
1550 struct variable_to_value_map *map)
/* This releases the record buffer back to R's pool.  NOTE(review):
   the map structure itself is not freed by this call and so stays
   allocated until the pool is destroyed. */
1552 pool_free (r->pool, ss_data (map->buffer));
1555 /* Reads the next variable=value pair from MAP.
1556 Looks up the variable in DICT and stores it into *VAR.
1557 Stores a null-terminated value into *VALUE.
   The value is terminated in place, so *VALUE points into MAP's
   buffer.  Warnings about unknown variables are counted in
   *WARNING_CNT, and only the first few of them are displayed. */
1559 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1560 struct variable_to_value_map *map,
1561 struct variable **var, char **value,
1564 int max_warnings = 5; /* Most unknown-variable warnings to display. */
1568 struct substring short_name_ss, value_ss;
/* The variable name runs up to "=" and the value runs up to a tab or
   a null byte.  If either one cannot be obtained, then there are no
   more pairs to read. */
1570 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1571 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
/* Summarize the warnings that were counted but not displayed. */
1574 if (*warning_cnt > max_warnings)
1575 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1576 *warning_cnt - max_warnings);
/* Advance past any tabs and null bytes that come next. */
1580 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1581 ss_buffer ("\t\0", 2));
/* Null-terminate the name in place so that it can be used as a C
   string for the lookup. */
1583 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1584 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Count every warning, but display only the first MAX_WARNINGS. */
1587 if (++*warning_cnt <= max_warnings)
1588 sys_warn (r, _("Variable map refers to unknown variable %s."),
1589 ss_data (short_name_ss));
/* Null-terminate the value in place and hand back a pointer to it. */
1593 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1594 *value = ss_data (value_ss);
1602 /* Displays a corruption message.
   The text of the message is the name of R's file and the current
   offset in it, followed by FORMAT as expanded with ARGS.  CLASS
   determines the category and severity of the message. */
1604 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1609 ds_init_empty (&text);
1610 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1611 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1612 ds_put_vformat (&text, format, args);
1614 m.category = msg_class_to_category (class);
1615 m.severity = msg_class_to_severity (class);
/* No file name or line number is attached to the message; the
   position in the system file is part of the text instead. */
1616 m.where.file_name = NULL;
1617 m.where.line_number = 0;
1618 m.text = ds_cstr (&text);
1623 /* Displays a warning for the current file position.
   FORMAT and the arguments that follow it are passed on to
   sys_msg() with message class MW. */
1625 sys_warn (struct sfm_reader *r, const char *format, ...)
1629 va_start (args, format);
1630 sys_msg (r, MW, format, args);
1634 /* Displays an error for the current file position,
1635 marks it as in an error state,
1636 and aborts reading it using longjmp.
   FORMAT and the arguments that follow it are passed on to
   sys_msg() with message class ME.  Does not return to its caller. */
1638 sys_error (struct sfm_reader *r, const char *format, ...)
1642 va_start (args, format);
1643 sys_msg (r, ME, format, args);
/* Control resumes at the setjmp() on R's bail_out, which then yields
   the value 1. */
1647 longjmp (r->bail_out, 1);
1650 /* Reads BYTE_CNT bytes into BUF.
1651 Returns true if exactly BYTE_CNT bytes are successfully read.
1652 Aborts if an I/O error or a partial read occurs.
1653 If EOF_IS_OK, then an immediate end-of-file causes false to be
1654 returned; otherwise, immediate end-of-file causes an abort
as well. */
1657 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1658 void *buf, size_t byte_cnt)
1660 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1661 if (bytes_read == byte_cnt)
/* A short read caused by an I/O error is reported along with the
   system's description of the error. */
1663 else if (ferror (r->file))
1664 sys_error (r, _("System error: %s."), strerror (errno));
/* Otherwise the short read was caused by end of file.  That is an
   error unless the caller allows end of file and no bytes at all
   were read. */
1665 else if (!eof_is_ok || bytes_read != 0)
1666 sys_error (r, _("Unexpected end of file."));
1671 /* Reads BYTE_CNT bytes into BUF.
1672 Aborts upon I/O error or if end-of-file is encountered. */
1674 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1676 read_bytes_internal (r, false, buf, byte_cnt); /* false: end of file is an error. */
1679 /* Reads BYTE_CNT bytes into BUF.
1680 Returns true if exactly BYTE_CNT bytes are successfully read.
1681 Returns false if an immediate end-of-file is encountered.
1682 Aborts if an I/O error or a partial read occurs. */
1684 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1686 return read_bytes_internal (r, true, buf, byte_cnt); /* true: immediate end of file is allowed. */
1689 /* Reads a 32-bit signed integer from R and returns its value in
host format. */
1692 read_int (struct sfm_reader *r)
/* Read the integer's raw bytes; end of file here is an error. */
1695 read_bytes (r, integer, sizeof integer);
/* Interpret the raw bytes according to the file's integer format. */
1696 return integer_get (r->integer_format, integer, sizeof integer);
1699 /* Reads a 64-bit floating-point number from R and returns its
1700 value in host format.
   Aborts upon I/O error or if end-of-file is encountered. */
1702 read_float (struct sfm_reader *r)
1705 read_bytes (r, number, sizeof number);
/* Interpret the raw bytes according to the file's floating-point
   format. */
1706 return float_get_double (r->float_format, number);
1709 /* Reads exactly SIZE - 1 bytes into BUFFER
1710 and stores a null byte into BUFFER[SIZE - 1].
   BUFFER must therefore have room for SIZE bytes, and SIZE must be
   at least 1 because SIZE - 1 would otherwise wrap around.
   Aborts upon I/O error or if end-of-file is encountered. */
1712 read_string (struct sfm_reader *r, char *buffer, size_t size)
1715 read_bytes (r, buffer, size - 1);
1716 buffer[size - 1] = '\0';
1719 /* Skips BYTES bytes forward in R.
   The bytes are skipped by reading them, so this aborts upon I/O
   error or if end-of-file is encountered. */
1721 skip_bytes (struct sfm_reader *r, size_t bytes)
/* At most sizeof buffer bytes are read in one request. */
1726 size_t chunk = MIN (sizeof buffer, bytes);
1727 read_bytes (r, buffer, chunk);
1732 static const struct casereader_class sys_file_casereader_class =
1734 sys_file_casereader_read,
1735 sys_file_casereader_destroy,