1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/assertion.h>
29 #include <libpspp/message.h>
30 #include <libpspp/compiler.h>
31 #include <libpspp/misc.h>
32 #include <libpspp/pool.h>
33 #include <libpspp/str.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/array.h>
37 #include <data/case.h>
38 #include <data/casereader-provider.h>
39 #include <data/casereader.h>
40 #include <data/dictionary.h>
41 #include <data/file-handle-def.h>
42 #include <data/file-name.h>
43 #include <data/format.h>
44 #include <data/missing-values.h>
45 #include <data/short-names.h>
46 #include <data/value-labels.h>
47 #include <data/variable.h>
48 #include <data/value.h>
53 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 struct fh_lock *lock; /* Mutual exclusion for file handle. */
71 FILE *file; /* File stream. */
72 bool error; /* I/O or corruption error? */
73 size_t value_cnt; /* Number of "union value"s in struct case. */
76 enum integer_format integer_format; /* On-disk integer format. */
77 enum float_format float_format; /* On-disk floating point format. */
78 int oct_cnt; /* Number of 8-byte units per case. */
79 struct sfm_var *sfm_vars; /* Variables. */
80 size_t sfm_var_cnt; /* Number of variables. */
81 casenumber case_cnt; /* Number of cases */
82 bool has_long_var_names; /* File has a long variable name map */
85 bool compressed; /* File is compressed? */
86 double bias; /* Compression bias, usually 100.0. */
87 uint8_t opcodes[8]; /* Current block of opcodes. */
88 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 static const struct casereader_class sys_file_casereader_class;
93 static bool close_reader (struct sfm_reader *);
95 static struct variable **make_var_by_value_idx (struct sfm_reader *,
97 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
101 static void sys_warn (struct sfm_reader *, const char *, ...)
102 PRINTF_FORMAT (2, 3);
104 static void sys_error (struct sfm_reader *, const char *, ...)
108 static void read_bytes (struct sfm_reader *, void *, size_t);
109 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
110 static int read_int (struct sfm_reader *);
111 static double read_float (struct sfm_reader *);
112 static void read_string (struct sfm_reader *, char *, size_t);
113 static void skip_bytes (struct sfm_reader *, size_t);
115 static struct variable_to_value_map *open_variable_to_value_map (
116 struct sfm_reader *, size_t size);
117 static void close_variable_to_value_map (struct sfm_reader *r,
118 struct variable_to_value_map *);
119 static bool read_variable_to_value_map (struct sfm_reader *,
121 struct variable_to_value_map *,
122 struct variable **var, char **value,
125 static bool close_reader (struct sfm_reader *r);
127 /* Dictionary reader. */
135 static void read_header (struct sfm_reader *, struct dictionary *,
136 int *weight_idx, int *claimed_oct_cnt,
137 struct sfm_read_info *);
138 static void read_variable_record (struct sfm_reader *, struct dictionary *,
139 int *format_warning_cnt);
140 static void parse_format_spec (struct sfm_reader *, unsigned int,
141 enum which_format, struct variable *,
142 int *format_warning_cnt);
143 static void setup_weight (struct sfm_reader *, int weight_idx,
144 struct variable **var_by_value_idx,
145 struct dictionary *);
146 static void read_documents (struct sfm_reader *, struct dictionary *);
147 static void read_value_labels (struct sfm_reader *, struct dictionary *,
148 struct variable **var_by_value_idx);
150 static void read_extension_record (struct sfm_reader *, struct dictionary *,
151 struct sfm_read_info *);
152 static void read_machine_integer_info (struct sfm_reader *,
153 size_t size, size_t count,
154 struct sfm_read_info *);
155 static void read_machine_float_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *volatile info)
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 struct sfm_read_info local_info;
179 int format_warning_cnt = 0;
184 *dict = dict_create ();
186 /* Create and initialize reader. */
187 r = pool_create_container (struct sfm_reader, pool);
193 r->has_long_var_names = false;
194 r->opcode_idx = sizeof r->opcodes;
196 /* TRANSLATORS: this fragment will be interpolated into
197 messages in fh_lock() that identify types of files. */
198 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
202 r->file = fn_open (fh_get_file_name (fh), "rb");
205 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
206 fh_get_file_name (r->fh), strerror (errno));
210 /* Initialize info. */
213 memset (info, 0, sizeof *info);
215 if (setjmp (r->bail_out))
220 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
222 /* Read all the variable definition records. */
223 rec_type = read_int (r);
224 while (rec_type == 2)
226 read_variable_record (r, *dict, &format_warning_cnt);
227 rec_type = read_int (r);
230 /* Figure out the case format. */
231 var_by_value_idx = make_var_by_value_idx (r, *dict);
232 setup_weight (r, weight_idx, var_by_value_idx, *dict);
234 /* Read all the rest of the dictionary records. */
235 while (rec_type != 999)
240 read_value_labels (r, *dict, var_by_value_idx);
244 sys_error (r, _("Misplaced type 4 record."));
247 read_documents (r, *dict);
251 read_extension_record (r, *dict, info);
255 sys_error (r, _("Unrecognized record type %d."), rec_type);
257 rec_type = read_int (r);
261 if ( ! r->has_long_var_names )
264 for (i = 0; i < dict_get_var_cnt (*dict); i++)
266 struct variable *var = dict_get_var (*dict, i);
267 char short_name[SHORT_NAME_LEN + 1];
268 char long_name[SHORT_NAME_LEN + 1];
270 strcpy (short_name, var_get_name (var));
272 strcpy (long_name, short_name);
273 str_lowercase (long_name);
275 /* Set long name. Renaming a variable may clear the short
276 name, but we want to retain it, so re-set it
278 dict_rename_var (*dict, var, long_name);
279 var_set_short_name (var, 0, short_name);
282 r->has_long_var_names = true;
285 /* Read record 999 data, which is just filler. */
288 /* Warn if the actual amount of data per case differs from the
289 amount that the header claims. SPSS version 13 gets this
290 wrong when very long strings are involved, so don't warn in
292 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
293 && info->version_major != 13)
294 sys_warn (r, _("File header claims %d variable positions but "
295 "%d were read from file."),
296 claimed_oct_cnt, r->oct_cnt);
298 /* Create an index of dictionary variable widths for
299 sfm_read_case to use. We cannot use the `struct variable's
300 from the dictionary we created, because the caller owns the
301 dictionary and may destroy or modify its variables. */
302 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
303 pool_register (r->pool, free, r->sfm_vars);
305 pool_free (r->pool, var_by_value_idx);
306 r->value_cnt = dict_get_next_value_idx (*dict);
307 return casereader_create_sequential
309 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
310 &sys_file_casereader_class, r);
314 dict_destroy (*dict);
319 /* Closes a system file after we're done with it.
320 Returns true if an I/O error has occurred on READER, false
323 close_reader (struct sfm_reader *r)
332 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
334 msg (ME, _("Error closing system file \"%s\": %s."),
335 fh_get_file_name (r->fh), strerror (errno));
345 pool_destroy (r->pool);
350 /* Destroys READER. */
352 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
354 struct sfm_reader *r = r_;
/* Reports whether FILE looks like an SPSS system file, by
   checking that its first four bytes are the "$FL2" signature.
   Returns false if those four bytes cannot be read or do not
   match.  FILE is not rewound afterward. */
bool
sfm_detect (FILE *file)
{
  static const char magic[] = "$FL2";
  char signature[sizeof magic];
  size_t n_items;

  /* Read exactly as many bytes as the signature has, leaving
     room for the terminator added below. */
  n_items = fread (signature, sizeof magic - 1, 1, file);
  if (n_items != 1)
    return false;
  signature[sizeof magic - 1] = '\0';

  return strcmp (magic, signature) == 0;
}
372 /* Reads the global header of the system file.
373 Sets DICT's file label to the system file's label.
374 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
375 or to the value index of the weight variable otherwise.
376 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
377 per case that the file claims to have (although it is not
379 Initializes INFO with header information. */
381 read_header (struct sfm_reader *r, struct dictionary *dict,
382 int *weight_idx, int *claimed_oct_cnt,
383 struct sfm_read_info *info)
386 char eye_catcher[61];
387 uint8_t raw_layout_code[4];
389 char creation_date[10];
390 char creation_time[9];
392 struct substring file_label_ss;
393 struct substring product;
395 read_string (r, rec_type, sizeof rec_type);
396 read_string (r, eye_catcher, sizeof eye_catcher);
398 if (strcmp ("$FL2", rec_type) != 0)
399 sys_error (r, _("This is not an SPSS system file."));
401 /* Identify integer format. */
402 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
403 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
405 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
407 || (r->integer_format != INTEGER_MSB_FIRST
408 && r->integer_format != INTEGER_LSB_FIRST))
409 sys_error (r, _("This is not an SPSS system file."));
411 *claimed_oct_cnt = read_int (r);
412 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
413 *claimed_oct_cnt = -1;
415 r->compressed = read_int (r) != 0;
417 *weight_idx = read_int (r);
419 r->case_cnt = read_int (r);
420 if ( r->case_cnt > INT_MAX / 2)
424 /* Identify floating-point format and obtain compression bias. */
425 read_bytes (r, raw_bias, sizeof raw_bias);
426 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
428 sys_warn (r, _("Compression bias is not the usual "
429 "value of 100, or system file uses unrecognized "
430 "floating-point format."));
431 if (r->integer_format == INTEGER_MSB_FIRST)
432 r->float_format = FLOAT_IEEE_DOUBLE_BE;
434 r->float_format = FLOAT_IEEE_DOUBLE_LE;
436 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
438 read_string (r, creation_date, sizeof creation_date);
439 read_string (r, creation_time, sizeof creation_time);
440 read_string (r, file_label, sizeof file_label);
443 file_label_ss = ss_cstr (file_label);
444 ss_trim (&file_label_ss, ss_cstr (" "));
445 if (!ss_is_empty (file_label_ss))
447 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
448 dict_set_label (dict, ss_data (file_label_ss));
451 strcpy (info->creation_date, creation_date);
452 strcpy (info->creation_time, creation_time);
453 info->integer_format = r->integer_format;
454 info->float_format = r->float_format;
455 info->compressed = r->compressed;
456 info->case_cnt = r->case_cnt;
458 product = ss_cstr (eye_catcher);
459 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
460 ss_trim (&product, ss_cstr (" "));
461 str_copy_buf_trunc (info->product, sizeof info->product,
462 ss_data (product), ss_length (product));
465 /* Reads a variable (type 2) record from R and adds the
466 corresponding variable to DICT.
467 Also skips past additional variable records for long string
470 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
471 int *format_warning_cnt)
474 int has_variable_label;
475 int missing_value_code;
480 struct variable *var;
483 width = read_int (r);
484 has_variable_label = read_int (r);
485 missing_value_code = read_int (r);
486 print_format = read_int (r);
487 write_format = read_int (r);
488 read_string (r, name, sizeof name);
489 name[strcspn (name, " ")] = '\0';
491 /* Check variable name. */
492 if (name[0] == '$' || name[0] == '#')
493 sys_error (r, "Variable name begins with invalid character `%c'.",
495 if (!var_is_plausible_name (name, false))
496 sys_error (r, _("Invalid variable name `%s'."), name);
498 /* Create variable. */
499 if (width < 0 || width > 255)
500 sys_error (r, _("Bad variable width %d."), width);
501 var = dict_create_var (dict, name, width);
504 _("Duplicate variable name `%s' within system file."),
507 /* Set the short name the same as the long name. */
508 var_set_short_name (var, 0, var_get_name (var));
510 /* Get variable label, if any. */
511 if (has_variable_label != 0 && has_variable_label != 1)
512 sys_error (r, _("Variable label indicator field is not 0 or 1."));
513 if (has_variable_label == 1)
519 if (len >= sizeof label)
520 sys_error (r, _("Variable %s has label of invalid length %zu."),
522 read_string (r, label, len + 1);
523 var_set_label (var, label);
525 skip_bytes (r, ROUND_UP (len, 4) - len);
528 /* Set missing values. */
529 if (missing_value_code != 0)
531 struct missing_values mv;
534 mv_init (&mv, var_get_width (var));
535 if (var_is_numeric (var))
537 if (missing_value_code < -3 || missing_value_code > 3
538 || missing_value_code == -1)
539 sys_error (r, _("Numeric missing value indicator field is not "
540 "-3, -2, 0, 1, 2, or 3."));
541 if (missing_value_code < 0)
543 double low = read_float (r);
544 double high = read_float (r);
545 mv_add_range (&mv, low, high);
546 missing_value_code = -missing_value_code - 2;
548 for (i = 0; i < missing_value_code; i++)
549 mv_add_num (&mv, read_float (r));
553 if (missing_value_code < 1 || missing_value_code > 3)
554 sys_error (r, _("String missing value indicator field is not "
556 if (var_is_long_string (var))
557 sys_warn (r, _("Ignoring missing values on long string variable "
558 "%s, which PSPP does not yet support."), name);
559 for (i = 0; i < missing_value_code; i++)
562 read_string (r, string, sizeof string);
563 mv_add_str (&mv, string);
566 if (!var_is_long_string (var))
567 var_set_missing_values (var, &mv);
571 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
572 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
574 /* Account for values.
575 Skip long string continuation records, if any. */
576 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
582 for (i = 1; i < nv; i++)
584 /* Check for record type 2 and width -1. */
585 if (read_int (r) != 2 || read_int (r) != -1)
586 sys_error (r, _("Missing string continuation record."));
588 /* Skip and ignore remaining continuation data. */
589 has_variable_label = read_int (r);
590 missing_value_code = read_int (r);
591 print_format = read_int (r);
592 write_format = read_int (r);
593 read_string (r, name, sizeof name);
595 /* Variable label fields on continuation records have
596 been spotted in system files created by "SPSS Power
597 Macintosh Release 6.1". */
598 if (has_variable_label)
599 skip_bytes (r, ROUND_UP (read_int (r), 4));
604 /* Translates the format spec from sysfile format to internal
607 parse_format_spec (struct sfm_reader *r, unsigned int s,
608 enum which_format which, struct variable *v,
609 int *format_warning_cnt)
611 const int max_format_warnings = 8;
613 uint8_t raw_type = s >> 16;
619 if (!fmt_from_io (raw_type, &f.type))
620 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
625 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
630 if (which == PRINT_FORMAT)
631 var_set_print_format (v, &f);
633 var_set_write_format (v, &f);
635 else if (*++format_warning_cnt <= max_format_warnings)
637 char fmt_string[FMT_STRING_LEN_MAX + 1];
638 sys_warn (r, _("%s variable %s has invalid %s format %s."),
639 var_is_numeric (v) ? _("Numeric") : _("String"),
641 which == PRINT_FORMAT ? _("print") : _("write"),
642 fmt_to_string (&f, fmt_string));
644 if (*format_warning_cnt == max_format_warnings)
645 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable at 1-based value index WEIGHT_IDX the
   weighting variable of DICT.  Does nothing if WEIGHT_IDX is 0.
   The variable is found through VAR_BY_VALUE_IDX; if it is not
   numeric, the problem is reported with sys_error. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *wv;

  if (weight_idx == 0)
    return;

  wv = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (!var_is_numeric (wv))
    sys_error (r, _("Weighting variable must be numeric."));
  else
    dict_set_weight (dict, wv);
}
667 /* Reads a document record, type 6, from system file R, and sets up
668 the documents and n_documents fields in the associated
671 read_documents (struct sfm_reader *r, struct dictionary *dict)
676 if (dict_get_documents (dict) != NULL)
677 sys_error (r, _("Multiple type 6 (document) records."));
679 line_cnt = read_int (r);
681 sys_error (r, _("Number of document lines (%d) "
682 "must be greater than 0."), line_cnt);
684 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
685 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
686 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
687 dict_set_documents (dict, documents);
689 sys_error (r, _("Document line contains null byte."));
690 pool_free (r->pool, documents);
693 /* Read a type 7 extension record. */
695 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
696 struct sfm_read_info *info)
698 int subtype = read_int (r);
699 size_t size = read_int (r);
700 size_t count = read_int (r);
701 size_t bytes = size * count;
703 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
704 allows an extra byte for a null terminator, used by some
705 extension processing routines. */
706 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
707 sys_error (r, "Record type 7 subtype %d too large.", subtype);
712 read_machine_integer_info (r, size, count, info);
716 read_machine_float_info (r, size, count);
720 /* Variable sets information. We don't use these yet.
721 They only apply to GUIs; see VARSETS on the APPLY
722 DICTIONARY command in SPSS documentation. */
726 /* DATE variable information. We don't use it yet, but we
731 /* Used by the MRSETS command. */
735 /* Used by the SPSS Data Entry software. */
739 read_display_parameters (r, size, count, dict);
743 read_long_var_name_map (r, size, count, dict);
747 read_long_string_map (r, size, count, dict);
751 /* New in SPSS v14? Unknown purpose. */
755 /* Text field that defines variable attributes. New in
760 /* New in SPSS 16. Contains a single string that describes
761 the character encoding, e.g. "windows-1252". */
765 /* New in SPSS 16. Encodes value labels for long string
767 sys_warn (r, _("Ignoring value labels for long string variables, "
768 "which PSPP does not yet support."));
772 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"),
773 subtype, PACKAGE_BUGREPORT);
777 skip_bytes (r, bytes);
780 /* Read record type 7, subtype 3. */
782 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
783 struct sfm_read_info *info)
785 int version_major = read_int (r);
786 int version_minor = read_int (r);
787 int version_revision = read_int (r);
788 int machine_code UNUSED = read_int (r);
789 int float_representation = read_int (r);
790 int compression_code UNUSED = read_int (r);
791 int integer_representation = read_int (r);
792 int character_code UNUSED = read_int (r);
794 int expected_float_format;
795 int expected_integer_format;
797 if (size != 4 || count != 8)
798 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
802 /* Save version info. */
803 info->version_major = version_major;
804 info->version_minor = version_minor;
805 info->version_revision = version_revision;
807 /* Check floating point format. */
808 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
809 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
810 expected_float_format = 1;
811 else if (r->float_format == FLOAT_Z_LONG)
812 expected_float_format = 2;
813 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
814 expected_float_format = 3;
817 if (float_representation != expected_float_format)
818 sys_error (r, _("Floating-point representation indicated by "
819 "system file (%d) differs from expected (%d)."),
820 r->float_format, expected_float_format);
822 /* Check integer format. */
823 if (r->integer_format == INTEGER_MSB_FIRST)
824 expected_integer_format = 1;
825 else if (r->integer_format == INTEGER_LSB_FIRST)
826 expected_integer_format = 2;
829 if (integer_representation != expected_integer_format)
831 static const char *const endian[] = {N_("little-endian"), N_("big-endian")};
832 sys_warn (r, _("Integer format indicated by system file (%s) "
833 "differs from expected (%s)."),
834 gettext (endian[integer_representation == 1]),
835 gettext (endian[expected_integer_format == 1]));
839 /* Read record type 7, subtype 4. */
841 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
843 double sysmis = read_float (r);
844 double highest = read_float (r);
845 double lowest = read_float (r);
847 if (size != 8 || count != 3)
848 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
851 if (sysmis != SYSMIS)
852 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
853 if (highest != HIGHEST)
854 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
855 if (lowest != LOWEST)
856 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
859 /* Read record type 7, subtype 11, which specifies how variables
860 should be displayed in GUI environments. */
862 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
863 struct dictionary *dict)
872 sys_warn (r, _("Bad size %zu on extension 11."), size);
873 skip_bytes (r, size * count);
877 n_vars = dict_get_var_cnt (dict);
878 if (count == 3 * n_vars)
879 includes_width = true;
880 else if (count == 2 * n_vars)
881 includes_width = false;
884 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
886 skip_bytes (r, size * count);
890 for (i = 0; i < n_vars; ++i)
892 struct variable *v = dict_get_var (dict, i);
893 int measure = read_int (r);
894 int width = includes_width ? read_int (r) : 0;
895 int align = read_int (r);
897 /* SPSS 14 sometimes seems to set string variables' measure
899 if (0 == measure && var_is_alpha (v))
902 if (measure < 1 || measure > 3 || align < 0 || align > 2)
905 sys_warn (r, _("Invalid variable display parameters "
906 "for variable %zu (%s). "
907 "Default parameters substituted."),
908 i, var_get_name (v));
913 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
914 : measure == 2 ? MEASURE_ORDINAL
916 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
917 : align == 1 ? ALIGN_RIGHT
920 /* Older versions (SPSS 9.0) sometimes set the display
921 width to zero. This causes confusion in the GUI, so
922 only set the width if it is nonzero. */
924 var_set_display_width (v, width);
928 /* Reads record type 7, subtype 13, which gives the long name
929 that corresponds to each short name. Modifies variable names
930 in DICT accordingly. */
932 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
933 struct dictionary *dict)
935 struct variable_to_value_map *map;
936 struct variable *var;
940 map = open_variable_to_value_map (r, size * count);
941 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
945 size_t short_name_cnt;
948 /* Validate long name. */
949 if (!var_is_valid_name (long_name, false))
951 sys_warn (r, _("Long variable mapping from %s to invalid "
952 "variable name `%s'."),
953 var_get_name (var), long_name);
957 /* Identify any duplicates. */
958 if (strcasecmp (var_get_short_name (var, 0), long_name)
959 && dict_lookup_var (dict, long_name) != NULL)
961 sys_warn (r, _("Duplicate long variable name `%s' "
962 "within system file."), long_name);
966 /* Renaming a variable may clear its short names, but we
967 want to retain them, so we save them and re-set them
969 short_name_cnt = var_get_short_name_cnt (var);
970 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
971 for (i = 0; i < short_name_cnt; i++)
973 const char *s = var_get_short_name (var, i);
974 short_names[i] = s != NULL ? xstrdup (s) : NULL;
978 dict_rename_var (dict, var, long_name);
980 /* Restore short names. */
981 for (i = 0; i < short_name_cnt; i++)
983 var_set_short_name (var, i, short_names[i]);
984 free (short_names[i]);
988 close_variable_to_value_map (r, map);
989 r->has_long_var_names = true;
992 /* Reads record type 7, subtype 14, which gives the real length
993 of each very long string. Rearranges DICT accordingly. */
995 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
996 struct dictionary *dict)
998 struct variable_to_value_map *map;
999 struct variable *var;
1001 int warning_cnt = 0;
1003 map = open_variable_to_value_map (r, size * count);
1004 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
1007 size_t idx = var_get_dict_index (var);
1013 length = strtol (length_s, NULL, 10);
1014 if (length < 1 || length > MAX_STRING)
1016 sys_warn (r, _("%s listed as string of invalid length %s "
1017 "in very length string record."),
1018 var_get_name (var), length_s);
1022 /* Check segments. */
1023 segment_cnt = sfm_width_to_segments (length);
1024 if (segment_cnt == 1)
1026 sys_warn (r, _("%s listed in very long string record with width %s, "
1027 "which requires only one segment."),
1028 var_get_name (var), length_s);
1031 if (idx + segment_cnt > dict_get_var_cnt (dict))
1032 sys_error (r, _("Very long string %s overflows dictionary."),
1033 var_get_name (var));
1035 /* Get the short names from the segments and check their
1037 for (i = 0; i < segment_cnt; i++)
1039 struct variable *seg = dict_get_var (dict, idx + i);
1040 int alloc_width = sfm_segment_alloc_width (length, i);
1041 int width = var_get_width (seg);
1044 var_set_short_name (var, i, var_get_short_name (seg, 0));
1045 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1046 sys_error (r, _("Very long string with width %ld has segment %d "
1047 "of width %d (expected %d)"),
1048 length, i, width, alloc_width);
1050 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1051 var_set_width (var, length);
1053 close_variable_to_value_map (r, map);
1054 dict_compact_values (dict);
1057 /* Reads value labels from sysfile H and inserts them into the
1058 associated dictionary. */
1060 read_value_labels (struct sfm_reader *r,
1061 struct dictionary *dict, struct variable **var_by_value_idx)
1063 struct pool *subpool;
1067 char raw_value[8]; /* Value as uninterpreted bytes. */
1068 union value value; /* Value. */
1069 char *label; /* Null-terminated label string. */
1072 struct label *labels = NULL;
1073 int label_cnt; /* Number of labels. */
1075 struct variable **var = NULL; /* Associated variables. */
1076 int var_cnt; /* Number of associated variables. */
1080 subpool = pool_create_subpool (r->pool);
1082 /* Read the type 3 record and record its contents. We can't do
1083 much with the data yet because we don't know whether it is
1084 of numeric or string type. */
1086 /* Read number of labels. */
1087 label_cnt = read_int (r);
1089 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1091 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1096 /* Read each value/label tuple into labels[]. */
1097 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1098 for (i = 0; i < label_cnt; i++)
1100 struct label *label = labels + i;
1101 unsigned char label_len;
1105 read_bytes (r, label->raw_value, sizeof label->raw_value);
1107 /* Read label length. */
1108 read_bytes (r, &label_len, sizeof label_len);
1109 padded_len = ROUND_UP (label_len + 1, 8);
1111 /* Read label, padding. */
1112 label->label = pool_alloc (subpool, padded_len + 1);
1113 read_bytes (r, label->label, padded_len - 1);
1114 label->label[label_len] = 0;
1117 /* Now, read the type 4 record that has the list of variables
1118 to which the value labels are to be applied. */
1120 /* Read record type of type 4 record. */
1121 if (read_int (r) != 4)
1122 sys_error (r, _("Variable index record (type 4) does not immediately "
1123 "follow value label record (type 3) as it should."));
1125 /* Read number of variables associated with value label from type 4
1127 var_cnt = read_int (r);
1128 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1129 sys_error (r, _("Number of variables associated with a value label (%d) "
1130 "is not between 1 and the number of variables (%zu)."),
1131 var_cnt, dict_get_var_cnt (dict));
1133 /* Read the list of variables. */
1134 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1135 for (i = 0; i < var_cnt; i++)
1137 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1138 if (var_is_long_string (var[i]))
1139 sys_error (r, _("Value labels are not allowed on long string "
1140 "variables (%s)."), var_get_name (var[i]));
1143 /* Type check the variables. */
1144 for (i = 1; i < var_cnt; i++)
1145 if (var_get_type (var[i]) != var_get_type (var[0]))
1146 sys_error (r, _("Variables associated with value label are not all of "
1147 "identical type. Variable %s is %s, but variable "
1149 var_get_name (var[0]),
1150 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1151 var_get_name (var[i]),
1152 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1154 /* Fill in labels[].value, now that we know the desired type. */
1155 for (i = 0; i < label_cnt; i++)
1157 struct label *label = labels + i;
1159 if (var_is_alpha (var[0]))
1160 buf_copy_rpad (label->value.s, sizeof label->value.s,
1161 label->raw_value, sizeof label->raw_value);
1163 label->value.f = float_get_double (r->float_format, label->raw_value);
1166 /* Assign the `value_label's to each variable. */
1167 for (i = 0; i < var_cnt; i++)
1169 struct variable *v = var[i];
1172 /* Add each label to the variable. */
1173 for (j = 0; j < label_cnt; j++)
1175 struct label *label = &labels[j];
1176 if (!var_add_value_label (v, &label->value, label->label))
1178 if (var_is_numeric (var[0]))
1179 sys_warn (r, _("Duplicate value label for %g on %s."),
1180 label->value.f, var_get_name (v));
1182 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1183 var_get_width (v), label->value.s,
1189 pool_destroy (subpool);
/* Forward declarations for the case-reading helpers that are defined
   further down in this file. */
1194 static void partial_record (struct sfm_reader *r) /* NOTE(review): no terminating ';' is visible in this excerpt. */
1197 static void read_error (struct casereader *, const struct sfm_reader *);
1199 static bool read_case_number (struct sfm_reader *, double *);
1200 static bool read_case_string (struct sfm_reader *, char *, size_t);
1201 static int read_opcode (struct sfm_reader *); /* Next compression opcode. */
1202 static bool read_compressed_number (struct sfm_reader *, double *);
1203 static bool read_compressed_string (struct sfm_reader *, char *); /* One 8-byte segment. */
1204 static bool read_whole_strings (struct sfm_reader *, char *, size_t); /* Length is a multiple of 8. */
1205 static bool skip_whole_strings (struct sfm_reader *, size_t); /* Length is a multiple of 8. */
1207 /* Reads one case from READER's file into C. Returns true only
   NOTE(review): the end of the sentence above is not present in this
   excerpt; confirm the return contract against the full file.

   R_ is used as the struct sfm_reader for the file.  C is created with
   r->value_cnt values, and then one value is read for each entry of
   r->sfm_vars. */
1210 sys_file_casereader_read (struct casereader *reader, void *r_,
1213 struct sfm_reader *r = r_;
1219 case_create (c, r->value_cnt);
1220 if (setjmp (r->bail_out)) /* Nonzero here means sys_error() did longjmp (r->bail_out, 1). */
1222 casereader_force_error (reader);
1227 for (i = 0; i < r->sfm_var_cnt; i++)
1229 struct sfm_var *sv = &r->sfm_vars[i];
1230 union value *v = case_data_rw_idx (c, sv->case_index); /* Destination value inside C. */
1234 if (!read_case_number (r, &v->f)) /* Numeric data goes into the double member. */
1239 if (!read_case_string (r, v->s + sv->offset, sv->width)) /* String data: SV->width bytes at SV->offset. */
1241 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8))) /* Discard the whole 8-byte segments of padding. */
1251 if (r->case_cnt != -1) /* NOTE(review): presumably -1 means the case count is unknown; the enclosing branch is not visible here. */
1252 read_error (reader, r);
1256 /* Issues an error that R ends in a partial record.
   The error is raised through sys_error(), which leaves via
   longjmp to r->bail_out rather than returning here. */
1258 partial_record (struct sfm_reader *r)
1260 sys_error (r, _("File ends in partial case."));
1263 /* Reports that an unspecified error occurred while reading a case
   from SFM's file, then calls casereader_force_error() on R.
   NOTE(review): the original comment is cut off after "and" in this
   excerpt, so the second half is described from the code below. */
1266 read_error (struct casereader *r, const struct sfm_reader *sfm)
1268 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh)); /* The message names the file handle. */
1269 casereader_force_error (r);
1272 /* Reads a number from R and stores its value in *D.
1273 If R is compressed, reads a compressed number;
1274 otherwise, reads a number in the regular way.
1275 Returns true if successful, false if end of file is
1276 reached immediately. */
1278 read_case_number (struct sfm_reader *r, double *d)
1283 if (!try_read_bytes (r, number, sizeof number)) /* NUMBER holds the raw on-disk bytes; its declaration is not in this excerpt. */
1285 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d); /* Convert from the file's float format into *D. */
1289 return read_compressed_number (r, d); /* Compressed file: the opcode stream decides how the value is stored. */
1292 /* Reads LENGTH string bytes from R into S.
1293 Always reads a multiple of 8 bytes; if LENGTH is not a
1294 multiple of 8, then extra bytes are read and discarded without
   being copied into S.
1296 Reads compressed strings if R is compressed.
1297 Returns true if successful, false if end of file is
1298 reached immediately. */
1300 read_case_string (struct sfm_reader *r, char *s, size_t length)
1302 size_t whole = ROUND_DOWN (length, 8); /* Bytes that fill complete 8-byte segments. */
1303 size_t partial = length % 8; /* Bytes used in the final, partly filled segment. */
1307 if (!read_whole_strings (r, s, whole)) /* Complete segments go straight into S. */
1314 if (!read_whole_strings (r, bounce, sizeof bounce)) /* The last segment is read in full into a bounce buffer. */
1320 memcpy (s + whole, bounce, partial); /* Keep only the PARTIAL bytes that belong to S. */
1326 /* Reads and returns the next compression opcode from R.
   R must be a compressed file (asserted below).  Opcodes are consumed
   from the r->opcodes buffer, which is refilled from the file once
   r->opcode_idx reaches its end.
   NOTE(review): the value returned at end of file is not visible in
   this excerpt. */
1328 read_opcode (struct sfm_reader *r)
1330 assert (r->compressed);
1334 if (r->opcode_idx >= sizeof r->opcodes) /* Every buffered opcode has been used. */
1336 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes)) /* Refill the whole opcode buffer; false means immediate EOF. */
1340 opcode = r->opcodes[r->opcode_idx++];
1347 /* Reads a compressed number from R and stores its value in D.
1348 Returns true if successful, false if end of file is
1349 reached immediately.
   NOTE(review): the opcode values that select each of the cases below
   are not visible in this excerpt. */
1351 read_compressed_number (struct sfm_reader *r, double *d)
1353 int opcode = read_opcode (r);
1361 *d = read_float (r); /* The value is stored literally in the file as a 64-bit float. */
1365 sys_error (r, _("Compressed data is corrupt.")); /* Does not return: sys_error() leaves via longjmp. */
1372 *d = opcode - r->bias; /* The value is encoded in the opcode itself, offset by r->bias. */
1379 /* Reads a compressed 8-byte string segment from R and stores it
   in DST.
1381 Returns true if successful, false if end of file is
1382 reached immediately.
   NOTE(review): the opcode values that select each of the cases below
   are not visible in this excerpt. */
1384 read_compressed_string (struct sfm_reader *r, char *dst)
1386 switch (read_opcode (r))
1393 read_bytes (r, dst, 8); /* The segment is stored literally: copy 8 raw bytes. */
1397 memset (dst, ' ', 8); /* The segment is entirely spaces. */
1401 sys_error (r, _("Compressed data is corrupt.")); /* Does not return: sys_error() leaves via longjmp. */
1407 /* Reads LENGTH string bytes from R into S.
1408 LENGTH must be a multiple of 8.
1409 Reads compressed strings if R is compressed.
1410 Returns true if successful, false if end of file is
1411 reached immediately. */
1413 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1415 assert (length % 8 == 0);
1417 return try_read_bytes (r, s, length); /* Direct read of all LENGTH bytes at once. */
1421 for (ofs = 0; ofs < length; ofs += 8) /* Otherwise decode one 8-byte segment at a time. */
1422 if (!read_compressed_string (r, s + ofs))
1432 /* Skips LENGTH string bytes from R.
1433 LENGTH must be a multiple of 8.
1434 (LENGTH is also limited to 1024, but that's only because the
1435 current caller never needs more than that many bytes.)
1436 Returns true if successful, false if end of file is
1437 reached immediately.
   The bytes are read into a scratch buffer through
   read_whole_strings() and then simply ignored. */
1439 skip_whole_strings (struct sfm_reader *r, size_t length)
1442 assert (length < sizeof buffer); /* NOTE(review): strict '<' rejects LENGTH equal to the buffer size; BUFFER's declaration is not in this excerpt. */
1443 return read_whole_strings (r, buffer, length);
1446 /* Creates and returns a table that can be used for translating a value
1447 index into a case to a "struct variable *" for DICT. Multiple
1448 system file fields reference variables this way.
1450 This table must be created before processing the very long
1451 string extension record, because that record causes some
1452 values to be deleted from the case and the dictionary to be
   changed (NOTE(review): the end of this sentence is not present in
   this excerpt).

   The table has r->oct_cnt entries and is allocated from r->pool.  A
   variable that occupies NV values has its pointer stored in the first
   of its slots and a null pointer in the remaining NV - 1 slots. */
1454 static struct variable **
1455 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1457 struct variable **var_by_value_idx;
1461 var_by_value_idx = pool_nmalloc (r->pool,
1462 r->oct_cnt, sizeof *var_by_value_idx);
1463 for (i = 0; i < dict_get_var_cnt (dict); i++)
1465 struct variable *v = dict_get_var (dict, i);
1466 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8); /* Slots used: 1 if numeric, else one per 8 bytes of width, rounded up. */
1469 var_by_value_idx[value_idx++] = v;
1470 for (j = 1; j < nv; j++)
1471 var_by_value_idx[value_idx++] = NULL; /* Continuation slots of a string variable. */
1473 assert (value_idx == r->oct_cnt); /* Every slot of the table must have been filled. */
1475 return var_by_value_idx;
1478 /* Returns the "struct variable" corresponding to the given
1479 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
   lies in the range 1...r->oct_cnt and raises a sys_error() if it does
   not. */
1481 static struct variable *
1482 lookup_var_by_value_idx (struct sfm_reader *r,
1483 struct variable **var_by_value_idx, int value_idx)
1485 struct variable *var;
1487 if (value_idx < 1 || value_idx > r->oct_cnt)
1488 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1489 value_idx, r->oct_cnt);
1491 var = var_by_value_idx[value_idx - 1]; /* Convert the 1-based index to a 0-based table slot. */
1493 sys_error (r, _("Variable index %d refers to long string " /* NOTE(review): presumably reached when the slot is null; the guarding test is not in this excerpt. */
1500 /* Returns the variable in D with the given SHORT_NAME,
1501 or a null pointer if there is none.
   Short names are compared with strcasecmp(), i.e. without regard to
   case, and only short name number 0 of each variable is examined. */
1502 static struct variable *
1503 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1505 struct variable *var;
1509 /* First try looking up by full name. This often succeeds. */
1510 var = dict_lookup_var (d, short_name);
1511 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name)) /* A full-name hit counts only if its short name matches too. */
1514 /* Iterate through the whole dictionary as a fallback. */
1515 var_cnt = dict_get_var_cnt (d);
1516 for (i = 0; i < var_cnt; i++)
1518 var = dict_get_var (d, i);
1519 if (!strcasecmp (var_get_short_name (var, 0), short_name))
1526 /* Helpers for reading records that contain "variable=value"
   pairs. */
1530 struct variable_to_value_map
1532 struct substring buffer; /* Record contents. */
1533 size_t pos; /* Current position in buffer. */
1536 /* Reads SIZE bytes into a "variable=value" map for R,
1537 and returns the map.
   Both the map and its buffer are allocated from r->pool. */
1538 static struct variable_to_value_map *
1539 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1541 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1542 char *buffer = pool_malloc (r->pool, size + 1); /* One spare byte; presumably so a terminator can be written after the final token -- confirm. */
1543 read_bytes (r, buffer, size);
1544 map->buffer = ss_buffer (buffer, size); /* The substring covers SIZE bytes only, not the spare byte. */
1549 /* Closes MAP and frees its storage.
1550 Not really needed, because the pool will free the map anyway,
1551 but can be used to free it earlier.
   As far as this excerpt shows, only the record buffer is returned to
   r->pool here. */
1553 close_variable_to_value_map (struct sfm_reader *r,
1554 struct variable_to_value_map *map)
1556 pool_free (r->pool, ss_data (map->buffer));
1559 /* Reads the next variable=value pair from MAP.
1560 Looks up the variable in DICT and stores it into *VAR.
1561 Stores a null-terminated value into *VALUE.
   The name is the text up to '='; the value is the text up to a tab
   or a null byte.  Both are terminated in place, so MAP's buffer is
   modified and *VALUE points into that buffer rather than at a copy.
   *WARNING_CNT counts pairs that name an unknown variable; only the
   first few are reported individually. */
1563 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1564 struct variable_to_value_map *map,
1565 struct variable **var, char **value,
1568 int max_warnings = 5; /* Unknown-variable warnings shown before suppression starts. */
1572 struct substring short_name_ss, value_ss;
1574 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1575 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1578 if (*warning_cnt > max_warnings) /* No further pair: summarise the warnings that were not shown. */
1579 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1580 *warning_cnt - max_warnings);
1584 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX), /* Step over any run of tab or null separators. */
1585 ss_buffer ("\t\0", 2));
1587 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0'; /* Terminate the name in place. */
1588 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1591 if (++*warning_cnt <= max_warnings) /* Always count, but warn only for the first MAX_WARNINGS. */
1592 sys_warn (r, _("Variable map refers to unknown variable %s."),
1593 ss_data (short_name_ss));
1597 ss_data (value_ss)[ss_length (value_ss)] = '\0'; /* Terminate the value in place. */
1598 *value = ss_data (value_ss);
1606 /* Displays a corruption message of message class CLASS for R.
   The text starts with R's file name and the current file offset as
   reported by ftell(), followed by FORMAT expanded with ARGS. */
1608 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1613 ds_init_empty (&text);
1614 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1615 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1616 ds_put_vformat (&text, format, args);
1618 m.category = msg_class_to_category (class);
1619 m.severity = msg_class_to_severity (class);
1620 m.where.file_name = NULL; /* No file name or line number is attached as a location; */
1621 m.where.line_number = 0; /* the data file's position is already part of the text. */
1622 m.text = ds_cstr (&text);
1627 /* Displays a warning for the current file position.
   FORMAT and the arguments after it are printf-style; they are passed
   on to sys_msg() with message class MW. */
1629 sys_warn (struct sfm_reader *r, const char *format, ...)
1633 va_start (args, format);
1634 sys_msg (r, MW, format, args);
1638 /* Displays an error for the current file position,
1639 marks it as in an error state,
1640 and aborts reading it using longjmp.
   FORMAT and the arguments after it are printf-style; they are passed
   on to sys_msg() with message class ME.  Control never returns to the
   caller: it resumes at the setjmp() on r->bail_out. */
1642 sys_error (struct sfm_reader *r, const char *format, ...)
1646 va_start (args, format);
1647 sys_msg (r, ME, format, args);
1651 longjmp (r->bail_out, 1); /* setjmp() on r->bail_out then returns 1. */
1654 /* Reads BYTE_CNT bytes into BUF.
1655 Returns true if exactly BYTE_CNT bytes are successfully read.
1656 Aborts if an I/O error or a partial read occurs.
1657 If EOF_IS_OK, then an immediate end-of-file causes false to be
1658 returned; otherwise, immediate end-of-file causes an abort
   as well.  "Abort" here means a call to sys_error(). */
1661 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1662 void *buf, size_t byte_cnt)
1664 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1665 if (bytes_read == byte_cnt)
1667 else if (ferror (r->file)) /* Short read caused by an I/O error. */
1668 sys_error (r, _("System error: %s."), strerror (errno));
1669 else if (!eof_is_ok || bytes_read != 0) /* EOF is tolerated only when allowed and nothing at all was read. */
1670 sys_error (r, _("Unexpected end of file."));
1675 /* Reads BYTE_CNT bytes into BUF.
1676 Aborts upon I/O error or if end-of-file is encountered. */
1678 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1680 read_bytes_internal (r, false, buf, byte_cnt); /* eof_is_ok = false: even an immediate EOF is an error. */
1683 /* Reads BYTE_CNT bytes into BUF.
1684 Returns true if exactly BYTE_CNT bytes are successfully read.
1685 Returns false if an immediate end-of-file is encountered.
1686 Aborts if an I/O error or a partial read occurs. */
1688 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1690 return read_bytes_internal (r, true, buf, byte_cnt); /* eof_is_ok = true. */
1693 /* Reads a 32-bit signed integer from R and returns its value in
   host format.  The bytes are decoded according to
   r->integer_format.  Aborts, through read_bytes(), on I/O error or
   end of file. */
1696 read_int (struct sfm_reader *r)
1699 read_bytes (r, integer, sizeof integer); /* INTEGER is the raw byte buffer; its declaration is not in this excerpt. */
1700 return integer_get (r->integer_format, integer, sizeof integer);
1703 /* Reads a 64-bit floating-point number from R and returns its
1704 value in host format.
   The bytes are decoded according to r->float_format.  Aborts,
   through read_bytes(), on I/O error or end of file. */
1706 read_float (struct sfm_reader *r)
1709 read_bytes (r, number, sizeof number); /* NUMBER is the raw byte buffer; its declaration is not in this excerpt. */
1710 return float_get_double (r->float_format, number);
1713 /* Reads exactly SIZE - 1 bytes into BUFFER
1714 and stores a null byte into BUFFER[SIZE - 1].
   SIZE is therefore the full capacity of BUFFER, terminator included,
   and must be at least 1.
   NOTE(review): no guard against SIZE == 0 is visible in this excerpt;
   confirm one exists in the full file. */
1716 read_string (struct sfm_reader *r, char *buffer, size_t size)
1719 read_bytes (r, buffer, size - 1);
1720 buffer[size - 1] = '\0';
1723 /* Skips BYTES bytes forward in R.
   The bytes are read into a scratch buffer and ignored, in chunks no
   larger than that buffer.  Aborts, through read_bytes(), on I/O error
   or end of file. */
1725 skip_bytes (struct sfm_reader *r, size_t bytes)
1730 size_t chunk = MIN (sizeof buffer, bytes); /* Never read more than the scratch buffer can hold. */
1731 read_bytes (r, buffer, chunk);
1736 static const struct casereader_class sys_file_casereader_class =
1738 sys_file_casereader_read,
1739 sys_file_casereader_destroy,