1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/alloc.h>
29 #include <libpspp/assertion.h>
30 #include <libpspp/message.h>
31 #include <libpspp/compiler.h>
32 #include <libpspp/magic.h>
33 #include <libpspp/misc.h>
34 #include <libpspp/pool.h>
35 #include <libpspp/str.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/array.h>
39 #include <data/case.h>
40 #include <data/casereader-provider.h>
41 #include <data/casereader.h>
42 #include <data/dictionary.h>
43 #include <data/file-handle-def.h>
44 #include <data/file-name.h>
45 #include <data/format.h>
46 #include <data/missing-values.h>
47 #include <data/value-labels.h>
48 #include <data/variable.h>
49 #include <data/value.h>
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
72 size_t value_cnt; /* Number of "union value"s in struct case. */
75 enum integer_format integer_format; /* On-disk integer format. */
76 enum float_format float_format; /* On-disk floating point format. */
77 int oct_cnt; /* Number of 8-byte units per case. */
78 struct sfm_var *sfm_vars; /* Variables. */
79 size_t sfm_var_cnt; /* Number of variables. */
80 casenumber case_cnt; /* Number of cases */
81 bool has_long_var_names; /* File has a long variable name map */
84 bool compressed; /* File is compressed? */
85 double bias; /* Compression bias, usually 100.0. */
86 uint8_t opcodes[8]; /* Current block of opcodes. */
87 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
90 static struct casereader_class sys_file_casereader_class;
92 static bool close_reader (struct sfm_reader *);
94 static struct variable **make_var_by_value_idx (struct sfm_reader *,
96 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
100 static void sys_warn (struct sfm_reader *, const char *, ...)
101 PRINTF_FORMAT (2, 3);
103 static void sys_error (struct sfm_reader *, const char *, ...)
107 static void read_bytes (struct sfm_reader *, void *, size_t);
108 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
109 static int read_int (struct sfm_reader *);
110 static double read_float (struct sfm_reader *);
111 static void read_string (struct sfm_reader *, char *, size_t);
112 static void skip_bytes (struct sfm_reader *, size_t);
114 static struct variable_to_value_map *open_variable_to_value_map (
115 struct sfm_reader *, size_t size);
116 static void close_variable_to_value_map (struct sfm_reader *r,
117 struct variable_to_value_map *);
118 static bool read_variable_to_value_map (struct sfm_reader *,
120 struct variable_to_value_map *,
121 struct variable **var, char **value,
124 static bool close_reader (struct sfm_reader *r);
126 /* Dictionary reader. */
134 static void read_header (struct sfm_reader *, struct dictionary *,
135 int *weight_idx, int *claimed_oct_cnt,
136 struct sfm_read_info *);
137 static void read_variable_record (struct sfm_reader *, struct dictionary *,
138 int *format_warning_cnt);
139 static void parse_format_spec (struct sfm_reader *, unsigned int,
140 enum which_format, struct variable *,
141 int *format_warning_cnt);
142 static void setup_weight (struct sfm_reader *, int weight_idx,
143 struct variable **var_by_value_idx,
144 struct dictionary *);
145 static void read_documents (struct sfm_reader *, struct dictionary *);
146 static void read_value_labels (struct sfm_reader *, struct dictionary *,
147 struct variable **var_by_value_idx);
149 static void read_extension_record (struct sfm_reader *, struct dictionary *,
150 struct sfm_read_info *);
151 static void read_machine_integer_info (struct sfm_reader *,
152 size_t size, size_t count,
153 struct sfm_read_info *);
154 static void read_machine_float_info (struct sfm_reader *,
155 size_t size, size_t count);
156 static void read_display_parameters (struct sfm_reader *,
157 size_t size, size_t count,
158 struct dictionary *);
159 static void read_long_var_name_map (struct sfm_reader *,
160 size_t size, size_t count,
161 struct dictionary *);
162 static void read_long_string_map (struct sfm_reader *,
163 size_t size, size_t count,
164 struct dictionary *);
167 /* Opens the system file designated by file handle FH for
168 reading. Reads the system file's dictionary into *DICT.
169 If INFO is non-null, then it receives additional info about the
172 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
173 struct sfm_read_info *volatile info)
175 struct sfm_reader *volatile r = NULL;
176 struct variable **var_by_value_idx;
177 struct sfm_read_info local_info;
178 int format_warning_cnt = 0;
183 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
186 *dict = dict_create ();
188 /* Create and initialize reader. */
189 r = pool_create_container (struct sfm_reader, pool);
191 r->file = fn_open (fh_get_file_name (fh), "rb");
194 r->has_long_var_names = false;
195 r->opcode_idx = sizeof r->opcodes;
197 /* Initialize info. */
200 memset (info, 0, sizeof *info);
202 if (setjmp (r->bail_out))
205 dict_destroy (*dict);
212 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
213 fh_get_file_name (r->fh), strerror (errno));
214 longjmp (r->bail_out, 1);
218 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
220 /* Read all the variable definition records. */
221 rec_type = read_int (r);
222 while (rec_type == 2)
224 read_variable_record (r, *dict, &format_warning_cnt);
225 rec_type = read_int (r);
228 /* Figure out the case format. */
229 var_by_value_idx = make_var_by_value_idx (r, *dict);
230 setup_weight (r, weight_idx, var_by_value_idx, *dict);
232 /* Read all the rest of the dictionary records. */
233 while (rec_type != 999)
238 read_value_labels (r, *dict, var_by_value_idx);
242 sys_error (r, _("Misplaced type 4 record."));
245 read_documents (r, *dict);
249 read_extension_record (r, *dict, info);
253 sys_error (r, _("Unrecognized record type %d."), rec_type);
255 rec_type = read_int (r);
259 if ( ! r->has_long_var_names )
262 for (i = 0; i < dict_get_var_cnt (*dict); i++)
264 struct variable *var = dict_get_var (*dict, i);
265 char short_name [SHORT_NAME_LEN + 1];
266 char long_name [SHORT_NAME_LEN + 1];
268 strcpy (short_name, var_get_name (var));
270 strcpy (long_name, short_name);
271 str_lowercase (long_name);
273 /* Set long name. Renaming a variable may clear the short
274 name, but we want to retain it, so re-set it
276 dict_rename_var (*dict, var, long_name);
277 var_set_short_name (var, 0, short_name);
280 r->has_long_var_names = true;
283 /* Read record 999 data, which is just filler. */
286 /* Warn if the actual amount of data per case differs from the
287 amount that the header claims. SPSS version 13 gets this
288 wrong when very long strings are involved, so don't warn in
290 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
291 && info->version_major != 13)
292 sys_warn (r, _("File header claims %d variable positions but "
293 "%d were read from file."),
294 claimed_oct_cnt, r->oct_cnt);
296 /* Create an index of dictionary variable widths for
297 sfm_read_case to use. We cannot use the `struct variable's
298 from the dictionary we created, because the caller owns the
299 dictionary and may destroy or modify its variables. */
300 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
301 pool_register (r->pool, free, r->sfm_vars);
303 pool_free (r->pool, var_by_value_idx);
304 r->value_cnt = dict_get_next_value_idx (*dict);
305 return casereader_create_sequential
307 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
308 &sys_file_casereader_class, r);
311 /* Closes a system file after we're done with it.
312 Returns true if an I/O error has occurred on READER, false
315 close_reader (struct sfm_reader *r)
324 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
326 msg (ME, _("Error closing system file \"%s\": %s."),
327 fh_get_file_name (r->fh), strerror (errno));
334 fh_close (r->fh, "system file", "rs");
337 pool_destroy (r->pool);
342 /* Destroys READER. */
344 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
346 struct sfm_reader *r = r_;
/* Returns true if FILE is an SPSS system file,
   false otherwise.

   Reads four bytes from FILE's current position and compares them
   with "$FL2", the record type that begins a system file.  A read
   that yields fewer than four bytes (short file or I/O error) is
   treated as "not a system file".  The stream position is left
   just past whatever was read. */
bool
sfm_detect (FILE *file)
{
  char rec_type[4];

  if (fread (rec_type, sizeof rec_type, 1, file) != 1)
    return false;

  /* Compare exactly the bytes that were read: the buffer holds raw
     file data, not a null-terminated string. */
  return memcmp (rec_type, "$FL2", sizeof rec_type) == 0;
}
364 /* Reads the global header of the system file.
365 Sets DICT's file label to the system file's label.
366 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
367 or to the value index of the weight variable otherwise.
368 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
369 per case that the file claims to have (although it is not
371 Initializes INFO with header information. */
373 read_header (struct sfm_reader *r, struct dictionary *dict,
374 int *weight_idx, int *claimed_oct_cnt,
375 struct sfm_read_info *info)
378 char eye_catcher[61];
379 uint8_t raw_layout_code[4];
381 char creation_date[10];
382 char creation_time[9];
384 struct substring file_label_ss;
385 struct substring product;
387 read_string (r, rec_type, sizeof rec_type);
388 read_string (r, eye_catcher, sizeof eye_catcher);
390 if (strcmp ("$FL2", rec_type) != 0)
391 sys_error (r, _("This is not an SPSS system file."));
393 /* Identify integer format. */
394 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
395 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
397 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
399 || (r->integer_format != INTEGER_MSB_FIRST
400 && r->integer_format != INTEGER_LSB_FIRST))
401 sys_error (r, _("This is not an SPSS system file."));
403 *claimed_oct_cnt = read_int (r);
404 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
405 *claimed_oct_cnt = -1;
407 r->compressed = read_int (r) != 0;
409 *weight_idx = read_int (r);
411 r->case_cnt = read_int (r);
412 if ( r->case_cnt > INT_MAX / 2)
416 /* Identify floating-point format and obtain compression bias. */
417 read_bytes (r, raw_bias, sizeof raw_bias);
418 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
420 sys_warn (r, _("Compression bias (%g) is not the usual "
421 "value of 100, or system file uses unrecognized "
422 "floating-point format."),
424 if (r->integer_format == INTEGER_MSB_FIRST)
425 r->float_format = FLOAT_IEEE_DOUBLE_BE;
427 r->float_format = FLOAT_IEEE_DOUBLE_LE;
429 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
431 read_string (r, creation_date, sizeof creation_date);
432 read_string (r, creation_time, sizeof creation_time);
433 read_string (r, file_label, sizeof file_label);
436 file_label_ss = ss_cstr (file_label);
437 ss_trim (&file_label_ss, ss_cstr (" "));
438 if (!ss_is_empty (file_label_ss))
440 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
441 dict_set_label (dict, ss_data (file_label_ss));
444 strcpy (info->creation_date, creation_date);
445 strcpy (info->creation_time, creation_time);
446 info->integer_format = r->integer_format;
447 info->float_format = r->float_format;
448 info->compressed = r->compressed;
449 info->case_cnt = r->case_cnt;
451 product = ss_cstr (eye_catcher);
452 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
453 ss_trim (&product, ss_cstr (" "));
454 str_copy_buf_trunc (info->product, sizeof info->product,
455 ss_data (product), ss_length (product));
458 /* Reads a variable (type 2) record from R and adds the
459 corresponding variable to DICT.
460 Also skips past additional variable records for long string
463 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
464 int *format_warning_cnt)
467 int has_variable_label;
468 int missing_value_code;
473 struct variable *var;
476 width = read_int (r);
477 has_variable_label = read_int (r);
478 missing_value_code = read_int (r);
479 print_format = read_int (r);
480 write_format = read_int (r);
481 read_string (r, name, sizeof name);
482 name[strcspn (name, " ")] = '\0';
484 /* Check variable name. */
485 if (name[0] == '$' || name[0] == '#')
486 sys_error (r, "Variable name begins with invalid character `%c'.",
488 if (!var_is_plausible_name (name, false))
489 sys_error (r, _("Invalid variable name `%s'."), name);
491 /* Create variable. */
492 if (width < 0 || width > 255)
493 sys_error (r, _("Bad variable width %d."), width);
494 var = dict_create_var (dict, name, width);
497 _("Duplicate variable name `%s' within system file."),
500 /* Set the short name the same as the long name. */
501 var_set_short_name (var, 0, var_get_name (var));
503 /* Get variable label, if any. */
504 if (has_variable_label != 0 && has_variable_label != 1)
505 sys_error (r, _("Variable label indicator field is not 0 or 1."));
506 if (has_variable_label == 1)
512 if (len >= sizeof label)
513 sys_error (r, _("Variable %s has label of invalid length %u."),
514 name, (unsigned int) len);
515 read_string (r, label, len + 1);
516 var_set_label (var, label);
518 skip_bytes (r, ROUND_UP (len, 4) - len);
521 /* Set missing values. */
522 if (missing_value_code != 0)
524 struct missing_values mv;
527 mv_init (&mv, var_get_width (var));
528 if (var_is_numeric (var))
530 if (missing_value_code < -3 || missing_value_code > 3
531 || missing_value_code == -1)
532 sys_error (r, _("Numeric missing value indicator field is not "
533 "-3, -2, 0, 1, 2, or 3."));
534 if (missing_value_code < 0)
536 double low = read_float (r);
537 double high = read_float (r);
538 mv_add_num_range (&mv, low, high);
539 missing_value_code = -missing_value_code - 2;
541 for (i = 0; i < missing_value_code; i++)
542 mv_add_num (&mv, read_float (r));
544 else if (var_get_width (var) <= MAX_SHORT_STRING)
546 if (missing_value_code < 1 || missing_value_code > 3)
547 sys_error (r, _("String missing value indicator field is not "
549 for (i = 0; i < missing_value_code; i++)
552 read_string (r, string, sizeof string);
553 mv_add_str (&mv, string);
557 sys_error (r, _("Long string variable %s may not have missing "
559 var_set_missing_values (var, &mv);
563 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
564 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
566 /* Account for values.
567 Skip long string continuation records, if any. */
568 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
574 for (i = 1; i < nv; i++)
576 /* Check for record type 2 and width -1. */
577 if (read_int (r) != 2 || read_int (r) != -1)
578 sys_error (r, _("Missing string continuation record."));
580 /* Skip and ignore remaining continuation data. */
581 has_variable_label = read_int (r);
582 missing_value_code = read_int (r);
583 print_format = read_int (r);
584 write_format = read_int (r);
585 read_string (r, name, sizeof name);
587 /* Variable label fields on continuation records have
588 been spotted in system files created by "SPSS Power
589 Macintosh Release 6.1". */
590 if (has_variable_label)
591 skip_bytes (r, ROUND_UP (read_int (r), 4));
596 /* Translates the format spec from sysfile format to internal
599 parse_format_spec (struct sfm_reader *r, unsigned int s,
600 enum which_format which, struct variable *v,
601 int *format_warning_cnt)
603 const int max_format_warnings = 8;
605 uint8_t raw_type = s >> 16;
611 if (!fmt_from_io (raw_type, &f.type))
612 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
617 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
622 if (which == PRINT_FORMAT)
623 var_set_print_format (v, &f);
625 var_set_write_format (v, &f);
627 else if (*++format_warning_cnt <= max_format_warnings)
629 char fmt_string[FMT_STRING_LEN_MAX + 1];
630 sys_warn (r, _("%s variable %s has invalid %s format %s."),
631 var_is_numeric (v) ? _("Numeric") : _("String"),
633 which == PRINT_FORMAT ? _("print") : _("write"),
634 fmt_to_string (&f, fmt_string));
636 if (*format_warning_cnt == max_format_warnings)
637 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Sets the weighting variable in DICT to the variable
   corresponding to the given 1-based WEIGHT_IDX, if WEIGHT_IDX is
   nonzero.

   A WEIGHT_IDX of 0 means the system file is unweighted, in which
   case DICT is left alone.  VAR_BY_VALUE_IDX is the table used to
   translate the value index into a variable.  It is a fatal error
   for the weighting variable to be a string variable. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  if (weight_idx != 0)
    {
      struct variable *weight_var
        = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
      if (var_is_numeric (weight_var))
        dict_set_weight (dict, weight_var);
      else
        sys_error (r, _("Weighting variable must be numeric."));
    }
}
659 /* Reads a document record, type 6, from system file R, and sets up
660 the documents and n_documents fields in the associated
663 read_documents (struct sfm_reader *r, struct dictionary *dict)
668 if (dict_get_documents (dict) != NULL)
669 sys_error (r, _("Multiple type 6 (document) records."));
671 line_cnt = read_int (r);
673 sys_error (r, _("Number of document lines (%d) "
674 "must be greater than 0."), line_cnt);
676 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
677 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
678 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
679 dict_set_documents (dict, documents);
681 sys_error (r, _("Document line contains null byte."));
682 pool_free (r->pool, documents);
685 /* Read a type 7 extension record. */
687 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
688 struct sfm_read_info *info)
690 int subtype = read_int (r);
691 size_t size = read_int (r);
692 size_t count = read_int (r);
693 size_t bytes = size * count;
695 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
696 allows an extra byte for a null terminator, used by some
697 extension processing routines. */
698 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
699 sys_error (r, "Record type 7 subtype %d too large.", subtype);
704 read_machine_integer_info (r, size, count, info);
708 read_machine_float_info (r, size, count);
712 /* Variable sets information. We don't use these yet.
713 They only apply to GUIs; see VARSETS on the APPLY
714 DICTIONARY command in SPSS documentation. */
718 /* DATE variable information. We don't use it yet, but we
723 /* Unknown purpose. */
727 read_display_parameters (r, size, count, dict);
731 read_long_var_name_map (r, size, count, dict);
735 read_long_string_map (r, size, count, dict);
739 /* New in SPSS v14? Unknown purpose. */
743 /* Text field that defines variable attributes. New in
748 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
752 skip_bytes (r, bytes);
755 /* Read record type 7, subtype 3. */
757 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
758 struct sfm_read_info *info)
760 int version_major = read_int (r);
761 int version_minor = read_int (r);
762 int version_revision = read_int (r);
763 int machine_code UNUSED = read_int (r);
764 int float_representation = read_int (r);
765 int compression_code UNUSED = read_int (r);
766 int integer_representation = read_int (r);
767 int character_code UNUSED = read_int (r);
769 int expected_float_format;
770 int expected_integer_format;
772 if (size != 4 || count != 8)
773 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
775 (unsigned int) size, (unsigned int) count);
777 /* Save version info. */
778 info->version_major = version_major;
779 info->version_minor = version_minor;
780 info->version_revision = version_revision;
782 /* Check floating point format. */
783 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
784 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
785 expected_float_format = 1;
786 else if (r->float_format == FLOAT_Z_LONG)
787 expected_float_format = 2;
788 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
789 expected_float_format = 3;
792 if (float_representation != expected_float_format)
793 sys_error (r, _("Floating-point representation indicated by "
794 "system file (%d) differs from expected (%d)."),
795 r->float_format, expected_float_format);
797 /* Check integer format. */
798 if (r->integer_format == INTEGER_MSB_FIRST)
799 expected_integer_format = 1;
800 else if (r->integer_format == INTEGER_LSB_FIRST)
801 expected_integer_format = 2;
804 if (integer_representation != expected_integer_format)
806 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
807 sys_warn (r, _("Integer format indicated by system file (%s) "
808 "differs from expected (%s)."),
809 gettext (endian[integer_representation == 1]),
810 gettext (endian[expected_integer_format == 1]));
814 /* Read record type 7, subtype 4. */
816 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
818 double sysmis = read_float (r);
819 double highest = read_float (r);
820 double lowest = read_float (r);
822 if (size != 8 || count != 3)
823 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
824 (unsigned int) size, (unsigned int) count);
826 if (sysmis != SYSMIS)
827 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
828 if (highest != HIGHEST)
829 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
830 if (lowest != LOWEST)
831 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
834 /* Read record type 7, subtype 11, which specifies how variables
835 should be displayed in GUI environments. */
837 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
838 struct dictionary *dict)
840 const size_t n_vars = count / 3 ;
844 if (count % 3 || n_vars != dict_get_var_cnt (dict))
845 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
846 (unsigned int) size, (unsigned int) count);
848 for (i = 0; i < n_vars; ++i)
850 struct variable *v = dict_get_var (dict, i);
851 int measure = read_int (r);
852 int width = read_int (r);
853 int align = read_int (r);
855 /* SPSS 14 sometimes seems to set string variables' measure
857 if (0 == measure && var_is_alpha (v))
860 /* Older versions (SPSS 9.0) sometimes set the display width
861 to zero. This causes confusion especially in the GUI */
865 if (measure < 1 || measure > 3 || align < 0 || align > 2)
868 sys_warn (r, _("Invalid variable display parameters. "
869 "Default parameters substituted."));
874 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
875 : measure == 2 ? MEASURE_ORDINAL
877 var_set_display_width (v, width);
878 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
879 : align == 1 ? ALIGN_RIGHT
884 /* Reads record type 7, subtype 13, which gives the long name
885 that corresponds to each short name. Modifies variable names
886 in DICT accordingly. */
888 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
889 struct dictionary *dict)
891 struct variable_to_value_map *map;
892 struct variable *var;
896 map = open_variable_to_value_map (r, size * count);
897 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
901 size_t short_name_cnt;
904 /* Validate long name. */
905 if (!var_is_valid_name (long_name, false))
907 sys_warn (r, _("Long variable mapping from %s to invalid "
908 "variable name `%s'."),
909 var_get_name (var), long_name);
913 /* Identify any duplicates. */
914 if (strcasecmp (var_get_short_name (var, 0), long_name)
915 && dict_lookup_var (dict, long_name) != NULL)
917 sys_warn (r, _("Duplicate long variable name `%s' "
918 "within system file."), long_name);
922 /* Renaming a variable may clear its short names, but we
923 want to retain them, so we save them and re-set them
925 short_name_cnt = var_get_short_name_cnt (var);
926 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
927 for (i = 0; i < short_name_cnt; i++)
929 const char *s = var_get_short_name (var, i);
930 short_names[i] = s != NULL ? xstrdup (s) : NULL;
934 dict_rename_var (dict, var, long_name);
936 /* Restore short names. */
937 for (i = 0; i < short_name_cnt; i++)
939 var_set_short_name (var, i, short_names[i]);
940 free (short_names[i]);
944 close_variable_to_value_map (r, map);
945 r->has_long_var_names = true;
948 /* Reads record type 7, subtype 14, which gives the real length
949 of each very long string. Rearranges DICT accordingly. */
951 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
952 struct dictionary *dict)
954 struct variable_to_value_map *map;
955 struct variable *var;
959 map = open_variable_to_value_map (r, size * count);
960 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
963 size_t idx = var_get_dict_index (var);
969 length = strtol (length_s, NULL, 10);
970 if (length < 1 || length > MAX_STRING)
972 sys_warn (r, _("%s listed as string of invalid length %s "
973 "in very length string record."),
974 var_get_name (var), length_s);
978 /* Check segments. */
979 segment_cnt = sfm_width_to_segments (length);
980 if (segment_cnt == 1)
982 sys_warn (r, _("%s listed in very long string record with width %s, "
983 "which requires only one segment."),
984 var_get_name (var), length_s);
987 if (idx + segment_cnt > dict_get_var_cnt (dict))
988 sys_error (r, _("Very long string %s overflows dictionary."),
991 /* Get the short names from the segments and check their
993 for (i = 0; i < segment_cnt; i++)
995 struct variable *seg = dict_get_var (dict, idx + i);
996 int alloc_width = sfm_segment_alloc_width (length, i);
997 int width = var_get_width (seg);
1000 var_set_short_name (var, i, var_get_short_name (seg, 0));
1001 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1002 sys_error (r, _("Very long string with width %ld has segment %d "
1003 "of width %d (expected %d)"),
1004 length, i, width, alloc_width);
1006 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1007 var_set_width (var, length);
1009 close_variable_to_value_map (r, map);
1010 dict_compact_values (dict);
1013 /* Reads value labels from sysfile H and inserts them into the
1014 associated dictionary. */
1016 read_value_labels (struct sfm_reader *r,
1017 struct dictionary *dict, struct variable **var_by_value_idx)
1019 struct pool *subpool;
1023 char raw_value[8]; /* Value as uninterpreted bytes. */
1024 union value value; /* Value. */
1025 char *label; /* Null-terminated label string. */
1028 struct label *labels = NULL;
1029 int label_cnt; /* Number of labels. */
1031 struct variable **var = NULL; /* Associated variables. */
1032 int var_cnt; /* Number of associated variables. */
1036 subpool = pool_create_subpool (r->pool);
1038 /* Read the type 3 record and record its contents. We can't do
1039 much with the data yet because we don't know whether it is
1040 of numeric or string type. */
1042 /* Read number of labels. */
1043 label_cnt = read_int (r);
1045 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1047 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1052 /* Read each value/label tuple into labels[]. */
1053 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1054 for (i = 0; i < label_cnt; i++)
1056 struct label *label = labels + i;
1057 unsigned char label_len;
1061 read_bytes (r, label->raw_value, sizeof label->raw_value);
1063 /* Read label length. */
1064 read_bytes (r, &label_len, sizeof label_len);
1065 padded_len = ROUND_UP (label_len + 1, 8);
1067 /* Read label, padding. */
1068 label->label = pool_alloc (subpool, padded_len + 1);
1069 read_bytes (r, label->label, padded_len - 1);
1070 label->label[label_len] = 0;
1073 /* Now, read the type 4 record that has the list of variables
1074 to which the value labels are to be applied. */
1076 /* Read record type of type 4 record. */
1077 if (read_int (r) != 4)
1078 sys_error (r, _("Variable index record (type 4) does not immediately "
1079 "follow value label record (type 3) as it should."));
1081 /* Read number of variables associated with value label from type 4
1083 var_cnt = read_int (r);
1084 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1085 sys_error (r, _("Number of variables associated with a value label (%d) "
1086 "is not between 1 and the number of variables (%u)."),
1087 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1089 /* Read the list of variables. */
1090 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1091 for (i = 0; i < var_cnt; i++)
1093 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1094 if (var_is_long_string (var[i]))
1095 sys_error (r, _("Value labels are not allowed on long string "
1096 "variables (%s)."), var_get_name (var[i]));
1099 /* Type check the variables. */
1100 for (i = 1; i < var_cnt; i++)
1101 if (var_get_type (var[i]) != var_get_type (var[0]))
1102 sys_error (r, _("Variables associated with value label are not all of "
1103 "identical type. Variable %s is %s, but variable "
1105 var_get_name (var[0]),
1106 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1107 var_get_name (var[i]),
1108 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1110 /* Fill in labels[].value, now that we know the desired type. */
1111 for (i = 0; i < label_cnt; i++)
1113 struct label *label = labels + i;
1115 if (var_is_alpha (var[0]))
1116 buf_copy_rpad (label->value.s, sizeof label->value.s,
1117 label->raw_value, sizeof label->raw_value);
1119 label->value.f = float_get_double (r->float_format, label->raw_value);
1122 /* Assign the `value_label's to each variable. */
1123 for (i = 0; i < var_cnt; i++)
1125 struct variable *v = var[i];
1128 /* Add each label to the variable. */
1129 for (j = 0; j < label_cnt; j++)
1131 struct label *label = &labels[j];
1132 if (!var_add_value_label (v, &label->value, label->label))
1134 if (var_is_numeric (var[0]))
1135 sys_warn (r, _("Duplicate value label for %g on %s."),
1136 label->value.f, var_get_name (v));
1138 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1139 var_get_width (v), label->value.s,
1145 pool_destroy (subpool);
1150 static void partial_record (struct sfm_reader *r)
1153 static void read_error (struct casereader *, const struct sfm_reader *);
1155 static bool read_case_number (struct sfm_reader *, double *);
1156 static bool read_case_string (struct sfm_reader *, char *, size_t);
1157 static int read_opcode (struct sfm_reader *);
1158 static bool read_compressed_number (struct sfm_reader *, double *);
1159 static bool read_compressed_string (struct sfm_reader *, char *);
1160 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1161 static bool skip_whole_strings (struct sfm_reader *, size_t);
1163 /* Reads one case from READER's file into C. Returns true only
1166 sys_file_casereader_read (struct casereader *reader, void *r_,
1169 struct sfm_reader *r = r_;
1175 case_create (c, r->value_cnt);
1176 if (setjmp (r->bail_out))
1178 casereader_force_error (reader);
1183 for (i = 0; i < r->sfm_var_cnt; i++)
1185 struct sfm_var *sv = &r->sfm_vars[i];
1186 union value *v = case_data_rw_idx (c, sv->case_index);
1190 if (!read_case_number (r, &v->f))
1195 if (!read_case_string (r, v->s + sv->offset, sv->width))
1197 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1207 if (r->case_cnt != -1)
1208 read_error (reader, r);
/* Issues an error that R ends in a partial record.

   Used when end of file is reached after part of a case, but not
   all of it, has already been read. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1219 /* Issues an error that an unspecified error occurred SFM, and
1222 read_error (struct casereader *r, const struct sfm_reader *sfm)
1224 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1225 casereader_force_error (r);
1228 /* Reads a number from R and stores its value in *D.
1229 If R is compressed, reads a compressed number;
1230 otherwise, reads a number in the regular way.
1231 Returns true if successful, false if end of file is
1232 reached immediately. */
1234 read_case_number (struct sfm_reader *r, double *d)
1239 if (!try_read_bytes (r, number, sizeof number))
1241 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1245 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.
   The file stores strings in 8-byte units, so a multiple of 8
   bytes is always consumed; when LENGTH is not a multiple of 8
   the surplus bytes of the last unit are read and discarded
   without being written to S.
   Reads compressed strings if R is compressed.
   Returns true if successful, false if end of file is reached
   immediately.

   NOTE(review): the partial-record check taken when the final
   unit is missing after earlier units were read was not visible
   in the reviewed excerpt; confirm against a pristine copy. */
static bool
read_case_string (struct sfm_reader *r, char *s, size_t length)
{
  size_t tail_len = length % 8;
  size_t body_len = length - tail_len;  /* LENGTH rounded down to 8. */
  char last_unit[8];

  if (body_len != 0 && !read_whole_strings (r, s, body_len))
    return false;

  if (tail_len == 0)
    return true;

  if (!read_whole_strings (r, last_unit, sizeof last_unit))
    {
      /* End of file after part of the string was already read. */
      if (body_len != 0)
        partial_record (r);
      return false;
    }
  memcpy (s + body_len, last_unit, tail_len);
  return true;
}
1282 /* Reads and returns the next compression opcode from R. */
1284 read_opcode (struct sfm_reader *r)
1286 assert (r->compressed);
1290 if (r->opcode_idx >= sizeof r->opcodes)
1292 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1296 opcode = r->opcodes[r->opcode_idx++];
1303 /* Reads a compressed number from R and stores its value in D.
1304 Returns true if successful, false if end of file is
1305 reached immediately. */
1307 read_compressed_number (struct sfm_reader *r, double *d)
1309 int opcode = read_opcode (r);
1317 *d = read_float (r);
1321 sys_error (r, _("Compressed data is corrupt."));
1328 *d = opcode - r->bias;
/* Reads a compressed 8-byte string segment from R and stores it
   in DST.  Depending on the opcode, the segment is 8 literal
   bytes read from the file or 8 spaces; other opcodes are
   reported as corruption.
   Returns true if successful, false if end of file is reached
   immediately.

   NOTE(review): the opcode values selecting each branch were not
   visible in the reviewed excerpt; confirm against a pristine
   copy. */
static bool
read_compressed_string (struct sfm_reader *r, char *dst)
{
  int opcode = read_opcode (r);

  if (opcode == -1 || opcode == 252)
    return false;

  if (opcode == 253)
    read_bytes (r, dst, 8);
  else if (opcode == 254)
    memset (dst, ' ', 8);
  else
    sys_error (r, _("Compressed data is corrupt."));

  return true;
}
1363 /* Reads LENGTH string bytes from R into S.
1364 LENGTH must be a multiple of 8.
1365 Reads compressed strings if S is compressed.
1366 Returns true if successful, false if end of file is
1367 reached immediately. */
1369 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1371 assert (length % 8 == 0);
1373 return try_read_bytes (r, s, length);
1377 for (ofs = 0; ofs < length; ofs += 8)
1378 if (!read_compressed_string (r, s + ofs))
/* Skips LENGTH string bytes from R by reading them into a
   scratch buffer and discarding them.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, the size of the scratch
   buffer, but that's only because the current caller never needs
   more than that many bytes.)
   Returns true if successful, false if end of file is reached
   immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  char buffer[1024];

  /* A request for exactly sizeof buffer bytes still fits, so the
     bound is inclusive, matching the documented limit. */
  assert (length <= sizeof buffer);
  return read_whole_strings (r, buffer, length);
}
1402 /* Creates and returns a table that can be used for translating a value
1403 index into a case to a "struct variable *" for DICT. Multiple
1404 system file fields reference variables this way.
1406 This table must be created before processing the very long
1407 string extension record, because that record causes some
1408 values to be deleted from the case and the dictionary to be
1410 static struct variable **
1411 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1413 struct variable **var_by_value_idx;
1417 var_by_value_idx = pool_nmalloc (r->pool,
1418 r->oct_cnt, sizeof *var_by_value_idx);
1419 for (i = 0; i < dict_get_var_cnt (dict); i++)
1421 struct variable *v = dict_get_var (dict, i);
1422 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1425 var_by_value_idx[value_idx++] = v;
1426 for (j = 1; j < nv; j++)
1427 var_by_value_idx[value_idx++] = NULL;
1429 assert (value_idx == r->oct_cnt);
1431 return var_by_value_idx;
1434 /* Returns the "struct variable" corresponding to the given
1435 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
1437 static struct variable *
1438 lookup_var_by_value_idx (struct sfm_reader *r,
1439 struct variable **var_by_value_idx, int value_idx)
1441 struct variable *var;
1443 if (value_idx < 1 || value_idx > r->oct_cnt)
1444 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1445 value_idx, r->oct_cnt);
1447 var = var_by_value_idx[value_idx - 1];
1449 sys_error (r, _("Variable index %d refers to long string "
/* Returns the variable in D whose first short name matches
   SHORT_NAME, ignoring case, or a null pointer if there is
   none. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  struct variable *candidate;
  size_t n_vars;
  size_t idx;

  /* Fast path: a variable's full name is often also its short
     name, so try a direct dictionary lookup first. */
  candidate = dict_lookup_var (d, short_name);
  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
    return candidate;

  /* Slow path: scan every variable in the dictionary. */
  n_vars = dict_get_var_cnt (d);
  for (idx = 0; idx < n_vars; idx++)
    {
      candidate = dict_get_var (d, idx);
      if (strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
        return candidate;
    }

  return NULL;
}
1482 /* Helpers for reading records that contain "variable=value"
1486 struct variable_to_value_map
1488 struct substring buffer; /* Record contents. */
1489 size_t pos; /* Current position in buffer. */
1492 /* Reads SIZE bytes into a "variable=value" map for R,
1493 and returns the map. */
1494 static struct variable_to_value_map *
1495 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1497 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1498 char *buffer = pool_malloc (r->pool, size + 1);
1499 read_bytes (r, buffer, size);
1500 map->buffer = ss_buffer (buffer, size);
1505 /* Closes MAP and frees its storage.
1506 Not really needed, because the pool will free the map anyway,
1507 but can be used to free it earlier. */
1509 close_variable_to_value_map (struct sfm_reader *r,
1510 struct variable_to_value_map *map)
1512 pool_free (r->pool, ss_data (map->buffer));
1515 /* Reads the next variable=value pair from MAP.
1516 Looks up the variable in DICT and stores it into *VAR.
1517 Stores a null-terminated value into *VALUE. */
1519 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1520 struct variable_to_value_map *map,
1521 struct variable **var, char **value,
1524 int max_warnings = 5;
1528 struct substring short_name_ss, value_ss;
1530 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1531 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1534 if (*warning_cnt > max_warnings)
1535 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1536 *warning_cnt - max_warnings);
1540 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1541 ss_buffer ("\t\0", 2));
1543 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1544 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1547 if (++*warning_cnt <= max_warnings)
1548 sys_warn (r, _("Variable map refers to unknown variable %s."),
1549 ss_data (short_name_ss));
1553 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1554 *value = ss_data (value_ss);
1562 /* Displays a corruption message. */
1564 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1569 ds_init_empty (&text);
1570 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1571 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1572 ds_put_vformat (&text, format, args);
1574 m.category = msg_class_to_category (class);
1575 m.severity = msg_class_to_severity (class);
1576 m.where.file_name = NULL;
1577 m.where.line_number = 0;
1578 m.text = ds_cstr (&text);
1583 /* Displays a warning for the current file position. */
1585 sys_warn (struct sfm_reader *r, const char *format, ...)
1589 va_start (args, format);
1590 sys_msg (r, MW, format, args);
1594 /* Displays an error for the current file position,
1595 marks it as in an error state,
1596 and aborts reading it using longjmp. */
1598 sys_error (struct sfm_reader *r, const char *format, ...)
1602 va_start (args, format);
1603 sys_msg (r, ME, format, args);
1607 longjmp (r->bail_out, 1);
1610 /* Reads BYTE_CNT bytes into BUF.
1611 Returns true if exactly BYTE_CNT bytes are successfully read.
1612 Aborts if an I/O error or a partial read occurs.
1613 If EOF_IS_OK, then an immediate end-of-file causes false to be
1614 returned; otherwise, immediate end-of-file causes an abort
1617 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1618 void *buf, size_t byte_cnt)
1620 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1621 if (bytes_read == byte_cnt)
1623 else if (ferror (r->file))
1624 sys_error (r, _("System error: %s."), strerror (errno));
1625 else if (!eof_is_ok || bytes_read != 0)
1626 sys_error (r, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   Aborts upon I/O error or if end-of-file is encountered, even
   before the first byte. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = false;

  read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
/* Attempts to read BYTE_CNT bytes from R into BUF.
   Returns true if exactly BYTE_CNT bytes are successfully read.
   Returns false if end-of-file is encountered before any byte is
   read.
   Aborts if an I/O error or a partial read occurs. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;

  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
1649 /* Reads a 32-bit signed integer from R and returns its value in
1652 read_int (struct sfm_reader *r)
1655 read_bytes (r, integer, sizeof integer);
1656 return integer_get (r->integer_format, integer, sizeof integer);
1659 /* Reads a 64-bit floating-point number from R and returns its
1660 value in host format. */
1662 read_float (struct sfm_reader *r)
1665 read_bytes (r, number, sizeof number);
1666 return float_get_double (r->float_format, number);
/* Reads exactly SIZE - 1 bytes from R into BUFFER and stores a
   null byte into BUFFER[SIZE - 1], so that BUFFER holds a C
   string.  SIZE must be positive. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t payload;

  assert (size > 0);
  payload = size - 1;
  read_bytes (r, buffer, payload);
  buffer[payload] = '\0';
}
/* Skips BYTES bytes forward in R by reading them, one scratch
   buffer's worth at a time, and discarding them.
   Aborts on I/O error or end of file. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char buffer[1024];
  size_t remaining;
  size_t chunk;

  for (remaining = bytes; remaining > 0; remaining -= chunk)
    {
      chunk = MIN (sizeof buffer, remaining);
      read_bytes (r, buffer, chunk);
    }
}
1692 static struct casereader_class sys_file_casereader_class =
1694 sys_file_casereader_read,
1695 sys_file_casereader_destroy,