1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/assertion.h>
29 #include <libpspp/message.h>
30 #include <libpspp/compiler.h>
31 #include <libpspp/misc.h>
32 #include <libpspp/pool.h>
33 #include <libpspp/str.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/array.h>
37 #include <data/case.h>
38 #include <data/casereader-provider.h>
39 #include <data/casereader.h>
40 #include <data/dictionary.h>
41 #include <data/file-handle-def.h>
42 #include <data/file-name.h>
43 #include <data/format.h>
44 #include <data/missing-values.h>
45 #include <data/value-labels.h>
46 #include <data/variable.h>
47 #include <data/value.h>
52 #include "unlocked-io.h"
57 #define _(msgid) gettext (msgid)
58 #define N_(msgid) (msgid)
60 /* System file reader. */
63 /* Resource tracking. */
64 struct pool *pool; /* All system file state. */
65 jmp_buf bail_out; /* longjmp() target for error handling. */
68 struct file_handle *fh; /* File handle. */
69 struct fh_lock *lock; /* Mutual exclusion for file handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
72 size_t value_cnt; /* Number of "union value"s in struct case. */
75 enum integer_format integer_format; /* On-disk integer format. */
76 enum float_format float_format; /* On-disk floating point format. */
77 int oct_cnt; /* Number of 8-byte units per case. */
78 struct sfm_var *sfm_vars; /* Variables. */
79 size_t sfm_var_cnt; /* Number of variables. */
80 casenumber case_cnt; /* Number of cases */
81 bool has_long_var_names; /* File has a long variable name map */
84 bool compressed; /* File is compressed? */
85 double bias; /* Compression bias, usually 100.0. */
86 uint8_t opcodes[8]; /* Current block of opcodes. */
87 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
90 static struct casereader_class sys_file_casereader_class;
92 static bool close_reader (struct sfm_reader *);
94 static struct variable **make_var_by_value_idx (struct sfm_reader *,
96 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
100 static void sys_warn (struct sfm_reader *, const char *, ...)
101 PRINTF_FORMAT (2, 3);
103 static void sys_error (struct sfm_reader *, const char *, ...)
107 static void read_bytes (struct sfm_reader *, void *, size_t);
108 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
109 static int read_int (struct sfm_reader *);
110 static double read_float (struct sfm_reader *);
111 static void read_string (struct sfm_reader *, char *, size_t);
112 static void skip_bytes (struct sfm_reader *, size_t);
114 static struct variable_to_value_map *open_variable_to_value_map (
115 struct sfm_reader *, size_t size);
116 static void close_variable_to_value_map (struct sfm_reader *r,
117 struct variable_to_value_map *);
118 static bool read_variable_to_value_map (struct sfm_reader *,
120 struct variable_to_value_map *,
121 struct variable **var, char **value,
124 static bool close_reader (struct sfm_reader *r);
126 /* Dictionary reader. */
134 static void read_header (struct sfm_reader *, struct dictionary *,
135 int *weight_idx, int *claimed_oct_cnt,
136 struct sfm_read_info *);
137 static void read_variable_record (struct sfm_reader *, struct dictionary *,
138 int *format_warning_cnt);
139 static void parse_format_spec (struct sfm_reader *, unsigned int,
140 enum which_format, struct variable *,
141 int *format_warning_cnt);
142 static void setup_weight (struct sfm_reader *, int weight_idx,
143 struct variable **var_by_value_idx,
144 struct dictionary *);
145 static void read_documents (struct sfm_reader *, struct dictionary *);
146 static void read_value_labels (struct sfm_reader *, struct dictionary *,
147 struct variable **var_by_value_idx);
149 static void read_extension_record (struct sfm_reader *, struct dictionary *,
150 struct sfm_read_info *);
151 static void read_machine_integer_info (struct sfm_reader *,
152 size_t size, size_t count,
153 struct sfm_read_info *);
154 static void read_machine_float_info (struct sfm_reader *,
155 size_t size, size_t count);
156 static void read_display_parameters (struct sfm_reader *,
157 size_t size, size_t count,
158 struct dictionary *);
159 static void read_long_var_name_map (struct sfm_reader *,
160 size_t size, size_t count,
161 struct dictionary *);
162 static void read_long_string_map (struct sfm_reader *,
163 size_t size, size_t count,
164 struct dictionary *);
167 /* Opens the system file designated by file handle FH for
168 reading. Reads the system file's dictionary into *DICT.
169 If INFO is non-null, then it receives additional info about the
172 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
173 struct sfm_read_info *volatile info)
175 struct sfm_reader *volatile r = NULL;
176 struct variable **var_by_value_idx;
177 struct sfm_read_info local_info;
178 int format_warning_cnt = 0;
183 *dict = dict_create ();
185 /* Create and initialize reader. */
186 r = pool_create_container (struct sfm_reader, pool);
192 r->has_long_var_names = false;
193 r->opcode_idx = sizeof r->opcodes;
195 r->lock = fh_lock (fh, FH_REF_FILE, "system file", FH_ACC_READ, false);
199 r->file = fn_open (fh_get_file_name (fh), "rb");
202 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
203 fh_get_file_name (r->fh), strerror (errno));
207 /* Initialize info. */
210 memset (info, 0, sizeof *info);
212 if (setjmp (r->bail_out))
217 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
219 /* Read all the variable definition records. */
220 rec_type = read_int (r);
221 while (rec_type == 2)
223 read_variable_record (r, *dict, &format_warning_cnt);
224 rec_type = read_int (r);
227 /* Figure out the case format. */
228 var_by_value_idx = make_var_by_value_idx (r, *dict);
229 setup_weight (r, weight_idx, var_by_value_idx, *dict);
231 /* Read all the rest of the dictionary records. */
232 while (rec_type != 999)
237 read_value_labels (r, *dict, var_by_value_idx);
241 sys_error (r, _("Misplaced type 4 record."));
244 read_documents (r, *dict);
248 read_extension_record (r, *dict, info);
252 sys_error (r, _("Unrecognized record type %d."), rec_type);
254 rec_type = read_int (r);
258 if ( ! r->has_long_var_names )
261 for (i = 0; i < dict_get_var_cnt (*dict); i++)
263 struct variable *var = dict_get_var (*dict, i);
264 char short_name [SHORT_NAME_LEN + 1];
265 char long_name [SHORT_NAME_LEN + 1];
267 strcpy (short_name, var_get_name (var));
269 strcpy (long_name, short_name);
270 str_lowercase (long_name);
272 /* Set long name. Renaming a variable may clear the short
273 name, but we want to retain it, so re-set it
275 dict_rename_var (*dict, var, long_name);
276 var_set_short_name (var, 0, short_name);
279 r->has_long_var_names = true;
282 /* Read record 999 data, which is just filler. */
285 /* Warn if the actual amount of data per case differs from the
286 amount that the header claims. SPSS version 13 gets this
287 wrong when very long strings are involved, so don't warn in
289 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
290 && info->version_major != 13)
291 sys_warn (r, _("File header claims %d variable positions but "
292 "%d were read from file."),
293 claimed_oct_cnt, r->oct_cnt);
295 /* Create an index of dictionary variable widths for
296 sfm_read_case to use. We cannot use the `struct variable's
297 from the dictionary we created, because the caller owns the
298 dictionary and may destroy or modify its variables. */
299 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
300 pool_register (r->pool, free, r->sfm_vars);
302 pool_free (r->pool, var_by_value_idx);
303 r->value_cnt = dict_get_next_value_idx (*dict);
304 return casereader_create_sequential
306 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
307 &sys_file_casereader_class, r);
311 dict_destroy (*dict);
316 /* Closes a system file after we're done with it.
317 Returns true if an I/O error has occurred on READER, false
320 close_reader (struct sfm_reader *r)
329 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
331 msg (ME, _("Error closing system file \"%s\": %s."),
332 fh_get_file_name (r->fh), strerror (errno));
342 pool_destroy (r->pool);
347 /* Destroys READER. */
349 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
351 struct sfm_reader *r = r_;
355 /* Returns true if FILE is an SPSS system file,
358 sfm_detect (FILE *file)
362 if (fread (rec_type, 4, 1, file) != 1)
366 return !strcmp ("$FL2", rec_type);
369 /* Reads the global header of the system file.
370 Sets DICT's file label to the system file's label.
371 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
372 or to the value index of the weight variable otherwise.
373 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
374 per case that the file claims to have (although it is not
376 Initializes INFO with header information. */
378 read_header (struct sfm_reader *r, struct dictionary *dict,
379 int *weight_idx, int *claimed_oct_cnt,
380 struct sfm_read_info *info)
383 char eye_catcher[61];
384 uint8_t raw_layout_code[4];
386 char creation_date[10];
387 char creation_time[9];
389 struct substring file_label_ss;
390 struct substring product;
392 read_string (r, rec_type, sizeof rec_type);
393 read_string (r, eye_catcher, sizeof eye_catcher);
395 if (strcmp ("$FL2", rec_type) != 0)
396 sys_error (r, _("This is not an SPSS system file."));
398 /* Identify integer format. */
399 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
400 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
402 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
404 || (r->integer_format != INTEGER_MSB_FIRST
405 && r->integer_format != INTEGER_LSB_FIRST))
406 sys_error (r, _("This is not an SPSS system file."));
408 *claimed_oct_cnt = read_int (r);
409 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
410 *claimed_oct_cnt = -1;
412 r->compressed = read_int (r) != 0;
414 *weight_idx = read_int (r);
416 r->case_cnt = read_int (r);
417 if ( r->case_cnt > INT_MAX / 2)
421 /* Identify floating-point format and obtain compression bias. */
422 read_bytes (r, raw_bias, sizeof raw_bias);
423 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
425 sys_warn (r, _("Compression bias (%g) is not the usual "
426 "value of 100, or system file uses unrecognized "
427 "floating-point format."),
429 if (r->integer_format == INTEGER_MSB_FIRST)
430 r->float_format = FLOAT_IEEE_DOUBLE_BE;
432 r->float_format = FLOAT_IEEE_DOUBLE_LE;
434 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
436 read_string (r, creation_date, sizeof creation_date);
437 read_string (r, creation_time, sizeof creation_time);
438 read_string (r, file_label, sizeof file_label);
441 file_label_ss = ss_cstr (file_label);
442 ss_trim (&file_label_ss, ss_cstr (" "));
443 if (!ss_is_empty (file_label_ss))
445 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
446 dict_set_label (dict, ss_data (file_label_ss));
449 strcpy (info->creation_date, creation_date);
450 strcpy (info->creation_time, creation_time);
451 info->integer_format = r->integer_format;
452 info->float_format = r->float_format;
453 info->compressed = r->compressed;
454 info->case_cnt = r->case_cnt;
456 product = ss_cstr (eye_catcher);
457 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
458 ss_trim (&product, ss_cstr (" "));
459 str_copy_buf_trunc (info->product, sizeof info->product,
460 ss_data (product), ss_length (product));
463 /* Reads a variable (type 2) record from R and adds the
464 corresponding variable to DICT.
465 Also skips past additional variable records for long string
468 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
469 int *format_warning_cnt)
472 int has_variable_label;
473 int missing_value_code;
478 struct variable *var;
481 width = read_int (r);
482 has_variable_label = read_int (r);
483 missing_value_code = read_int (r);
484 print_format = read_int (r);
485 write_format = read_int (r);
486 read_string (r, name, sizeof name);
487 name[strcspn (name, " ")] = '\0';
489 /* Check variable name. */
490 if (name[0] == '$' || name[0] == '#')
491 sys_error (r, "Variable name begins with invalid character `%c'.",
493 if (!var_is_plausible_name (name, false))
494 sys_error (r, _("Invalid variable name `%s'."), name);
496 /* Create variable. */
497 if (width < 0 || width > 255)
498 sys_error (r, _("Bad variable width %d."), width);
499 var = dict_create_var (dict, name, width);
502 _("Duplicate variable name `%s' within system file."),
505 /* Set the short name the same as the long name. */
506 var_set_short_name (var, 0, var_get_name (var));
508 /* Get variable label, if any. */
509 if (has_variable_label != 0 && has_variable_label != 1)
510 sys_error (r, _("Variable label indicator field is not 0 or 1."));
511 if (has_variable_label == 1)
517 if (len >= sizeof label)
518 sys_error (r, _("Variable %s has label of invalid length %zu."),
520 read_string (r, label, len + 1);
521 var_set_label (var, label);
523 skip_bytes (r, ROUND_UP (len, 4) - len);
526 /* Set missing values. */
527 if (missing_value_code != 0)
529 struct missing_values mv;
532 mv_init (&mv, var_get_width (var));
533 if (var_is_numeric (var))
535 if (missing_value_code < -3 || missing_value_code > 3
536 || missing_value_code == -1)
537 sys_error (r, _("Numeric missing value indicator field is not "
538 "-3, -2, 0, 1, 2, or 3."));
539 if (missing_value_code < 0)
541 double low = read_float (r);
542 double high = read_float (r);
543 mv_add_num_range (&mv, low, high);
544 missing_value_code = -missing_value_code - 2;
546 for (i = 0; i < missing_value_code; i++)
547 mv_add_num (&mv, read_float (r));
549 else if (var_get_width (var) <= MAX_SHORT_STRING)
551 if (missing_value_code < 1 || missing_value_code > 3)
552 sys_error (r, _("String missing value indicator field is not "
554 for (i = 0; i < missing_value_code; i++)
557 read_string (r, string, sizeof string);
558 mv_add_str (&mv, string);
562 sys_error (r, _("Long string variable %s may not have missing "
564 var_set_missing_values (var, &mv);
568 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
569 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
571 /* Account for values.
572 Skip long string continuation records, if any. */
573 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
579 for (i = 1; i < nv; i++)
581 /* Check for record type 2 and width -1. */
582 if (read_int (r) != 2 || read_int (r) != -1)
583 sys_error (r, _("Missing string continuation record."));
585 /* Skip and ignore remaining continuation data. */
586 has_variable_label = read_int (r);
587 missing_value_code = read_int (r);
588 print_format = read_int (r);
589 write_format = read_int (r);
590 read_string (r, name, sizeof name);
592 /* Variable label fields on continuation records have
593 been spotted in system files created by "SPSS Power
594 Macintosh Release 6.1". */
595 if (has_variable_label)
596 skip_bytes (r, ROUND_UP (read_int (r), 4));
601 /* Translates the format spec from sysfile format to internal
604 parse_format_spec (struct sfm_reader *r, unsigned int s,
605 enum which_format which, struct variable *v,
606 int *format_warning_cnt)
608 const int max_format_warnings = 8;
610 uint8_t raw_type = s >> 16;
616 if (!fmt_from_io (raw_type, &f.type))
617 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
622 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
627 if (which == PRINT_FORMAT)
628 var_set_print_format (v, &f);
630 var_set_write_format (v, &f);
632 else if (*++format_warning_cnt <= max_format_warnings)
634 char fmt_string[FMT_STRING_LEN_MAX + 1];
635 sys_warn (r, _("%s variable %s has invalid %s format %s."),
636 var_is_numeric (v) ? _("Numeric") : _("String"),
638 which == PRINT_FORMAT ? _("print") : _("write"),
639 fmt_to_string (&f, fmt_string));
641 if (*format_warning_cnt == max_format_warnings)
642 sys_warn (r, _("Suppressing further invalid format warnings."));
646 /* Sets the weighting variable in DICT to the variable
647 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
650 setup_weight (struct sfm_reader *r, int weight_idx,
651 struct variable **var_by_value_idx, struct dictionary *dict)
655 struct variable *weight_var
656 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
657 if (var_is_numeric (weight_var))
658 dict_set_weight (dict, weight_var);
660 sys_error (r, _("Weighting variable must be numeric."));
664 /* Reads a document record, type 6, from system file R, and sets up
665 the documents and n_documents fields in the associated
668 read_documents (struct sfm_reader *r, struct dictionary *dict)
673 if (dict_get_documents (dict) != NULL)
674 sys_error (r, _("Multiple type 6 (document) records."));
676 line_cnt = read_int (r);
678 sys_error (r, _("Number of document lines (%d) "
679 "must be greater than 0."), line_cnt);
681 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
682 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
683 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
684 dict_set_documents (dict, documents);
686 sys_error (r, _("Document line contains null byte."));
687 pool_free (r->pool, documents);
690 /* Read a type 7 extension record. */
692 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
693 struct sfm_read_info *info)
695 int subtype = read_int (r);
696 size_t size = read_int (r);
697 size_t count = read_int (r);
698 size_t bytes = size * count;
700 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
701 allows an extra byte for a null terminator, used by some
702 extension processing routines. */
703 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
704 sys_error (r, "Record type 7 subtype %d too large.", subtype);
709 read_machine_integer_info (r, size, count, info);
713 read_machine_float_info (r, size, count);
717 /* Variable sets information. We don't use these yet.
718 They only apply to GUIs; see VARSETS on the APPLY
719 DICTIONARY command in SPSS documentation. */
723 /* DATE variable information. We don't use it yet, but we
728 /* Unknown purpose. */
732 read_display_parameters (r, size, count, dict);
736 read_long_var_name_map (r, size, count, dict);
740 read_long_string_map (r, size, count, dict);
744 /* New in SPSS v14? Unknown purpose. */
748 /* Text field that defines variable attributes. New in
753 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
757 skip_bytes (r, bytes);
760 /* Read record type 7, subtype 3. */
762 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
763 struct sfm_read_info *info)
765 int version_major = read_int (r);
766 int version_minor = read_int (r);
767 int version_revision = read_int (r);
768 int machine_code UNUSED = read_int (r);
769 int float_representation = read_int (r);
770 int compression_code UNUSED = read_int (r);
771 int integer_representation = read_int (r);
772 int character_code UNUSED = read_int (r);
774 int expected_float_format;
775 int expected_integer_format;
777 if (size != 4 || count != 8)
778 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
782 /* Save version info. */
783 info->version_major = version_major;
784 info->version_minor = version_minor;
785 info->version_revision = version_revision;
787 /* Check floating point format. */
788 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
789 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
790 expected_float_format = 1;
791 else if (r->float_format == FLOAT_Z_LONG)
792 expected_float_format = 2;
793 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
794 expected_float_format = 3;
797 if (float_representation != expected_float_format)
798 sys_error (r, _("Floating-point representation indicated by "
799 "system file (%d) differs from expected (%d)."),
800 r->float_format, expected_float_format);
802 /* Check integer format. */
803 if (r->integer_format == INTEGER_MSB_FIRST)
804 expected_integer_format = 1;
805 else if (r->integer_format == INTEGER_LSB_FIRST)
806 expected_integer_format = 2;
809 if (integer_representation != expected_integer_format)
811 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
812 sys_warn (r, _("Integer format indicated by system file (%s) "
813 "differs from expected (%s)."),
814 gettext (endian[integer_representation == 1]),
815 gettext (endian[expected_integer_format == 1]));
819 /* Read record type 7, subtype 4. */
821 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
823 double sysmis = read_float (r);
824 double highest = read_float (r);
825 double lowest = read_float (r);
827 if (size != 8 || count != 3)
828 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
831 if (sysmis != SYSMIS)
832 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
833 if (highest != HIGHEST)
834 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
835 if (lowest != LOWEST)
836 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
839 /* Read record type 7, subtype 11, which specifies how variables
840 should be displayed in GUI environments. */
842 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
843 struct dictionary *dict)
845 const size_t n_vars = count / 3 ;
849 if (count % 3 || n_vars != dict_get_var_cnt (dict))
850 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 11."),
853 for (i = 0; i < n_vars; ++i)
855 struct variable *v = dict_get_var (dict, i);
856 int measure = read_int (r);
857 int width = read_int (r);
858 int align = read_int (r);
860 /* SPSS 14 sometimes seems to set string variables' measure
862 if (0 == measure && var_is_alpha (v))
865 /* Older versions (SPSS 9.0) sometimes set the display width
866 to zero. This causes confusion especially in the GUI */
870 if (measure < 1 || measure > 3 || align < 0 || align > 2)
873 sys_warn (r, _("Invalid variable display parameters. "
874 "Default parameters substituted."));
879 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
880 : measure == 2 ? MEASURE_ORDINAL
882 var_set_display_width (v, width);
883 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
884 : align == 1 ? ALIGN_RIGHT
889 /* Reads record type 7, subtype 13, which gives the long name
890 that corresponds to each short name. Modifies variable names
891 in DICT accordingly. */
893 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
894 struct dictionary *dict)
896 struct variable_to_value_map *map;
897 struct variable *var;
901 map = open_variable_to_value_map (r, size * count);
902 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
906 size_t short_name_cnt;
909 /* Validate long name. */
910 if (!var_is_valid_name (long_name, false))
912 sys_warn (r, _("Long variable mapping from %s to invalid "
913 "variable name `%s'."),
914 var_get_name (var), long_name);
918 /* Identify any duplicates. */
919 if (strcasecmp (var_get_short_name (var, 0), long_name)
920 && dict_lookup_var (dict, long_name) != NULL)
922 sys_warn (r, _("Duplicate long variable name `%s' "
923 "within system file."), long_name);
927 /* Renaming a variable may clear its short names, but we
928 want to retain them, so we save them and re-set them
930 short_name_cnt = var_get_short_name_cnt (var);
931 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
932 for (i = 0; i < short_name_cnt; i++)
934 const char *s = var_get_short_name (var, i);
935 short_names[i] = s != NULL ? xstrdup (s) : NULL;
939 dict_rename_var (dict, var, long_name);
941 /* Restore short names. */
942 for (i = 0; i < short_name_cnt; i++)
944 var_set_short_name (var, i, short_names[i]);
945 free (short_names[i]);
949 close_variable_to_value_map (r, map);
950 r->has_long_var_names = true;
953 /* Reads record type 7, subtype 14, which gives the real length
954 of each very long string. Rearranges DICT accordingly. */
956 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
957 struct dictionary *dict)
959 struct variable_to_value_map *map;
960 struct variable *var;
964 map = open_variable_to_value_map (r, size * count);
965 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
968 size_t idx = var_get_dict_index (var);
974 length = strtol (length_s, NULL, 10);
975 if (length < 1 || length > MAX_STRING)
977 sys_warn (r, _("%s listed as string of invalid length %s "
978 "in very length string record."),
979 var_get_name (var), length_s);
983 /* Check segments. */
984 segment_cnt = sfm_width_to_segments (length);
985 if (segment_cnt == 1)
987 sys_warn (r, _("%s listed in very long string record with width %s, "
988 "which requires only one segment."),
989 var_get_name (var), length_s);
992 if (idx + segment_cnt > dict_get_var_cnt (dict))
993 sys_error (r, _("Very long string %s overflows dictionary."),
996 /* Get the short names from the segments and check their
998 for (i = 0; i < segment_cnt; i++)
1000 struct variable *seg = dict_get_var (dict, idx + i);
1001 int alloc_width = sfm_segment_alloc_width (length, i);
1002 int width = var_get_width (seg);
1005 var_set_short_name (var, i, var_get_short_name (seg, 0));
1006 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1007 sys_error (r, _("Very long string with width %ld has segment %d "
1008 "of width %d (expected %d)"),
1009 length, i, width, alloc_width);
1011 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1012 var_set_width (var, length);
1014 close_variable_to_value_map (r, map);
1015 dict_compact_values (dict);
1018 /* Reads value labels from sysfile H and inserts them into the
1019 associated dictionary. */
1021 read_value_labels (struct sfm_reader *r,
1022 struct dictionary *dict, struct variable **var_by_value_idx)
1024 struct pool *subpool;
1028 char raw_value[8]; /* Value as uninterpreted bytes. */
1029 union value value; /* Value. */
1030 char *label; /* Null-terminated label string. */
1033 struct label *labels = NULL;
1034 int label_cnt; /* Number of labels. */
1036 struct variable **var = NULL; /* Associated variables. */
1037 int var_cnt; /* Number of associated variables. */
1041 subpool = pool_create_subpool (r->pool);
1043 /* Read the type 3 record and record its contents. We can't do
1044 much with the data yet because we don't know whether it is
1045 of numeric or string type. */
1047 /* Read number of labels. */
1048 label_cnt = read_int (r);
1050 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1052 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1057 /* Read each value/label tuple into labels[]. */
1058 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1059 for (i = 0; i < label_cnt; i++)
1061 struct label *label = labels + i;
1062 unsigned char label_len;
1066 read_bytes (r, label->raw_value, sizeof label->raw_value);
1068 /* Read label length. */
1069 read_bytes (r, &label_len, sizeof label_len);
1070 padded_len = ROUND_UP (label_len + 1, 8);
1072 /* Read label, padding. */
1073 label->label = pool_alloc (subpool, padded_len + 1);
1074 read_bytes (r, label->label, padded_len - 1);
1075 label->label[label_len] = 0;
1078 /* Now, read the type 4 record that has the list of variables
1079 to which the value labels are to be applied. */
1081 /* Read record type of type 4 record. */
1082 if (read_int (r) != 4)
1083 sys_error (r, _("Variable index record (type 4) does not immediately "
1084 "follow value label record (type 3) as it should."));
1086 /* Read number of variables associated with value label from type 4
1088 var_cnt = read_int (r);
1089 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1090 sys_error (r, _("Number of variables associated with a value label (%d) "
1091 "is not between 1 and the number of variables (%zu)."),
1092 var_cnt, dict_get_var_cnt (dict));
1094 /* Read the list of variables. */
1095 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1096 for (i = 0; i < var_cnt; i++)
1098 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1099 if (var_is_long_string (var[i]))
1100 sys_error (r, _("Value labels are not allowed on long string "
1101 "variables (%s)."), var_get_name (var[i]));
1104 /* Type check the variables. */
1105 for (i = 1; i < var_cnt; i++)
1106 if (var_get_type (var[i]) != var_get_type (var[0]))
1107 sys_error (r, _("Variables associated with value label are not all of "
1108 "identical type. Variable %s is %s, but variable "
1110 var_get_name (var[0]),
1111 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1112 var_get_name (var[i]),
1113 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1115 /* Fill in labels[].value, now that we know the desired type. */
1116 for (i = 0; i < label_cnt; i++)
1118 struct label *label = labels + i;
1120 if (var_is_alpha (var[0]))
1121 buf_copy_rpad (label->value.s, sizeof label->value.s,
1122 label->raw_value, sizeof label->raw_value);
1124 label->value.f = float_get_double (r->float_format, label->raw_value);
1127 /* Assign the `value_label's to each variable. */
1128 for (i = 0; i < var_cnt; i++)
1130 struct variable *v = var[i];
1133 /* Add each label to the variable. */
1134 for (j = 0; j < label_cnt; j++)
1136 struct label *label = &labels[j];
1137 if (!var_add_value_label (v, &label->value, label->label))
1139 if (var_is_numeric (var[0]))
1140 sys_warn (r, _("Duplicate value label for %g on %s."),
1141 label->value.f, var_get_name (v));
1143 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1144 var_get_width (v), label->value.s,
1150 pool_destroy (subpool);
1155 static void partial_record (struct sfm_reader *r)
1158 static void read_error (struct casereader *, const struct sfm_reader *);
1160 static bool read_case_number (struct sfm_reader *, double *);
1161 static bool read_case_string (struct sfm_reader *, char *, size_t);
1162 static int read_opcode (struct sfm_reader *);
1163 static bool read_compressed_number (struct sfm_reader *, double *);
1164 static bool read_compressed_string (struct sfm_reader *, char *);
1165 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1166 static bool skip_whole_strings (struct sfm_reader *, size_t);
1168 /* Reads one case from READER's file into C. Returns true only
1171 sys_file_casereader_read (struct casereader *reader, void *r_,
1174 struct sfm_reader *r = r_;
1180 case_create (c, r->value_cnt);
1181 if (setjmp (r->bail_out))
1183 casereader_force_error (reader);
1188 for (i = 0; i < r->sfm_var_cnt; i++)
1190 struct sfm_var *sv = &r->sfm_vars[i];
1191 union value *v = case_data_rw_idx (c, sv->case_index);
1195 if (!read_case_number (r, &v->f))
1200 if (!read_case_string (r, v->s + sv->offset, sv->width))
1202 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1212 if (r->case_cnt != -1)
1213 read_error (reader, r);
1217 /* Issues an error that R ends in a partial record. */
1219 partial_record (struct sfm_reader *r)
1221 sys_error (r, _("File ends in partial case."));
1224 /* Issues an error that an unspecified error occurred SFM, and
1227 read_error (struct casereader *r, const struct sfm_reader *sfm)
1229 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1230 casereader_force_error (r);
1233 /* Reads a number from R and stores its value in *D.
1234 If R is compressed, reads a compressed number;
1235 otherwise, reads a number in the regular way.
1236 Returns true if successful, false if end of file is
1237 reached immediately. */
1239 read_case_number (struct sfm_reader *r, double *d)
1244 if (!try_read_bytes (r, number, sizeof number))
1246 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1250 return read_compressed_number (r, d);
1253 /* Reads LENGTH string bytes from R into S.
1254 Always reads a multiple of 8 bytes; if LENGTH is not a
1255 multiple of 8, then extra bytes are read and discarded without
1257 Reads compressed strings if S is compressed.
1258 Returns true if successful, false if end of file is
1259 reached immediately. */
1261 read_case_string (struct sfm_reader *r, char *s, size_t length)
1263 size_t whole = ROUND_DOWN (length, 8);
1264 size_t partial = length % 8;
1268 if (!read_whole_strings (r, s, whole))
1275 if (!read_whole_strings (r, bounce, sizeof bounce))
1281 memcpy (s + whole, bounce, partial);
1287 /* Reads and returns the next compression opcode from R. */
1289 read_opcode (struct sfm_reader *r)
1291 assert (r->compressed);
1295 if (r->opcode_idx >= sizeof r->opcodes)
1297 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1301 opcode = r->opcodes[r->opcode_idx++];
1308 /* Reads a compressed number from R and stores its value in D.
1309 Returns true if successful, false if end of file is
1310 reached immediately. */
1312 read_compressed_number (struct sfm_reader *r, double *d)
1314 int opcode = read_opcode (r);
1322 *d = read_float (r);
1326 sys_error (r, _("Compressed data is corrupt."));
1333 *d = opcode - r->bias;
1340 /* Reads a compressed 8-byte string segment from R and stores it
1342 Returns true if successful, false if end of file is
1343 reached immediately. */
1345 read_compressed_string (struct sfm_reader *r, char *dst)
1347 switch (read_opcode (r))
1354 read_bytes (r, dst, 8);
1358 memset (dst, ' ', 8);
1362 sys_error (r, _("Compressed data is corrupt."));
1368 /* Reads LENGTH string bytes from R into S.
1369 LENGTH must be a multiple of 8.
1370 Reads compressed strings if S is compressed.
1371 Returns true if successful, false if end of file is
1372 reached immediately. */
1374 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1376 assert (length % 8 == 0);
1378 return try_read_bytes (r, s, length);
1382 for (ofs = 0; ofs < length; ofs += 8)
1383 if (!read_compressed_string (r, s + ofs))
1393 /* Skips LENGTH string bytes from R.
1394 LENGTH must be a multiple of 8.
1395 (LENGTH is also limited to 1024, but that's only because the
1396 current caller never needs more than that many bytes.)
1397 Returns true if successful, false if end of file is
1398 reached immediately. */
1400 skip_whole_strings (struct sfm_reader *r, size_t length)
1403 assert (length < sizeof buffer);
1404 return read_whole_strings (r, buffer, length);
1407 /* Creates and returns a table that can be used for translating a value
1408 index into a case to a "struct variable *" for DICT. Multiple
1409 system file fields reference variables this way.
1411 This table must be created before processing the very long
1412 string extension record, because that record causes some
1413 values to be deleted from the case and the dictionary to be
1415 static struct variable **
1416 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1418 struct variable **var_by_value_idx;
1422 var_by_value_idx = pool_nmalloc (r->pool,
1423 r->oct_cnt, sizeof *var_by_value_idx);
1424 for (i = 0; i < dict_get_var_cnt (dict); i++)
1426 struct variable *v = dict_get_var (dict, i);
1427 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1430 var_by_value_idx[value_idx++] = v;
1431 for (j = 1; j < nv; j++)
1432 var_by_value_idx[value_idx++] = NULL;
1434 assert (value_idx == r->oct_cnt);
1436 return var_by_value_idx;
1439 /* Returns the "struct variable" corresponding to the given
1440 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
1442 static struct variable *
1443 lookup_var_by_value_idx (struct sfm_reader *r,
1444 struct variable **var_by_value_idx, int value_idx)
1446 struct variable *var;
1448 if (value_idx < 1 || value_idx > r->oct_cnt)
1449 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1450 value_idx, r->oct_cnt);
1452 var = var_by_value_idx[value_idx - 1];
1454 sys_error (r, _("Variable index %d refers to long string "
1461 /* Returns the variable in D with the given SHORT_NAME,
1462 or a null pointer if there is none. */
1463 static struct variable *
1464 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1466 struct variable *var;
1470 /* First try looking up by full name. This often succeeds. */
1471 var = dict_lookup_var (d, short_name);
1472 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
1475 /* Iterate through the whole dictionary as a fallback. */
1476 var_cnt = dict_get_var_cnt (d);
1477 for (i = 0; i < var_cnt; i++)
1479 var = dict_get_var (d, i);
1480 if (!strcasecmp (var_get_short_name (var, 0), short_name))
1487 /* Helpers for reading records that contain "variable=value"
1491 struct variable_to_value_map
1493 struct substring buffer; /* Record contents. */
1494 size_t pos; /* Current position in buffer. */
1497 /* Reads SIZE bytes into a "variable=value" map for R,
1498 and returns the map. */
1499 static struct variable_to_value_map *
1500 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1502 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1503 char *buffer = pool_malloc (r->pool, size + 1);
1504 read_bytes (r, buffer, size);
1505 map->buffer = ss_buffer (buffer, size);
1510 /* Closes MAP and frees its storage.
1511 Not really needed, because the pool will free the map anyway,
1512 but can be used to free it earlier. */
1514 close_variable_to_value_map (struct sfm_reader *r,
1515 struct variable_to_value_map *map)
1517 pool_free (r->pool, ss_data (map->buffer));
1520 /* Reads the next variable=value pair from MAP.
1521 Looks up the variable in DICT and stores it into *VAR.
1522 Stores a null-terminated value into *VALUE. */
1524 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1525 struct variable_to_value_map *map,
1526 struct variable **var, char **value,
1529 int max_warnings = 5;
1533 struct substring short_name_ss, value_ss;
1535 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1536 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1539 if (*warning_cnt > max_warnings)
1540 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1541 *warning_cnt - max_warnings);
1545 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1546 ss_buffer ("\t\0", 2));
1548 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1549 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1552 if (++*warning_cnt <= max_warnings)
1553 sys_warn (r, _("Variable map refers to unknown variable %s."),
1554 ss_data (short_name_ss));
1558 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1559 *value = ss_data (value_ss);
1567 /* Displays a corruption message. */
1569 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1574 ds_init_empty (&text);
1575 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1576 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1577 ds_put_vformat (&text, format, args);
1579 m.category = msg_class_to_category (class);
1580 m.severity = msg_class_to_severity (class);
1581 m.where.file_name = NULL;
1582 m.where.line_number = 0;
1583 m.text = ds_cstr (&text);
1588 /* Displays a warning for the current file position. */
1590 sys_warn (struct sfm_reader *r, const char *format, ...)
1594 va_start (args, format);
1595 sys_msg (r, MW, format, args);
1599 /* Displays an error for the current file position,
1600 marks it as in an error state,
1601 and aborts reading it using longjmp. */
1603 sys_error (struct sfm_reader *r, const char *format, ...)
1607 va_start (args, format);
1608 sys_msg (r, ME, format, args);
1612 longjmp (r->bail_out, 1);
1615 /* Reads BYTE_CNT bytes into BUF.
1616 Returns true if exactly BYTE_CNT bytes are successfully read.
1617 Aborts if an I/O error or a partial read occurs.
1618 If EOF_IS_OK, then an immediate end-of-file causes false to be
1619 returned; otherwise, immediate end-of-file causes an abort
1622 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1623 void *buf, size_t byte_cnt)
1625 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1626 if (bytes_read == byte_cnt)
1628 else if (ferror (r->file))
1629 sys_error (r, _("System error: %s."), strerror (errno));
1630 else if (!eof_is_ok || bytes_read != 0)
1631 sys_error (r, _("Unexpected end of file."));
1636 /* Reads BYTE_CNT into BUF.
1637 Aborts upon I/O error or if end-of-file is encountered. */
1639 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1641 read_bytes_internal (r, false, buf, byte_cnt);
1644 /* Reads BYTE_CNT bytes into BUF.
1645 Returns true if exactly BYTE_CNT bytes are successfully read.
1646 Returns false if an immediate end-of-file is encountered.
1647 Aborts if an I/O error or a partial read occurs. */
1649 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1651 return read_bytes_internal (r, true, buf, byte_cnt);
1654 /* Reads a 32-bit signed integer from R and returns its value in
1657 read_int (struct sfm_reader *r)
1660 read_bytes (r, integer, sizeof integer);
1661 return integer_get (r->integer_format, integer, sizeof integer);
1664 /* Reads a 64-bit floating-point number from R and returns its
1665 value in host format. */
1667 read_float (struct sfm_reader *r)
1670 read_bytes (r, number, sizeof number);
1671 return float_get_double (r->float_format, number);
1674 /* Reads exactly SIZE - 1 bytes into BUFFER
1675 and stores a null byte into BUFFER[SIZE - 1]. */
1677 read_string (struct sfm_reader *r, char *buffer, size_t size)
1680 read_bytes (r, buffer, size - 1);
1681 buffer[size - 1] = '\0';
1684 /* Skips BYTES bytes forward in R. */
1686 skip_bytes (struct sfm_reader *r, size_t bytes)
1691 size_t chunk = MIN (sizeof buffer, bytes);
1692 read_bytes (r, buffer, chunk);
1697 static struct casereader_class sys_file_casereader_class =
1699 sys_file_casereader_read,
1700 sys_file_casereader_destroy,