1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/alloc.h>
29 #include <libpspp/assertion.h>
30 #include <libpspp/message.h>
31 #include <libpspp/compiler.h>
32 #include <libpspp/magic.h>
33 #include <libpspp/misc.h>
34 #include <libpspp/pool.h>
35 #include <libpspp/str.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/array.h>
39 #include <data/case.h>
40 #include <data/casereader-provider.h>
41 #include <data/casereader.h>
42 #include <data/dictionary.h>
43 #include <data/file-handle-def.h>
44 #include <data/file-name.h>
45 #include <data/format.h>
46 #include <data/missing-values.h>
47 #include <data/value-labels.h>
48 #include <data/variable.h>
49 #include <data/value.h>
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
72 size_t value_cnt; /* Number of "union value"s in struct case. */
75 enum integer_format integer_format; /* On-disk integer format. */
76 enum float_format float_format; /* On-disk floating point format. */
77 int flt64_cnt; /* Number of 8-byte units per case. */
78 struct sfm_var *vars; /* Variables. */
79 size_t var_cnt; /* Number of variables. */
80 bool has_long_var_names; /* File has a long variable name map */
81 bool has_vls; /* File has one or more very long strings? */
84 bool compressed; /* File is compressed? */
85 double bias; /* Compression bias, usually 100.0. */
86 uint8_t opcodes[8]; /* Current block of opcodes. */
87 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
90 /* A variable in a system file. */
93 int width; /* 0=numeric, otherwise string width. */
94 int case_index; /* Index into case. */
97 static struct casereader_class sys_file_casereader_class;
99 static bool close_reader (struct sfm_reader *);
101 static struct variable **make_var_by_value_idx (struct sfm_reader *,
102 struct dictionary *);
103 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
107 static void sys_warn (struct sfm_reader *, const char *, ...)
108 PRINTF_FORMAT (2, 3);
110 static void sys_error (struct sfm_reader *, const char *, ...)
114 static void read_bytes (struct sfm_reader *, void *, size_t);
115 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
116 static int32_t read_int32 (struct sfm_reader *);
117 static double read_flt64 (struct sfm_reader *);
118 static void read_string (struct sfm_reader *, char *, size_t);
119 static void skip_bytes (struct sfm_reader *, size_t);
121 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
122 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
124 static struct variable_to_value_map *open_variable_to_value_map (
125 struct sfm_reader *, size_t size);
126 static void close_variable_to_value_map (struct sfm_reader *r,
127 struct variable_to_value_map *);
128 static bool read_variable_to_value_map (struct sfm_reader *,
130 struct variable_to_value_map *,
131 struct variable **var, char **value,
134 static bool close_reader (struct sfm_reader *r);
136 /* Dictionary reader. */
144 static void read_header (struct sfm_reader *, struct dictionary *,
145 int *weight_idx, int *claimed_flt64_cnt,
146 struct sfm_read_info *);
147 static void read_variable_record (struct sfm_reader *, struct dictionary *,
148 int *format_warning_cnt);
149 static void parse_format_spec (struct sfm_reader *, uint32_t,
150 enum which_format, struct variable *,
151 int *format_warning_cnt);
152 static void setup_weight (struct sfm_reader *, int weight_idx,
153 struct variable **var_by_value_idx,
154 struct dictionary *);
155 static void read_documents (struct sfm_reader *, struct dictionary *);
156 static void read_value_labels (struct sfm_reader *, struct dictionary *,
157 struct variable **var_by_value_idx);
159 static void read_extension_record (struct sfm_reader *, struct dictionary *);
160 static void read_machine_int32_info (struct sfm_reader *,
161 size_t size, size_t count);
162 static void read_machine_flt64_info (struct sfm_reader *,
163 size_t size, size_t count);
164 static void read_display_parameters (struct sfm_reader *,
165 size_t size, size_t count,
166 struct dictionary *);
167 static void read_long_var_name_map (struct sfm_reader *,
168 size_t size, size_t count,
169 struct dictionary *);
170 static void read_long_string_map (struct sfm_reader *,
171 size_t size, size_t count,
172 struct dictionary *);
175 /* Opens the system file designated by file handle FH for
176 reading. Reads the system file's dictionary into *DICT.
177 If INFO is non-null, then it receives additional info about the
180 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
181 struct sfm_read_info *info)
183 struct sfm_reader *volatile r = NULL;
184 struct variable **var_by_value_idx;
185 int format_warning_cnt = 0;
187 int claimed_flt64_cnt;
191 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
194 *dict = dict_create ();
196 /* Create and initialize reader. */
197 r = pool_create_container (struct sfm_reader, pool);
199 r->file = fn_open (fh_get_file_name (fh), "rb");
203 r->has_long_var_names = false;
204 r->opcode_idx = sizeof r->opcodes;
206 if (setjmp (r->bail_out))
209 dict_destroy (*dict);
216 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
217 fh_get_file_name (r->fh), strerror (errno));
218 longjmp (r->bail_out, 1);
222 read_header (r, *dict, &weight_idx, &claimed_flt64_cnt, info);
224 /* Read all the variable definition records. */
225 rec_type = read_int32 (r);
226 while (rec_type == 2)
228 read_variable_record (r, *dict, &format_warning_cnt);
229 rec_type = read_int32 (r);
232 /* Figure out the case format. */
233 var_by_value_idx = make_var_by_value_idx (r, *dict);
234 setup_weight (r, weight_idx, var_by_value_idx, *dict);
236 /* Read all the rest of the dictionary records. */
237 while (rec_type != 999)
242 read_value_labels (r, *dict, var_by_value_idx);
246 sys_error (r, _("Misplaced type 4 record."));
249 read_documents (r, *dict);
253 read_extension_record (r, *dict);
257 sys_error (r, _("Unrecognized record type %d."), rec_type);
259 rec_type = read_int32 (r);
263 if ( ! r->has_long_var_names )
266 for (i = 0; i < dict_get_var_cnt (*dict); i++)
268 struct variable *var = dict_get_var (*dict, i);
269 char short_name [SHORT_NAME_LEN + 1];
270 char long_name [SHORT_NAME_LEN + 1];
272 strcpy (short_name, var_get_name (var));
274 strcpy (long_name, short_name);
275 str_lowercase (long_name);
277 /* Set long name. Renaming a variable may clear the short
278 name, but we want to retain it, so re-set it
280 dict_rename_var (*dict, var, long_name);
281 var_set_short_name (var, short_name);
284 r->has_long_var_names = true;
287 /* Read record 999 data, which is just filler. */
290 if (claimed_flt64_cnt != -1 && claimed_flt64_cnt != r->flt64_cnt)
291 sys_warn (r, _("File header claims %d variable positions but "
292 "%d were read from file."),
293 claimed_flt64_cnt, r->flt64_cnt);
295 /* Create an index of dictionary variable widths for
296 sfm_read_case to use. We cannot use the `struct variable's
297 from the dictionary we created, because the caller owns the
298 dictionary and may destroy or modify its variables. */
299 r->var_cnt = dict_get_var_cnt (*dict);
300 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
301 for (i = 0; i < r->var_cnt; i++)
303 struct variable *v = dict_get_var (*dict, i);
304 struct sfm_var *sv = &r->vars[i];
305 sv->width = var_get_width (v);
306 sv->case_index = var_get_case_index (v);
309 pool_free (r->pool, var_by_value_idx);
310 r->value_cnt = dict_get_next_value_idx (*dict);
311 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
312 &sys_file_casereader_class, r);
315 /* Closes a system file after we're done with it.
316 Returns true if an I/O error has occurred on READER, false
319 close_reader (struct sfm_reader *r)
328 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
330 msg (ME, _("Error closing system file \"%s\": %s."),
331 fh_get_file_name (r->fh), strerror (errno));
338 fh_close (r->fh, "system file", "rs");
341 pool_destroy (r->pool);
346 /* Destroys READER. */
348 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
350 struct sfm_reader *r = r_;
/* Returns true if FILE is an SPSS system file, false otherwise.

   Reads 4 bytes from FILE at its current position and compares
   them against the system file magic "$FL2".  A short read
   yields false. */
bool
sfm_detect (FILE *file)
{
  char magic[4 + 1];

  if (fread (magic, 4, 1, file) != 1)
    return false;

  /* Terminate so that the bytes can be compared as a string. */
  magic[4] = '\0';
  return strcmp ("$FL2", magic) == 0;
}
368 /* Reads the global header of the system file.
369 Sets DICT's file label to the system file's label.
370 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
371 or to the value index of the weight variable otherwise.
372 Sets *CLAIMED_FLT64_CNT to the number of values that the file
373 claims to have (although it is not always correct).
374 If INFO is non-null, initializes *INFO with header
377 read_header (struct sfm_reader *r, struct dictionary *dict,
378 int *weight_idx, int *claimed_flt64_cnt,
379 struct sfm_read_info *info)
382 char eye_catcher[61];
383 uint8_t raw_layout_code[4];
386 char creation_date[10];
387 char creation_time[9];
389 struct substring file_label_ss;
391 read_string (r, rec_type, sizeof rec_type);
392 read_string (r, eye_catcher, sizeof eye_catcher);
394 if (strcmp ("$FL2", rec_type) != 0)
395 sys_error (r, _("This is not an SPSS system file."));
397 /* Identify integer format. */
398 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
399 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
401 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
403 || (r->integer_format != INTEGER_MSB_FIRST
404 && r->integer_format != INTEGER_LSB_FIRST))
405 sys_error (r, _("This is not an SPSS system file."));
407 *claimed_flt64_cnt = read_int32 (r);
408 if (*claimed_flt64_cnt < 0 || *claimed_flt64_cnt > INT_MAX / 16)
409 *claimed_flt64_cnt = -1;
411 r->compressed = read_int32 (r) != 0;
413 *weight_idx = read_int32 (r);
415 case_cnt = read_int32 (r);
416 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
419 /* Identify floating-point format and obtain compression bias. */
420 read_bytes (r, raw_bias, sizeof raw_bias);
421 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
423 sys_warn (r, _("Compression bias (%g) is not the usual "
424 "value of 100, or system file uses unrecognized "
425 "floating-point format."),
427 if (r->integer_format == INTEGER_MSB_FIRST)
428 r->float_format = FLOAT_IEEE_DOUBLE_BE;
430 r->float_format = FLOAT_IEEE_DOUBLE_LE;
432 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
434 read_string (r, creation_date, sizeof creation_date);
435 read_string (r, creation_time, sizeof creation_time);
436 read_string (r, file_label, sizeof file_label);
439 file_label_ss = ss_cstr (file_label);
440 ss_trim (&file_label_ss, ss_cstr (" "));
441 if (!ss_is_empty (file_label_ss))
443 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
444 dict_set_label (dict, ss_data (file_label_ss));
449 struct substring product;
451 strcpy (info->creation_date, creation_date);
452 strcpy (info->creation_time, creation_time);
453 info->integer_format = r->integer_format;
454 info->float_format = r->float_format;
455 info->compressed = r->compressed;
456 info->case_cnt = case_cnt;
458 product = ss_cstr (eye_catcher);
459 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
460 ss_trim (&product, ss_cstr (" "));
461 str_copy_buf_trunc (info->product, sizeof info->product,
462 ss_data (product), ss_length (product));
466 /* Reads a variable (type 2) record from R and adds the
467 corresponding variable to DICT.
468 Also skips past additional variable records for long string
471 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
472 int *format_warning_cnt)
475 int has_variable_label;
476 int missing_value_code;
481 struct variable *var;
484 width = read_int32 (r);
485 has_variable_label = read_int32 (r);
486 missing_value_code = read_int32 (r);
487 print_format = read_int32 (r);
488 write_format = read_int32 (r);
489 read_string (r, name, sizeof name);
490 name[strcspn (name, " ")] = '\0';
492 /* Check variable name. */
493 if (name[0] == '$' || name[0] == '#')
494 sys_error (r, "Variable name begins with invalid character `%c'.",
496 if (!var_is_plausible_name (name, false))
497 sys_error (r, _("Invalid variable name `%s'."), name);
499 /* Create variable. */
500 if (width < 0 || width > 255)
501 sys_error (r, _("Bad variable width %d."), width);
502 var = dict_create_var (dict, name, width);
505 _("Duplicate variable name `%s' within system file."),
508 /* Set the short name the same as the long name */
509 var_set_short_name (var, var_get_name (var));
511 /* Get variable label, if any. */
512 if (has_variable_label != 0 && has_variable_label != 1)
513 sys_error (r, _("Variable label indicator field is not 0 or 1."));
514 if (has_variable_label == 1)
519 len = read_int32 (r);
520 if (len >= sizeof label)
521 sys_error (r, _("Variable %s has label of invalid length %u."),
522 name, (unsigned int) len);
523 read_string (r, label, len + 1);
524 var_set_label (var, label);
526 skip_bytes (r, ROUND_UP (len, 4) - len);
529 /* Set missing values. */
530 if (missing_value_code < -3 || missing_value_code > 3
531 || missing_value_code == -1)
532 sys_error (r, _("Missing value indicator field is not "
533 "-3, -2, 0, 1, 2, or 3."));
534 if (missing_value_code != 0)
536 struct missing_values mv;
537 mv_init (&mv, var_get_width (var));
538 if (var_is_numeric (var))
540 if (missing_value_code > 0)
543 for (i = 0; i < missing_value_code; i++)
544 mv_add_num (&mv, read_flt64 (r));
548 double low = read_flt64 (r);
549 double high = read_flt64 (r);
550 mv_add_num_range (&mv, low, high);
551 if (missing_value_code == -3)
552 mv_add_num (&mv, read_flt64 (r));
555 else if (var_get_width (var) <= MAX_SHORT_STRING)
557 if (missing_value_code > 0)
560 for (i = 0; i < missing_value_code; i++)
563 read_string (r, string, sizeof string);
564 mv_add_str (&mv, string);
568 sys_error (r, _("String variable %s may not have missing "
569 "values specified as a range."),
572 else /* var->width > MAX_SHORT_STRING */
573 sys_error (r, _("Long string variable %s may not have missing "
576 var_set_missing_values (var, &mv);
580 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
581 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
583 /* Account for values.
584 Skip long string continuation records, if any. */
585 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
591 for (i = 1; i < nv; i++)
593 /* Check for record type 2 and width -1. */
594 if (read_int32 (r) != 2 || read_int32 (r) != -1)
595 sys_error (r, _("Missing string continuation record."));
597 /* Skip and ignore remaining continuation data. */
598 has_variable_label = read_int32 (r);
599 missing_value_code = read_int32 (r);
600 print_format = read_int32 (r);
601 write_format = read_int32 (r);
602 read_string (r, name, sizeof name);
604 /* Variable label fields on continuation records have
605 been spotted in system files created by "SPSS Power
606 Macintosh Release 6.1". */
607 if (has_variable_label)
608 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
613 /* Translates the format spec from sysfile format to internal
616 parse_format_spec (struct sfm_reader *r, uint32_t s,
617 enum which_format which, struct variable *v,
618 int *format_warning_cnt)
620 const int max_format_warnings = 8;
622 uint8_t raw_type = s >> 16;
628 if (!fmt_from_io (raw_type, &f.type))
629 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
634 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
639 if (which == PRINT_FORMAT)
640 var_set_print_format (v, &f);
642 var_set_write_format (v, &f);
644 else if (*++format_warning_cnt <= max_format_warnings)
646 char fmt_string[FMT_STRING_LEN_MAX + 1];
647 sys_warn (r, _("%s variable %s has invalid %s format %s."),
648 var_is_numeric (v) ? _("Numeric") : _("String"),
650 which == PRINT_FORMAT ? _("print") : _("write"),
651 fmt_to_string (&f, fmt_string));
653 if (*format_warning_cnt == max_format_warnings)
654 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable that corresponds to the 1-based value index
   WEIGHT_IDX the weighting variable of DICT.  Does nothing if
   WEIGHT_IDX is zero.  It is an error for the designated variable
   to be non-numeric. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *candidate;

  if (weight_idx == 0)
    return;

  candidate = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (var_is_numeric (candidate))
    dict_set_weight (dict, candidate);
  else
    sys_error (r, _("Weighting variable must be numeric."));
}
676 /* Reads a document record, type 6, from system file R, and sets up
677 the documents and n_documents fields in the associated
680 read_documents (struct sfm_reader *r, struct dictionary *dict)
685 if (dict_get_documents (dict) != NULL)
686 sys_error (r, _("Multiple type 6 (document) records."));
688 line_cnt = read_int32 (r);
690 sys_error (r, _("Number of document lines (%d) "
691 "must be greater than 0."), line_cnt);
693 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
694 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
695 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
696 dict_set_documents (dict, documents);
698 sys_error (r, _("Document line contains null byte."));
699 pool_free (r->pool, documents);
702 /* Read a type 7 extension record. */
704 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
706 int subtype = read_int32 (r);
707 size_t size = read_int32 (r);
708 size_t count = read_int32 (r);
709 size_t bytes = size * count;
711 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
712 allows an extra byte for a null terminator, used by some
713 extension processing routines. */
714 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
715 sys_error (r, "Record type 7 subtype %d too large.", subtype);
720 read_machine_int32_info (r, size, count);
724 read_machine_flt64_info (r, size, count);
728 /* Variable sets information. We don't use these yet.
729 They only apply to GUIs; see VARSETS on the APPLY
730 DICTIONARY command in SPSS documentation. */
734 /* DATE variable information. We don't use it yet, but we
739 /* Unknown purpose. */
743 read_display_parameters (r, size, count, dict);
747 read_long_var_name_map (r, size, count, dict);
751 read_long_string_map (r, size, count, dict);
755 /* New in SPSS v14? Unknown purpose. */
759 /* Text field that defines variable attributes. New in
764 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
768 skip_bytes (r, bytes);
771 /* Read record type 7, subtype 3. */
773 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
775 int version_major UNUSED = read_int32 (r);
776 int version_minor UNUSED = read_int32 (r);
777 int version_revision UNUSED = read_int32 (r);
778 int machine_code UNUSED = read_int32 (r);
779 int float_representation = read_int32 (r);
780 int compression_code UNUSED = read_int32 (r);
781 int integer_representation = read_int32 (r);
782 int character_code UNUSED = read_int32 (r);
784 int expected_float_format;
785 int expected_integer_format;
787 if (size != 4 || count != 8)
788 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
790 (unsigned int) size, (unsigned int) count);
792 /* Check floating point format. */
793 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
794 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
795 expected_float_format = 1;
796 else if (r->float_format == FLOAT_Z_LONG)
797 expected_float_format = 2;
798 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
799 expected_float_format = 3;
802 if (float_representation != expected_float_format)
803 sys_error (r, _("Floating-point representation indicated by "
804 "system file (%d) differs from expected (%d)."),
805 r->float_format, expected_float_format);
807 /* Check integer format. */
808 if (r->integer_format == INTEGER_MSB_FIRST)
809 expected_integer_format = 1;
810 else if (r->integer_format == INTEGER_LSB_FIRST)
811 expected_integer_format = 2;
814 if (integer_representation != expected_integer_format)
816 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
817 sys_warn (r, _("Integer format indicated by system file (%s) "
818 "differs from expected (%s)."),
819 gettext (endian[integer_representation == 1]),
820 gettext (endian[expected_integer_format == 1]));
824 /* Read record type 7, subtype 4. */
826 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
828 double sysmis = read_flt64 (r);
829 double highest = read_flt64 (r);
830 double lowest = read_flt64 (r);
832 if (size != 8 || count != 3)
833 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
834 (unsigned int) size, (unsigned int) count);
836 if (sysmis != SYSMIS)
837 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
838 if (highest != HIGHEST)
839 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
840 if (lowest != LOWEST)
841 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
844 /* Read record type 7, subtype 11, which specifies how variables
845 should be displayed in GUI environments. */
847 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
848 struct dictionary *dict)
850 const size_t n_vars = count / 3 ;
854 if (count % 3 || n_vars != dict_get_var_cnt (dict))
855 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
856 (unsigned int) size, (unsigned int) count);
858 for (i = 0; i < n_vars; ++i)
860 int measure = read_int32 (r);
861 int width = read_int32 (r);
862 int align = read_int32 (r);
863 struct variable *v = dict_get_var (dict, i);
865 /* spss v14 sometimes seems to set string variables' measure to zero */
866 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
869 if (measure < 1 || measure > 3 || align < 0 || align > 2)
872 sys_warn (r, _("Invalid variable display parameters. "
873 "Default parameters substituted."));
878 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
879 : measure == 2 ? MEASURE_ORDINAL
881 var_set_display_width (v, width);
882 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
883 : align == 1 ? ALIGN_RIGHT
888 /* Reads record type 7, subtype 13, which gives the long name
889 that corresponds to each short name. Modifies variable names
890 in DICT accordingly. */
892 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
893 struct dictionary *dict)
895 struct variable_to_value_map *map;
896 struct variable *var;
900 map = open_variable_to_value_map (r, size * count);
901 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
904 char short_name[SHORT_NAME_LEN + 1];
905 strcpy (short_name, var_get_short_name (var));
907 /* Validate long name. */
908 if (!var_is_valid_name (long_name, false))
910 sys_warn (r, _("Long variable mapping from %s to invalid "
911 "variable name `%s'."),
912 var_get_name (var), long_name);
916 /* Identify any duplicates. */
917 if (strcasecmp (short_name, long_name)
918 && dict_lookup_var (dict, long_name) != NULL)
920 sys_warn (r, _("Duplicate long variable name `%s' "
921 "within system file."), long_name);
925 /* Set long name. Renaming a variable may clear the short
926 name, but we want to retain it, so re-set it
928 dict_rename_var (dict, var, long_name);
929 var_set_short_name (var, short_name);
931 close_variable_to_value_map (r, map);
932 r->has_long_var_names = true;
935 /* Reads record type 7, subtype 14, which gives the real length
936 of each very long string. Rearranges DICT accordingly. */
938 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
939 struct dictionary *dict)
941 struct variable_to_value_map *map;
942 struct variable *var;
948 map = open_variable_to_value_map (r, size * count);
949 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
952 long length, remaining_length;
956 length = strtol (length_s, NULL, 10);
957 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
959 sys_warn (r, _("%s listed as string of length %s "
961 var_get_name (var), length_s);
965 /* Group multiple variables into single variable
966 and delete all but the first. */
967 remaining_length = length;
968 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
969 if (idx < dict_get_var_cnt (dict))
970 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
971 EFFECTIVE_LONG_STRING_LENGTH);
973 sys_error (r, _("Very long string %s overflows dictionary."),
975 dict_delete_consecutive_vars (dict,
976 var_get_dict_index (var) + 1,
977 idx - var_get_dict_index (var) - 1);
979 /* Assign all the length to the first variable. */
980 var_set_width (var, length);
982 close_variable_to_value_map (r, map);
983 dict_compact_values (dict);
986 /* Reads value labels from system file R and inserts them into
987 dictionary DICT. */
989 read_value_labels (struct sfm_reader *r,
990 struct dictionary *dict, struct variable **var_by_value_idx)
992 struct pool *subpool;
996 char raw_value[8]; /* Value as uninterpreted bytes. */
997 union value value; /* Value. */
998 char *label; /* Null-terminated label string. */
1001 struct label *labels = NULL;
1002 int label_cnt; /* Number of labels. */
1004 struct variable **var = NULL; /* Associated variables. */
1005 int var_cnt; /* Number of associated variables. */
1009 subpool = pool_create_subpool (r->pool);
1011 /* Read the type 3 record and record its contents. We can't do
1012 much with the data yet because we don't know whether it is
1013 of numeric or string type. */
1015 /* Read number of labels. */
1016 label_cnt = read_int32 (r);
1018 if (label_cnt >= INT32_MAX / sizeof *labels)
1020 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1025 /* Read each value/label tuple into labels[]. */
1026 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1027 for (i = 0; i < label_cnt; i++)
1029 struct label *label = labels + i;
1030 unsigned char label_len;
1034 read_bytes (r, label->raw_value, sizeof label->raw_value);
1036 /* Read label length. */
1037 read_bytes (r, &label_len, sizeof label_len);
1038 padded_len = ROUND_UP (label_len + 1, 8);
1040 /* Read label, padding. */
1041 label->label = pool_alloc (subpool, padded_len + 1);
1042 read_bytes (r, label->label, padded_len - 1);
1043 label->label[label_len] = 0;
1046 /* Now, read the type 4 record that has the list of variables
1047 to which the value labels are to be applied. */
1049 /* Read record type of type 4 record. */
1050 if (read_int32 (r) != 4)
1051 sys_error (r, _("Variable index record (type 4) does not immediately "
1052 "follow value label record (type 3) as it should."));
1054 /* Read number of variables associated with value label from type 4
1056 var_cnt = read_int32 (r);
1057 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1058 sys_error (r, _("Number of variables associated with a value label (%d) "
1059 "is not between 1 and the number of variables (%u)."),
1060 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1062 /* Read the list of variables. */
1063 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1064 for (i = 0; i < var_cnt; i++)
1066 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1067 if (var_is_long_string (var[i]))
1068 sys_error (r, _("Value labels are not allowed on long string "
1069 "variables (%s)."), var_get_name (var[i]));
1072 /* Type check the variables. */
1073 for (i = 1; i < var_cnt; i++)
1074 if (var_get_type (var[i]) != var_get_type (var[0]))
1075 sys_error (r, _("Variables associated with value label are not all of "
1076 "identical type. Variable %s is %s, but variable "
1078 var_get_name (var[0]),
1079 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1080 var_get_name (var[i]),
1081 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1083 /* Fill in labels[].value, now that we know the desired type. */
1084 for (i = 0; i < label_cnt; i++)
1086 struct label *label = labels + i;
1088 if (var_is_alpha (var[0]))
1089 buf_copy_rpad (label->value.s, sizeof label->value.s,
1090 label->raw_value, sizeof label->raw_value);
1092 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1095 /* Assign the `value_label's to each variable. */
1096 for (i = 0; i < var_cnt; i++)
1098 struct variable *v = var[i];
1101 /* Add each label to the variable. */
1102 for (j = 0; j < label_cnt; j++)
1104 struct label *label = &labels[j];
1105 if (!var_add_value_label (v, &label->value, label->label))
1107 if (var_is_numeric (var[0]))
1108 sys_warn (r, _("Duplicate value label for %g on %s."),
1109 label->value.f, var_get_name (v));
1111 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1112 var_get_width (v), label->value.s,
1118 pool_destroy (subpool);
1123 static void partial_record (struct sfm_reader *r)
1125 static bool read_case_number (struct sfm_reader *, double *);
1126 static bool read_case_string (struct sfm_reader *, char *, size_t);
1127 static int read_opcode (struct sfm_reader *);
1128 static bool read_compressed_number (struct sfm_reader *, double *);
1129 static bool read_compressed_string (struct sfm_reader *, char *);
1130 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1132 /* Reads one case from READER's file into C. Returns true only
1135 sys_file_casereader_read (struct casereader *reader, void *r_,
1138 struct sfm_reader *r = r_;
1142 case_create (c, r->value_cnt);
1143 if (setjmp (r->bail_out))
1145 casereader_force_error (reader);
1150 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1152 /* Fast path. Read the whole case directly. */
1153 if (!try_read_bytes (r, case_data_all_rw (c),
1154 sizeof (union value) * r->flt64_cnt))
1160 /* Convert floating point numbers to native format if needed. */
1161 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1165 for (i = 0; i < r->var_cnt; i++)
1166 if (r->vars[i].width == 0)
1168 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1169 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1176 /* Slow path. Convert from external to internal format. */
1179 for (i = 0; i < r->var_cnt; i++)
1181 struct sfm_var *sv = &r->vars[i];
1182 union value *v = case_data_rw_idx (c, sv->case_index);
1186 if (!read_case_number (r, &v->f))
1191 /* Read the string data in segments up to 255 bytes
1192 at a time, packed into 8-byte units. */
1193 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1194 int ofs, chunk_size;
1195 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1197 chunk_size = MIN (max_chunk, sv->width - ofs);
1198 if (!read_case_string (r, v->s + ofs, chunk_size))
1206 /* Very long strings have trailing wasted space
1207 that we must skip. */
1208 if (sv->width >= MIN_VERY_LONG_STRING)
1210 int bytes_read = (sv->width / max_chunk * 256
1211 + ROUND_UP (sv->width % max_chunk, 8));
1212 int total_bytes = sfm_width_to_bytes (sv->width);
1213 int excess_bytes = total_bytes - bytes_read;
1215 while (excess_bytes > 0)
1218 size_t chunk = MIN (sizeof buffer, excess_bytes);
1219 if (!read_whole_strings (r, buffer, chunk))
1221 excess_bytes -= chunk;
1236 /* Issues an error that R ends in a partial record. */
1238 partial_record (struct sfm_reader *r)
1240 sys_error (r, _("File ends in partial case."));
1243 /* Reads a number from R and stores its value in *D.
1244 If R is compressed, reads a compressed number;
1245 otherwise, reads a number in the regular way.
1246 Returns true if successful, false if end of file is
1247 reached immediately. */
1249 read_case_number (struct sfm_reader *r, double *d)
1254 if (!try_read_bytes (r, flt64, sizeof flt64))
1256 *d = flt64_to_double (r, flt64);
1260 return read_compressed_number (r, d);
1263 /* Reads LENGTH string bytes from R into S.
1264 Always reads a multiple of 8 bytes; if LENGTH is not a
1265 multiple of 8, then extra bytes are read and discarded without
1267 Reads compressed strings if R is compressed.
1268 Returns true if successful, false if end of file is
1269 reached immediately. */
1271 read_case_string (struct sfm_reader *r, char *s, size_t length)
1273 size_t whole = ROUND_DOWN (length, 8);
1274 size_t partial = length % 8;
1278 if (!read_whole_strings (r, s, whole))
1285 if (!read_whole_strings (r, bounce, sizeof bounce))
1291 memcpy (s + whole, bounce, partial);
1297 /* Reads and returns the next compression opcode from R. */
1299 read_opcode (struct sfm_reader *r)
1301 assert (r->compressed);
1305 if (r->opcode_idx >= sizeof r->opcodes)
1307 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1311 opcode = r->opcodes[r->opcode_idx++];
1318 /* Reads a compressed number from R and stores its value in *D.
1319 Returns true if successful, false if end of file is
1320 reached immediately. */
1322 read_compressed_number (struct sfm_reader *r, double *d)
1324 int opcode = read_opcode (r);
1332 *d = read_flt64 (r);
1336 sys_error (r, _("Compressed data is corrupt."));
1343 *d = opcode - r->bias;
1350 /* Reads a compressed 8-byte string segment from R and stores it
1352 Returns true if successful, false if end of file is
1353 reached immediately. */
1355 read_compressed_string (struct sfm_reader *r, char *dst)
1357 switch (read_opcode (r))
1364 read_bytes (r, dst, 8);
1368 memset (dst, ' ', 8);
1372 sys_error (r, _("Compressed data is corrupt."));
1378 /* Reads LENGTH string bytes from R into S.
1379 LENGTH must be a multiple of 8.
1380 Reads compressed strings if R is compressed.
1381 Returns true if successful, false if end of file is
1382 reached immediately. */
1384 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1386 assert (length % 8 == 0);
1388 return try_read_bytes (r, s, length);
1392 for (ofs = 0; ofs < length; ofs += 8)
1393 if (!read_compressed_string (r, s + ofs))
1403 /* Creates and returns a table that can be used for translating a value
1404 index into a case to a "struct variable *" for DICT. Multiple
1405 system file fields reference variables this way.
1407 This table must be created before processing the very long
1408 string extension record, because that record causes some
1409 values to be deleted from the case and the dictionary to be
1411 static struct variable **
1412 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1414 struct variable **var_by_value_idx;
1418 var_by_value_idx = pool_nmalloc (r->pool,
1419 r->flt64_cnt, sizeof *var_by_value_idx);
1420 for (i = 0; i < dict_get_var_cnt (dict); i++)
1422 struct variable *v = dict_get_var (dict, i);
1423 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1426 var_by_value_idx[value_idx++] = v;
1427 for (j = 1; j < nv; j++)
1428 var_by_value_idx[value_idx++] = NULL;
1430 assert (value_idx == r->flt64_cnt);
1432 return var_by_value_idx;
1435 /* Returns the "struct variable" corresponding to the given
1436 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
1438 static struct variable *
1439 lookup_var_by_value_idx (struct sfm_reader *r,
1440 struct variable **var_by_value_idx, int value_idx)
1442 struct variable *var;
1444 if (value_idx < 1 || value_idx > r->flt64_cnt)
1445 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1446 value_idx, r->flt64_cnt);
1448 var = var_by_value_idx[value_idx - 1];
1450 sys_error (r, _("Variable index %d refers to long string "
1457 /* Returns the variable in D with the given SHORT_NAME,
1458 or a null pointer if there is none. */
1459 static struct variable *
1460 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1462 struct variable *var;
1466 /* First try looking up by full name. This often succeeds. */
1467 var = dict_lookup_var (d, short_name);
1468 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1471 /* Iterate through the whole dictionary as a fallback. */
1472 var_cnt = dict_get_var_cnt (d);
1473 for (i = 0; i < var_cnt; i++)
1475 var = dict_get_var (d, i);
1476 if (!strcasecmp (var_get_short_name (var), short_name))
1483 /* Helpers for reading records that contain "variable=value"
1487 struct variable_to_value_map
1489 struct substring buffer; /* Record contents. */
1490 size_t pos; /* Current position in buffer. */
1493 /* Reads SIZE bytes into a "variable=value" map for R,
1494 and returns the map. */
1495 static struct variable_to_value_map *
1496 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1498 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1499 char *buffer = pool_malloc (r->pool, size + 1);
1500 read_bytes (r, buffer, size);
1501 map->buffer = ss_buffer (buffer, size);
1506 /* Closes MAP and frees its storage.
1507 Not really needed, because the pool will free the map anyway,
1508 but can be used to free it earlier. */
1510 close_variable_to_value_map (struct sfm_reader *r,
1511 struct variable_to_value_map *map)
1513 pool_free (r->pool, ss_data (map->buffer));
1516 /* Reads the next variable=value pair from MAP.
1517 Looks up the variable in DICT and stores it into *VAR.
1518 Stores a null-terminated value into *VALUE. */
1520 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1521 struct variable_to_value_map *map,
1522 struct variable **var, char **value,
1525 int max_warnings = 5;
1529 struct substring short_name_ss, value_ss;
1531 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1532 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1535 if (*warning_cnt > max_warnings)
1536 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1537 *warning_cnt - max_warnings);
1541 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1542 ss_buffer ("\t\0", 2));
1544 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1545 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1548 if (++*warning_cnt <= 5)
1549 sys_warn (r, _("Variable map refers to unknown variable %s."),
1550 ss_data (short_name_ss));
1554 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1555 *value = ss_data (value_ss);
1563 /* Displays a corruption message. */
1565 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1570 ds_init_empty (&text);
1571 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1572 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1573 ds_put_vformat (&text, format, args);
1575 m.category = msg_class_to_category (class);
1576 m.severity = msg_class_to_severity (class);
1577 m.where.file_name = NULL;
1578 m.where.line_number = 0;
1579 m.text = ds_cstr (&text);
1584 /* Displays a warning for the current file position. */
1586 sys_warn (struct sfm_reader *r, const char *format, ...)
1590 va_start (args, format);
1591 sys_msg (r, MW, format, args);
1595 /* Displays an error for the current file position,
1596 marks it as in an error state,
1597 and aborts reading it using longjmp. */
1599 sys_error (struct sfm_reader *r, const char *format, ...)
1603 va_start (args, format);
1604 sys_msg (r, ME, format, args);
1608 longjmp (r->bail_out, 1);
1611 /* Reads BYTE_CNT bytes into BUF.
1612 Returns true if exactly BYTE_CNT bytes are successfully read.
1613 Aborts if an I/O error or a partial read occurs.
1614 If EOF_IS_OK, then an immediate end-of-file causes false to be
1615 returned; otherwise, immediate end-of-file causes an abort
1618 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1619 void *buf, size_t byte_cnt)
1621 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1622 if (bytes_read == byte_cnt)
1624 else if (ferror (r->file))
1625 sys_error (r, _("System error: %s."), strerror (errno));
1626 else if (!eof_is_ok || bytes_read != 0)
1627 sys_error (r, _("Unexpected end of file."));
1632 /* Reads BYTE_CNT bytes into BUF.
1633 Aborts upon I/O error or if end-of-file is encountered. */
1635 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1637 read_bytes_internal (r, false, buf, byte_cnt);
1640 /* Reads BYTE_CNT bytes into BUF.
1641 Returns true if exactly BYTE_CNT bytes are successfully read.
1642 Returns false if an immediate end-of-file is encountered.
1643 Aborts if an I/O error or a partial read occurs. */
1645 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1647 return read_bytes_internal (r, true, buf, byte_cnt);
1650 /* Reads a 32-bit signed integer from R and returns its value in
1653 read_int32 (struct sfm_reader *r)
1656 read_bytes (r, int32, sizeof int32);
1657 return int32_to_native (r, int32);
1660 /* Reads a 64-bit floating-point number from R and returns its
1661 value in host format. */
1663 read_flt64 (struct sfm_reader *r)
1666 read_bytes (r, flt64, sizeof flt64);
1667 return flt64_to_double (r, flt64);
1670 /* Reads exactly SIZE - 1 bytes into BUFFER
1671 and stores a null byte into BUFFER[SIZE - 1]. */
1673 read_string (struct sfm_reader *r, char *buffer, size_t size)
1676 read_bytes (r, buffer, size - 1);
1677 buffer[size - 1] = '\0';
1680 /* Skips BYTES bytes forward in R. */
1682 skip_bytes (struct sfm_reader *r, size_t bytes)
1687 size_t chunk = MIN (sizeof buffer, bytes);
1688 read_bytes (r, buffer, chunk);
1693 /* Returns the value of the 32-bit signed integer at INT32,
1694 converted from the format used by R to the host format. */
1696 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
1699 if (r->integer_format == INTEGER_NATIVE)
1700 memcpy (&x, int32, sizeof x);
1702 x = integer_get (r->integer_format, int32, sizeof x);
1706 /* Returns the value of the 64-bit floating point number at
1707 FLT64, converted from the format used by R to the host
1710 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
1713 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1714 memcpy (&x, flt64, sizeof x);
1716 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);
1720 static struct casereader_class sys_file_casereader_class =
1722 sys_file_casereader_read,
1723 sys_file_casereader_destroy,