/* PSPP - computes sample statistics.
   Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA. */
21 #include "sys-file-reader.h"
22 #include "sys-file-private.h"
30 #include <libpspp/alloc.h>
31 #include <libpspp/assertion.h>
32 #include <libpspp/message.h>
33 #include <libpspp/compiler.h>
34 #include <libpspp/magic.h>
35 #include <libpspp/misc.h>
36 #include <libpspp/pool.h>
37 #include <libpspp/str.h>
38 #include <libpspp/hash.h>
39 #include <libpspp/array.h>
42 #include "dictionary.h"
43 #include "file-handle-def.h"
44 #include "file-name.h"
46 #include "missing-values.h"
47 #include "value-labels.h"
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
74 enum integer_format integer_format; /* On-disk integer format. */
75 enum float_format float_format; /* On-disk floating point format. */
76 int value_cnt; /* Number of 8-byte units per case. */
77 struct sfm_var *vars; /* Variables. */
78 size_t var_cnt; /* Number of variables. */
79 bool has_long_var_names; /* File has a long variable name map */
80 bool has_vls; /* File has one or more very long strings? */
83 bool compressed; /* File is compressed? */
84 double bias; /* Compression bias, usually 100.0. */
85 uint8_t opcodes[8]; /* Current block of opcodes. */
86 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
/* A variable in a system file. */
struct sfm_var
  {
    int width;                  /* 0=numeric, otherwise string width. */
    int case_index;             /* Index into case. */
  };
96 static struct variable **make_var_by_value_idx (struct sfm_reader *,
98 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
102 static void sys_warn (struct sfm_reader *, const char *, ...)
103 PRINTF_FORMAT (2, 3);
105 static void sys_error (struct sfm_reader *, const char *, ...)
109 static void read_bytes (struct sfm_reader *, void *, size_t);
110 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
111 static int32_t read_int32 (struct sfm_reader *);
112 static double read_flt64 (struct sfm_reader *);
113 static void read_string (struct sfm_reader *, char *, size_t);
114 static void skip_bytes (struct sfm_reader *, size_t);
116 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
117 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
119 static struct variable_to_value_map *open_variable_to_value_map (
120 struct sfm_reader *, size_t size);
121 static void close_variable_to_value_map (struct sfm_reader *r,
122 struct variable_to_value_map *);
123 static bool read_variable_to_value_map (struct sfm_reader *,
125 struct variable_to_value_map *,
126 struct variable **var, char **value,
129 /* Dictionary reader. */
137 static void read_header (struct sfm_reader *, struct dictionary *,
138 int *weight_idx, int *claimed_value_cnt,
139 struct sfm_read_info *);
140 static void read_variable_record (struct sfm_reader *, struct dictionary *,
141 int *format_warning_cnt);
142 static void parse_format_spec (struct sfm_reader *, uint32_t,
143 enum which_format, struct variable *,
144 int *format_warning_cnt);
145 static void setup_weight (struct sfm_reader *, int weight_idx,
146 struct variable **var_by_value_idx,
147 struct dictionary *);
148 static void read_documents (struct sfm_reader *, struct dictionary *);
149 static void read_value_labels (struct sfm_reader *, struct dictionary *,
150 struct variable **var_by_value_idx);
152 static void read_extension_record (struct sfm_reader *, struct dictionary *);
153 static void read_machine_int32_info (struct sfm_reader *,
154 size_t size, size_t count);
155 static void read_machine_flt64_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *info)
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 int format_warning_cnt = 0;
180 int claimed_value_cnt;
184 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
187 *dict = dict_create ();
189 /* Create and initialize reader. */
190 r = pool_create_container (struct sfm_reader, pool);
192 r->file = fn_open (fh_get_file_name (fh), "rb");
196 r->has_long_var_names = false;
197 r->opcode_idx = sizeof r->opcodes;
199 if (setjmp (r->bail_out))
201 sfm_close_reader (r);
202 dict_destroy (*dict);
209 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
210 fh_get_file_name (r->fh), strerror (errno));
211 longjmp (r->bail_out, 1);
215 read_header (r, *dict, &weight_idx, &claimed_value_cnt, info);
217 /* Read all the variable definition records. */
218 rec_type = read_int32 (r);
219 while (rec_type == 2)
221 read_variable_record (r, *dict, &format_warning_cnt);
222 rec_type = read_int32 (r);
225 /* Figure out the case format. */
226 var_by_value_idx = make_var_by_value_idx (r, *dict);
227 setup_weight (r, weight_idx, var_by_value_idx, *dict);
229 /* Read all the rest of the dictionary records. */
230 while (rec_type != 999)
235 read_value_labels (r, *dict, var_by_value_idx);
239 sys_error (r, _("Misplaced type 4 record."));
242 read_documents (r, *dict);
246 read_extension_record (r, *dict);
250 sys_error (r, _("Unrecognized record type %d."), rec_type);
252 rec_type = read_int32 (r);
256 if ( ! r->has_long_var_names )
259 for (i = 0; i < dict_get_var_cnt (*dict); i++)
261 struct variable *var = dict_get_var (*dict, i);
262 char short_name [SHORT_NAME_LEN + 1];
263 char long_name [SHORT_NAME_LEN + 1];
265 strcpy (short_name, var_get_name (var));
267 strcpy (long_name, short_name);
268 str_lowercase (long_name);
270 /* Set long name. Renaming a variable may clear the short
271 name, but we want to retain it, so re-set it
273 dict_rename_var (*dict, var, long_name);
274 var_set_short_name (var, short_name);
277 r->has_long_var_names = true;
280 /* Read record 999 data, which is just filler. */
283 if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt)
284 sys_warn (r, _("File header claims %d variable positions but "
285 "%d were read from file."),
286 claimed_value_cnt, r->value_cnt);
288 /* Create an index of dictionary variable widths for
289 sfm_read_case to use. We cannot use the `struct variable's
290 from the dictionary we created, because the caller owns the
291 dictionary and may destroy or modify its variables. */
292 r->var_cnt = dict_get_var_cnt (*dict);
293 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
294 for (i = 0; i < r->var_cnt; i++)
296 struct variable *v = dict_get_var (*dict, i);
297 struct sfm_var *sv = &r->vars[i];
298 sv->width = var_get_width (v);
299 sv->case_index = var_get_case_index (v);
302 pool_free (r->pool, var_by_value_idx);
306 /* Closes a system file after we're done with it. */
308 sfm_close_reader (struct sfm_reader *r)
315 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
316 msg (ME, _("Error closing system file \"%s\": %s."),
317 fh_get_file_name (r->fh), strerror (errno));
322 fh_close (r->fh, "system file", "rs");
324 pool_destroy (r->pool);
327 /* Returns true if an I/O error has occurred on READER, false
330 sfm_read_error (const struct sfm_reader *reader)
332 return reader->error;
/* Returns true if FILE is an SPSS system file,
   false otherwise.

   Reads the first four bytes at FILE's current position and
   compares them with the "$FL2" signature.  A file too short to
   supply four bytes is not a system file. */
bool
sfm_detect (FILE *file)
{
  char rec_type[5];

  if (fread (rec_type, 4, 1, file) != 1)
    return false;
  rec_type[4] = '\0';           /* fread() does not terminate the buffer. */

  return !strcmp ("$FL2", rec_type);
}
349 /* Reads the global header of the system file.
350 Sets DICT's file label to the system file's label.
351 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
352 or to the value index of the weight variable otherwise.
353 Sets *CLAIMED_VALUE_CNT to the number of values that the file
354 claims to have (although it is not always correct).
355 If INFO is non-null, initializes *INFO with header
358 read_header (struct sfm_reader *r, struct dictionary *dict,
359 int *weight_idx, int *claimed_value_cnt,
360 struct sfm_read_info *info)
363 char eye_catcher[61];
364 uint8_t raw_layout_code[4];
367 char creation_date[10];
368 char creation_time[9];
370 struct substring file_label_ss;
372 read_string (r, rec_type, sizeof rec_type);
373 read_string (r, eye_catcher, sizeof eye_catcher);
375 if (strcmp ("$FL2", rec_type) != 0)
376 sys_error (r, _("This is not an SPSS system file."));
378 /* Identify integer format. */
379 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
380 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
382 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
384 || (r->integer_format != INTEGER_MSB_FIRST
385 && r->integer_format != INTEGER_LSB_FIRST))
386 sys_error (r, _("This is not an SPSS system file."));
388 *claimed_value_cnt = read_int32 (r);
389 if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16)
390 *claimed_value_cnt = -1;
392 r->compressed = read_int32 (r) != 0;
394 *weight_idx = read_int32 (r);
396 case_cnt = read_int32 (r);
397 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
400 /* Identify floating-point format and obtain compression bias. */
401 read_bytes (r, raw_bias, sizeof raw_bias);
402 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
404 sys_warn (r, _("Compression bias (%g) is not the usual "
405 "value of 100, or system file uses unrecognized "
406 "floating-point format."),
408 if (r->integer_format == INTEGER_MSB_FIRST)
409 r->float_format = FLOAT_IEEE_DOUBLE_BE;
411 r->float_format = FLOAT_IEEE_DOUBLE_LE;
413 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
415 read_string (r, creation_date, sizeof creation_date);
416 read_string (r, creation_time, sizeof creation_time);
417 read_string (r, file_label, sizeof file_label);
420 file_label_ss = ss_cstr (file_label);
421 ss_trim (&file_label_ss, ss_cstr (" "));
422 if (!ss_is_empty (file_label_ss))
424 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
425 dict_set_label (dict, ss_data (file_label_ss));
430 struct substring product;
432 strcpy (info->creation_date, creation_date);
433 strcpy (info->creation_time, creation_time);
434 info->integer_format = r->integer_format;
435 info->float_format = r->float_format;
436 info->compressed = r->compressed;
437 info->case_cnt = case_cnt;
439 product = ss_cstr (eye_catcher);
440 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
441 ss_trim (&product, ss_cstr (" "));
442 str_copy_buf_trunc (info->product, sizeof info->product,
443 ss_data (product), ss_length (product));
447 /* Reads a variable (type 2) record from R and adds the
448 corresponding variable to DICT.
449 Also skips past additional variable records for long string
452 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
453 int *format_warning_cnt)
456 int has_variable_label;
457 int missing_value_code;
462 struct variable *var;
465 width = read_int32 (r);
466 has_variable_label = read_int32 (r);
467 missing_value_code = read_int32 (r);
468 print_format = read_int32 (r);
469 write_format = read_int32 (r);
470 read_string (r, name, sizeof name);
471 name[strcspn (name, " ")] = '\0';
473 /* Check variable name. */
474 if (name[0] == '$' || name[0] == '#')
475 sys_error (r, "Variable name begins with invalid character `%c'.",
477 if (!var_is_plausible_name (name, false))
478 sys_error (r, _("Invalid variable name `%s'."), name);
480 /* Create variable. */
481 if (width < 0 || width > 255)
482 sys_error (r, _("Bad variable width %d."), width);
483 var = dict_create_var (dict, name, width);
486 _("Duplicate variable name `%s' within system file."),
489 /* Set the short name the same as the long name */
490 var_set_short_name (var, var_get_name (var));
492 /* Get variable label, if any. */
493 if (has_variable_label != 0 && has_variable_label != 1)
494 sys_error (r, _("Variable label indicator field is not 0 or 1."));
495 if (has_variable_label == 1)
500 len = read_int32 (r);
501 if (len >= sizeof label)
502 sys_error (r, _("Variable %s has label of invalid length %u."),
503 name, (unsigned int) len);
504 read_string (r, label, len + 1);
505 var_set_label (var, label);
507 skip_bytes (r, ROUND_UP (len, 4) - len);
510 /* Set missing values. */
511 if (missing_value_code < -3 || missing_value_code > 3
512 || missing_value_code == -1)
513 sys_error (r, _("Missing value indicator field is not "
514 "-3, -2, 0, 1, 2, or 3."));
515 if (missing_value_code != 0)
517 struct missing_values mv;
518 mv_init (&mv, var_get_width (var));
519 if (var_is_numeric (var))
521 if (missing_value_code > 0)
524 for (i = 0; i < missing_value_code; i++)
525 mv_add_num (&mv, read_flt64 (r));
529 double low = read_flt64 (r);
530 double high = read_flt64 (r);
531 mv_add_num_range (&mv, low, high);
532 if (missing_value_code == -3)
533 mv_add_num (&mv, read_flt64 (r));
536 else if (var_get_width (var) <= MAX_SHORT_STRING)
538 if (missing_value_code > 0)
541 for (i = 0; i < missing_value_code; i++)
544 read_string (r, string, sizeof string);
545 mv_add_str (&mv, string);
549 sys_error (r, _("String variable %s may not have missing "
550 "values specified as a range."),
553 else /* var->width > MAX_SHORT_STRING */
554 sys_error (r, _("Long string variable %s may not have missing "
557 var_set_missing_values (var, &mv);
561 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
562 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
564 /* Account for values.
565 Skip long string continuation records, if any. */
566 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
572 for (i = 1; i < nv; i++)
574 /* Check for record type 2 and width -1. */
575 if (read_int32 (r) != 2 || read_int32 (r) != -1)
576 sys_error (r, _("Missing string continuation record."));
578 /* Skip and ignore remaining continuation data. */
579 has_variable_label = read_int32 (r);
580 missing_value_code = read_int32 (r);
581 print_format = read_int32 (r);
582 write_format = read_int32 (r);
583 read_string (r, name, sizeof name);
585 /* Variable label fields on continuation records have
586 been spotted in system files created by "SPSS Power
587 Macintosh Release 6.1". */
588 if (has_variable_label)
589 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
594 /* Translates the format spec from sysfile format to internal
597 parse_format_spec (struct sfm_reader *r, uint32_t s,
598 enum which_format which, struct variable *v,
599 int *format_warning_cnt)
601 const int max_format_warnings = 8;
603 uint8_t raw_type = s >> 16;
609 if (!fmt_from_io (raw_type, &f.type))
610 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
615 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
620 if (which == PRINT_FORMAT)
621 var_set_print_format (v, &f);
623 var_set_write_format (v, &f);
625 else if (*++format_warning_cnt <= max_format_warnings)
627 char fmt_string[FMT_STRING_LEN_MAX + 1];
628 sys_warn (r, _("%s variable %s has invalid %s format %s."),
629 var_is_numeric (v) ? _("Numeric") : _("String"),
631 which == PRINT_FORMAT ? _("print") : _("write"),
632 fmt_to_string (&f, fmt_string));
634 if (*format_warning_cnt == max_format_warnings)
635 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Sets the weighting variable in DICT to the variable
   corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
   nonzero.  A non-numeric weighting variable is a fatal error. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  if (weight_idx != 0)
    {
      struct variable *weight_var
        = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
      if (var_is_numeric (weight_var))
        dict_set_weight (dict, weight_var);
      else
        sys_error (r, _("Weighting variable must be numeric."));
    }
}
657 /* Reads a document record, type 6, from system file R, and sets up
658 the documents and n_documents fields in the associated
661 read_documents (struct sfm_reader *r, struct dictionary *dict)
666 if (dict_get_documents (dict) != NULL)
667 sys_error (r, _("Multiple type 6 (document) records."));
669 line_cnt = read_int32 (r);
671 sys_error (r, _("Number of document lines (%d) "
672 "must be greater than 0."), line_cnt);
674 documents = pool_nmalloc (r->pool, line_cnt + 1, 80);
675 read_string (r, documents, 80 * line_cnt + 1);
676 dict_set_documents (dict, documents);
677 pool_free (r->pool, documents);
/* Read a type 7 extension record.
   Reads the subtype, element size and element count, then either
   hands the record to a subtype-specific reader or skips its
   SIZE * COUNT bytes of data. */
static void
read_extension_record (struct sfm_reader *r, struct dictionary *dict)
{
  int subtype = read_int32 (r);
  size_t size = read_int32 (r);
  size_t count = read_int32 (r);
  size_t bytes = size * count;

  /* Check that SIZE * COUNT + 1 doesn't overflow.  Adding 1
     allows an extra byte for a null terminator, used by some
     extension processing routines. */
  if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
    sys_error (r, "Record type 7 subtype %d too large.", subtype);

  switch (subtype)
    {
    case 3:
      read_machine_int32_info (r, size, count);
      return;

    case 4:
      read_machine_flt64_info (r, size, count);
      return;

    case 5:
      /* Variable sets information.  We don't use these yet.
         They only apply to GUIs; see VARSETS on the APPLY
         DICTIONARY command in SPSS documentation. */
      break;

    case 6:
      /* DATE variable information.  We don't use it yet, but we
         should. */
      break;

    case 7:
      /* Unknown purpose. */
      break;

    case 11:
      read_display_parameters (r, size, count, dict);
      return;

    case 13:
      read_long_var_name_map (r, size, count, dict);
      return;

    case 14:
      read_long_string_map (r, size, count, dict);
      return;

    case 16:
      /* New in SPSS v14?  Unknown purpose.  */
      break;

    case 17:
      /* Text field that defines variable attributes.  New in
         SPSS 14. */
      break;

    default:
      sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
      break;
    }

  /* Subtypes that are recognized but unused, or unknown, are
     skipped whole. */
  skip_bytes (r, bytes);
}
749 /* Read record type 7, subtype 3. */
751 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
753 int version_major UNUSED = read_int32 (r);
754 int version_minor UNUSED = read_int32 (r);
755 int version_revision UNUSED = read_int32 (r);
756 int machine_code UNUSED = read_int32 (r);
757 int float_representation = read_int32 (r);
758 int compression_code UNUSED = read_int32 (r);
759 int integer_representation = read_int32 (r);
760 int character_code UNUSED = read_int32 (r);
762 int expected_float_format;
763 int expected_integer_format;
765 if (size != 4 || count != 8)
766 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
768 (unsigned int) size, (unsigned int) count);
770 /* Check floating point format. */
771 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
772 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
773 expected_float_format = 1;
774 else if (r->float_format == FLOAT_Z_LONG)
775 expected_float_format = 2;
776 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
777 expected_float_format = 3;
780 if (float_representation != expected_float_format)
781 sys_error (r, _("Floating-point representation indicated by "
782 "system file (%d) differs from expected (%d)."),
783 r->float_format, expected_float_format);
785 /* Check integer format. */
786 if (r->integer_format == INTEGER_MSB_FIRST)
787 expected_integer_format = 1;
788 else if (r->integer_format == INTEGER_LSB_FIRST)
789 expected_integer_format = 2;
792 if (integer_representation != expected_integer_format)
794 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
795 sys_warn (r, _("Integer format indicated by system file (%s) "
796 "differs from expected (%s)."),
797 gettext (endian[integer_representation == 1]),
798 gettext (endian[expected_integer_format == 1]));
802 /* Read record type 7, subtype 4. */
804 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
806 double sysmis = read_flt64 (r);
807 double highest = read_flt64 (r);
808 double lowest = read_flt64 (r);
810 if (size != 8 || count != 3)
811 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
812 (unsigned int) size, (unsigned int) count);
814 if (sysmis != SYSMIS)
815 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
816 if (highest != HIGHEST)
817 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
818 if (lowest != LOWEST)
819 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
822 /* Read record type 7, subtype 11, which specifies how variables
823 should be displayed in GUI environments. */
825 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
826 struct dictionary *dict)
828 const size_t n_vars = count / 3 ;
832 if (count % 3 || n_vars != dict_get_var_cnt (dict))
833 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
834 (unsigned int) size, (unsigned int) count);
836 for (i = 0; i < n_vars; ++i)
838 int measure = read_int32 (r);
839 int width = read_int32 (r);
840 int align = read_int32 (r);
841 struct variable *v = dict_get_var (dict, i);
843 /* spss v14 sometimes seems to set string variables' measure to zero */
844 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
847 if (measure < 1 || measure > 3 || align < 0 || align > 2)
850 sys_warn (r, _("Invalid variable display parameters. "
851 "Default parameters substituted."));
856 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
857 : measure == 2 ? MEASURE_ORDINAL
859 var_set_display_width (v, width);
860 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
861 : align == 1 ? ALIGN_RIGHT
866 /* Reads record type 7, subtype 13, which gives the long name
867 that corresponds to each short name. Modifies variable names
868 in DICT accordingly. */
870 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
871 struct dictionary *dict)
873 struct variable_to_value_map *map;
874 struct variable *var;
878 map = open_variable_to_value_map (r, size * count);
879 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
882 char short_name[SHORT_NAME_LEN + 1];
883 strcpy (short_name, var_get_short_name (var));
885 /* Validate long name. */
886 if (!var_is_valid_name (long_name, false))
888 sys_warn (r, _("Long variable mapping from %s to invalid "
889 "variable name `%s'."),
890 var_get_name (var), long_name);
894 /* Identify any duplicates. */
895 if (strcasecmp (short_name, long_name)
896 && dict_lookup_var (dict, long_name) != NULL)
898 sys_warn (r, _("Duplicate long variable name `%s' "
899 "within system file."), long_name);
903 /* Set long name. Renaming a variable may clear the short
904 name, but we want to retain it, so re-set it
906 dict_rename_var (dict, var, long_name);
907 var_set_short_name (var, short_name);
909 close_variable_to_value_map (r, map);
910 r->has_long_var_names = true;
913 /* Reads record type 7, subtype 14, which gives the real length
914 of each very long string. Rearranges DICT accordingly. */
916 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
917 struct dictionary *dict)
919 struct variable_to_value_map *map;
920 struct variable *var;
926 map = open_variable_to_value_map (r, size * count);
927 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
930 long length, remaining_length;
934 length = strtol (length_s, NULL, 10);
935 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
937 sys_warn (r, _("%s listed as string of length %s "
939 var_get_name (var), length_s);
943 /* Group multiple variables into single variable
944 and delete all but the first. */
945 remaining_length = length;
946 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
947 if (idx < dict_get_var_cnt (dict))
948 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
949 EFFECTIVE_LONG_STRING_LENGTH);
951 sys_error (r, _("Very long string %s overflows dictionary."),
953 dict_delete_consecutive_vars (dict,
954 var_get_dict_index (var) + 1,
955 idx - var_get_dict_index (var) - 1);
957 /* Assign all the length to the first variable. */
958 var_set_width (var, length);
960 close_variable_to_value_map (r, map);
961 dict_compact_values (dict);
964 /* Reads value labels from sysfile H and inserts them into the
965 associated dictionary. */
967 read_value_labels (struct sfm_reader *r,
968 struct dictionary *dict, struct variable **var_by_value_idx)
970 struct pool *subpool;
974 char raw_value[8]; /* Value as uninterpreted bytes. */
975 union value value; /* Value. */
976 char *label; /* Null-terminated label string. */
979 struct label *labels = NULL;
980 int label_cnt; /* Number of labels. */
982 struct variable **var = NULL; /* Associated variables. */
983 int var_cnt; /* Number of associated variables. */
987 subpool = pool_create_subpool (r->pool);
989 /* Read the type 3 record and record its contents. We can't do
990 much with the data yet because we don't know whether it is
991 of numeric or string type. */
993 /* Read number of labels. */
994 label_cnt = read_int32 (r);
996 if (label_cnt >= INT32_MAX / sizeof *labels)
998 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1003 /* Read each value/label tuple into labels[]. */
1004 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1005 for (i = 0; i < label_cnt; i++)
1007 struct label *label = labels + i;
1008 unsigned char label_len;
1012 read_bytes (r, label->raw_value, sizeof label->raw_value);
1014 /* Read label length. */
1015 read_bytes (r, &label_len, sizeof label_len);
1016 padded_len = ROUND_UP (label_len + 1, 8);
1018 /* Read label, padding. */
1019 label->label = pool_alloc (subpool, padded_len + 1);
1020 read_bytes (r, label->label, padded_len - 1);
1021 label->label[label_len] = 0;
1024 /* Now, read the type 4 record that has the list of variables
1025 to which the value labels are to be applied. */
1027 /* Read record type of type 4 record. */
1028 if (read_int32 (r) != 4)
1029 sys_error (r, _("Variable index record (type 4) does not immediately "
1030 "follow value label record (type 3) as it should."));
1032 /* Read number of variables associated with value label from type 4
1034 var_cnt = read_int32 (r);
1035 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1036 sys_error (r, _("Number of variables associated with a value label (%d) "
1037 "is not between 1 and the number of variables (%u)."),
1038 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1040 /* Read the list of variables. */
1041 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1042 for (i = 0; i < var_cnt; i++)
1044 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1045 if (var_is_long_string (var[i]))
1046 sys_error (r, _("Value labels are not allowed on long string "
1047 "variables (%s)."), var_get_name (var[i]));
1050 /* Type check the variables. */
1051 for (i = 1; i < var_cnt; i++)
1052 if (var_get_type (var[i]) != var_get_type (var[0]))
1053 sys_error (r, _("Variables associated with value label are not all of "
1054 "identical type. Variable %s is %s, but variable "
1056 var_get_name (var[0]),
1057 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1058 var_get_name (var[i]),
1059 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1061 /* Fill in labels[].value, now that we know the desired type. */
1062 for (i = 0; i < label_cnt; i++)
1064 struct label *label = labels + i;
1066 if (var_is_alpha (var[0]))
1067 buf_copy_rpad (label->value.s, sizeof label->value.s,
1068 label->raw_value, sizeof label->raw_value);
1070 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1073 /* Assign the `value_label's to each variable. */
1074 for (i = 0; i < var_cnt; i++)
1076 struct variable *v = var[i];
1079 /* Add each label to the variable. */
1080 for (j = 0; j < label_cnt; j++)
1082 struct label *label = &labels[j];
1083 if (!var_add_value_label (v, &label->value, label->label))
1085 if (var_is_numeric (var[0]))
1086 sys_warn (r, _("Duplicate value label for %g on %s."),
1087 label->value.f, var_get_name (v));
1089 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1090 var_get_width (v), label->value.s,
1096 pool_destroy (subpool);
1101 static void partial_record (struct sfm_reader *r)
1103 static bool read_case_number (struct sfm_reader *, double *);
1104 static bool read_case_string (struct sfm_reader *, char *, size_t);
1105 static int read_opcode (struct sfm_reader *);
1106 static bool read_compressed_number (struct sfm_reader *, double *);
1107 static bool read_compressed_string (struct sfm_reader *, char *);
1108 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1110 /* Reads one case from READER's file into C. Returns nonzero
1111 only if successful. */
1113 sfm_read_case (struct sfm_reader *r, struct ccase *c)
1118 if (setjmp (r->bail_out))
1121 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1123 /* Fast path. Read the whole case directly. */
1124 if (!try_read_bytes (r, case_data_all_rw (c),
1125 sizeof (union value) * r->value_cnt))
1128 /* Convert floating point numbers to native format if needed. */
1129 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1133 for (i = 0; i < r->var_cnt; i++)
1134 if (r->vars[i].width == 0)
1136 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1137 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1144 /* Slow path. Convert from external to internal format. */
1147 for (i = 0; i < r->var_cnt; i++)
1149 struct sfm_var *sv = &r->vars[i];
1150 union value *v = case_data_rw_idx (c, sv->case_index);
1154 if (!read_case_number (r, &v->f))
1159 /* Read the string data in segments up to 255 bytes
1160 at a time, packed into 8-byte units. */
1161 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1162 int ofs, chunk_size;
1163 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1165 chunk_size = MIN (max_chunk, sv->width - ofs);
1166 if (!read_case_string (r, v->s + ofs, chunk_size))
1174 /* Very long strings have trailing wasted space
1175 that we must skip. */
1176 if (sv->width >= MIN_VERY_LONG_STRING)
1178 int bytes_read = (sv->width / max_chunk * 256
1179 + ROUND_UP (sv->width % max_chunk, 8));
1180 int total_bytes = sfm_width_to_bytes (sv->width);
1181 int excess_bytes = total_bytes - bytes_read;
1183 while (excess_bytes > 0)
1186 size_t chunk = MIN (sizeof buffer, excess_bytes);
1187 if (!read_whole_strings (r, buffer, chunk))
1189 excess_bytes -= chunk;
/* Issues an error that R ends in a partial record. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1210 /* Reads a number from R and stores its value in *D.
1211 If R is compressed, reads a compressed number;
1212 otherwise, reads a number in the regular way.
1213 Returns true if successful, false if end of file is
1214 reached immediately. */
1216 read_case_number (struct sfm_reader *r, double *d)
1221 if (!try_read_bytes (r, flt64, sizeof flt64))
1223 *d = flt64_to_double (r, flt64);
1227 return read_compressed_number (r, d);
1230 /* Reads LENGTH string bytes from R into S.
1231 Always reads a multiple of 8 bytes; if LENGTH is not a
1232 multiple of 8, then extra bytes are read and discarded without
being stored in S.
1234 Reads compressed strings if R is compressed.
1235 Returns true if successful, false if end of file is
1236 reached immediately. */
1238 read_case_string (struct sfm_reader *r, char *s, size_t length)
/* Split LENGTH into the part that is a multiple of 8 and the
   remaining 0 to 7 bytes. */
1240 size_t whole = ROUND_DOWN (length, 8);
1241 size_t partial = length % 8;
/* The whole 8-byte units are read directly into S. */
1245 if (!read_whole_strings (r, s, whole))
/* The last unit is read in full into a scratch buffer, because only
   its first PARTIAL bytes belong in S. */
1252 if (!read_whole_strings (r, bounce, sizeof bounce))
/* Keep the leading PARTIAL bytes; the rest of the unit is dropped. */
1258 memcpy (s + whole, bounce, partial);
1264 /* Reads and returns the next compression opcode from R. */
1266 read_opcode (struct sfm_reader *r)
/* Opcodes are only meaningful when R is compressed. */
1268 assert (r->compressed);
/* When every opcode in R's opcode buffer has been consumed, refill the
   whole buffer from the file. */
1272 if (r->opcode_idx >= sizeof r->opcodes)
1274 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
/* Take the next buffered opcode and advance past it. */
1278 opcode = r->opcodes[r->opcode_idx++];
1285 /* Reads a compressed number from R and stores its value in D.
1286 Returns true if successful, false if end of file is
1287 reached immediately. */
1289 read_compressed_number (struct sfm_reader *r, double *d)
/* The opcode decides how the number is represented. */
1291 int opcode = read_opcode (r);
/* In this branch the number follows in the file as a 64-bit
   floating-point value; read_flt64 aborts on error or end of file. */
1299 *d = read_flt64 (r);
/* In this branch the opcode is not valid for a number; sys_error does
   not return. */
1303 sys_error (r, _("Compressed data is corrupt."));
/* In this branch the opcode itself encodes the number, offset by R's
   bias. */
1310 *d = opcode - r->bias;
1317 /* Reads a compressed 8-byte string segment from R and stores it
in DST.
1319 Returns true if successful, false if end of file is
1320 reached immediately. */
1322 read_compressed_string (struct sfm_reader *r, char *dst)
/* The next opcode decides how the segment is represented. */
1324 switch (read_opcode (r))
/* In this branch the 8 bytes of the segment are stored literally in
   the file; read_bytes aborts on error or end of file. */
1331 read_bytes (r, dst, 8);
/* In this branch the segment consists of 8 spaces. */
1335 memset (dst, ' ', 8);
/* In this branch the opcode is not valid for a string segment;
   sys_error does not return. */
1339 sys_error (r, _("Compressed data is corrupt."));
1345 /* Reads LENGTH string bytes from R into S.
1346 LENGTH must be a multiple of 8.
1347 Reads compressed strings if R is compressed.
1348 Returns true if successful, false if end of file is
1349 reached immediately. */
1351 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
/* The compressed path below works in 8-byte segments, so a LENGTH that
   is not a multiple of 8 is a caller error. */
1353 assert (length % 8 == 0);
/* Raw data: all LENGTH bytes are read in a single request. */
1355 return try_read_bytes (r, s, length);
/* Compressed data: decode one 8-byte segment at a time into
   successive positions in S. */
1359 for (ofs = 0; ofs < length; ofs += 8)
1360 if (!read_compressed_string (r, s + ofs))
1370 /* Creates and returns a table that can be used for translating a value
1371 index into a case to a "struct variable *" for DICT. Multiple
1372 system file fields reference variables this way.
1374 This table must be created before processing the very long
1375 string extension record, because that record causes some
1376 values to be deleted from the case and the dictionary to be
compacted. */
1378 static struct variable **
1379 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1381 struct variable **var_by_value_idx;
/* One table entry per value in a case (R's value_cnt entries),
   allocated from R's pool. */
1385 var_by_value_idx = pool_nmalloc (r->pool,
1386 r->value_cnt, sizeof *var_by_value_idx);
1387 for (i = 0; i < dict_get_var_cnt (dict); i++)
1389 struct variable *v = dict_get_var (dict, i);
/* A numeric variable takes one entry; a string variable takes one
   entry per 8 bytes of width, rounded up. */
1390 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* The first entry of a variable points at the variable; any further
   entries of the same variable are null.
   NOTE(review): no bound check on value_idx is visible before these
   stores, and the assertion below only runs after all of them --
   confirm the dictionary cannot need more than value_cnt entries. */
1393 var_by_value_idx[value_idx++] = v;
1394 for (j = 1; j < nv; j++)
1395 var_by_value_idx[value_idx++] = NULL;
/* The variables must account for exactly value_cnt entries. */
1397 assert (value_idx == r->value_cnt);
1399 return var_by_value_idx;
1402 /* Returns the "struct variable" corresponding to the given
1403 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
is in range. */
1405 static struct variable *
1406 lookup_var_by_value_idx (struct sfm_reader *r,
1407 struct variable **var_by_value_idx, int value_idx)
1409 struct variable *var;
/* VALUE_IDX counts from 1, so valid indexes run from 1 through R's
   value_cnt.  sys_error does not return, so nothing below runs with
   an out-of-range index. */
1411 if (value_idx < 1 || value_idx > r->value_cnt)
1412 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1413 value_idx, r->value_cnt);
/* Convert to a 0-based position in the table. */
1415 var = var_by_value_idx[value_idx - 1];
1417 sys_error (r, _("Variable index %d refers to long string "
1424 /* Returns the variable in D with the given SHORT_NAME,
1425 or a null pointer if there is none. */
1426 static struct variable *
1427 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1429 struct variable *var;
1433 /* First try looking up by full name. This often succeeds. */
1434 var = dict_lookup_var (d, short_name);
/* A hit by full name counts only if that variable's short name also
   equals SHORT_NAME, compared without regard to case. */
1435 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1438 /* Iterate through the whole dictionary as a fallback. */
1439 var_cnt = dict_get_var_cnt (d);
1440 for (i = 0; i < var_cnt; i++)
1442 var = dict_get_var (d, i);
/* Same case-insensitive short-name comparison as above. */
1443 if (!strcasecmp (var_get_short_name (var), short_name))
1450 /* Helpers for reading records that contain "variable=value"
pairs. */
/* Read cursor over the contents of one record: the record's bytes plus
   the position up to which they have been consumed. */
1454 struct variable_to_value_map
1456 struct substring buffer; /* Record contents. */
1457 size_t pos; /* Current position in buffer. */
1460 /* Reads SIZE bytes into a "variable=value" map for R,
1461 and returns the map. */
1462 static struct variable_to_value_map *
1463 open_variable_to_value_map (struct sfm_reader *r, size_t size)
/* Both the map and its record buffer are allocated from R's pool. */
1465 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated.
   NOTE(review): presumably this leaves room for the null byte that
   read_variable_to_value_map stores just past the end of a value --
   confirm. */
1466 char *buffer = pool_malloc (r->pool, size + 1);
/* read_bytes aborts on I/O error or end of file, so BUFFER holds SIZE
   bytes after this call. */
1467 read_bytes (r, buffer, size);
/* The substring covers exactly the SIZE bytes that were read. */
1468 map->buffer = ss_buffer (buffer, size);
1473 /* Closes MAP and frees its storage.
1474 Not really needed, because the pool will free the map anyway,
1475 but can be used to free it earlier. */
1477 close_variable_to_value_map (struct sfm_reader *r,
1478 struct variable_to_value_map *map)
/* Returns the record buffer to R's pool.
   NOTE(review): only the buffer is released here; the map structure
   allocated by open_variable_to_value_map appears to stay in the pool
   until the pool itself is destroyed -- confirm that is intended. */
1480 pool_free (r->pool, ss_data (map->buffer));
1483 /* Reads the next variable=value pair from MAP.
1484 Looks up the variable in DICT and stores it into *VAR.
1485 Stores a null-terminated value into *VALUE. */
1487 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1488 struct variable_to_value_map *map,
1489 struct variable **var, char **value,
/* Limit used when reporting how many unknown-variable warnings were
   suppressed. */
1492 int max_warnings = 5;
1496 struct substring short_name_ss, value_ss;
/* Tokenize a short name delimited by '=' and then a value delimited by
   a tab or a null byte, advancing MAP's position. */
1498 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1499 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
/* If more warnings were counted than the limit, say how many were not
   shown. */
1502 if (*warning_cnt > max_warnings)
1503 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1504 *warning_cnt - max_warnings);
/* Step over tab and null bytes that follow the value (presumably
   ss_span measures the leading run of such bytes -- confirm). */
1508 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1509 ss_buffer ("\t\0", 2));
/* Null-terminate the short name in place inside the record buffer so
   that it can be passed as a C string. */
1511 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1512 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Every unknown variable is counted, but the warning is displayed only
   for the first 5.
   NOTE(review): the literal 5 duplicates max_warnings; if one changes
   without the other, the "Suppressed" count above becomes wrong. */
1515 if (++*warning_cnt <= 5)
1516 sys_warn (r, _("Variable map refers to unknown variable %s."),
1517 ss_data (short_name_ss));
/* Null-terminate the value in place as well and hand back a pointer
   into the record buffer. */
1521 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1522 *value = ss_data (value_ss);
1530 /* Displays a corruption message. */
1532 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1537 ds_init_empty (&text);
/* Prefix the message with the file's name and the stream's current
   offset in hexadecimal.
   NOTE(review): ftell returns -1 on failure, which the cast to
   unsigned long would print as a very large offset. */
1538 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1539 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
/* Append the caller's formatted text. */
1540 ds_put_vformat (&text, format, args);
/* Category and severity are derived from CLASS; no source file name or
   line number is attached to the message. */
1542 m.category = msg_class_to_category (class);
1543 m.severity = msg_class_to_severity (class);
1544 m.where.file_name = NULL;
1545 m.where.line_number = 0;
1546 m.text = ds_cstr (&text);
1551 /* Displays a warning for the current file position. */
1553 sys_warn (struct sfm_reader *r, const char *format, ...)
/* Collect the printf-style arguments that follow FORMAT. */
1557 va_start (args, format);
/* MW is the message class passed for warnings. */
1558 sys_msg (r, MW, format, args);
1562 /* Displays an error for the current file position,
1563 marks it as in an error state,
1564 and aborts reading it using longjmp. */
1566 sys_error (struct sfm_reader *r, const char *format, ...)
/* Collect the printf-style arguments that follow FORMAT. */
1570 va_start (args, format);
/* ME is the message class passed for errors. */
1571 sys_msg (r, ME, format, args);
/* Transfer control to the jump target saved in R; this call does not
   return to the caller of sys_error. */
1575 longjmp (r->bail_out, 1);
1578 /* Reads BYTE_CNT bytes into BUF.
1579 Returns true if exactly BYTE_CNT bytes are successfully read.
1580 Aborts if an I/O error or a partial read occurs.
1581 If EOF_IS_OK, then an immediate end-of-file causes false to be
1582 returned; otherwise, immediate end-of-file causes an abort
too. */
1585 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1586 void *buf, size_t byte_cnt)
/* With an element size of 1, fread's result is the number of bytes
   actually read. */
1588 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
/* Everything requested was read. */
1589 if (bytes_read == byte_cnt)
/* Short read because of a stream error.
   NOTE(review): the message relies on fread having set errno, which
   POSIX specifies but ISO C does not require. */
1591 else if (ferror (r->file))
1592 sys_error (r, _("System error: %s."), strerror (errno));
/* Short read because of end of file.  This is tolerated only when
   EOF_IS_OK is set and not a single byte was read; a partial read is
   always an error. */
1593 else if (!eof_is_ok || bytes_read != 0)
1594 sys_error (r, _("Unexpected end of file."));
1599 /* Reads BYTE_CNT bytes into BUF.
1600 Aborts upon I/O error or if end-of-file is encountered. */
1602 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* EOF_IS_OK is false, so every failure aborts inside the callee; its
   return value is therefore not examined. */
1604 read_bytes_internal (r, false, buf, byte_cnt);
1607 /* Reads BYTE_CNT bytes into BUF.
1608 Returns true if exactly BYTE_CNT bytes are successfully read.
1609 Returns false if an immediate end-of-file is encountered.
1610 Aborts if an I/O error or a partial read occurs. */
1612 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* EOF_IS_OK is true, so an immediate end of file is reported through
   the return value instead of aborting. */
1614 return read_bytes_internal (r, true, buf, byte_cnt);
1617 /* Reads a 32-bit signed integer from R and returns its value in
host format. */
1620 read_int32 (struct sfm_reader *r)
/* read_bytes aborts on I/O error or end of file, so the buffer is
   completely filled whenever execution continues. */
1623 read_bytes (r, int32, sizeof int32);
/* Convert from R's on-disk integer format to the host's. */
1624 return int32_to_native (r, int32);
1627 /* Reads a 64-bit floating-point number from R and returns its
1628 value in host format. */
1630 read_flt64 (struct sfm_reader *r)
/* read_bytes aborts on I/O error or end of file, so the buffer is
   completely filled whenever execution continues. */
1633 read_bytes (r, flt64, sizeof flt64);
/* Convert from R's on-disk floating-point format to a host double. */
1634 return flt64_to_double (r, flt64);
1637 /* Reads exactly SIZE - 1 bytes into BUFFER
1638 and stores a null byte into BUFFER[SIZE - 1]. */
1640 read_string (struct sfm_reader *r, char *buffer, size_t size)
/* SIZE counts the terminator, so it must be at least 1.
   NOTE(review): with SIZE == 0 the unsigned expression SIZE - 1 would
   wrap around; confirm callers never pass 0 or that a guard exists. */
1643 read_bytes (r, buffer, size - 1);
1644 buffer[size - 1] = '\0';
1647 /* Skips BYTES bytes forward in R. */
1649 skip_bytes (struct sfm_reader *r, size_t bytes)
/* Skipping is done by reading into a scratch buffer and ignoring the
   data, taking at most one buffer's worth per read_bytes call. */
1654 size_t chunk = MIN (sizeof buffer, bytes);
1655 read_bytes (r, buffer, chunk);
1660 /* Returns the value of the 32-bit signed integer at INT32,
1661 converted from the format used by R to the host format. */
1663 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* The file already uses the host's integer format: copy the bytes
   unchanged.  memcpy makes no assumption about INT32's alignment. */
1666 if (r->integer_format == INTEGER_NATIVE)
1667 memcpy (&x, int32, sizeof x);
/* Otherwise decode the bytes according to R's integer format. */
1669 x = integer_get (r->integer_format, int32, sizeof x);
1673 /* Returns the value of the 64-bit floating point number at
1674 FLT64, converted from the format used by R to the host
format. */
1677 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
1680 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1681 memcpy (&x, flt64, sizeof x);
1683 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);