1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include "sys-file-reader.h"
22 #include "sys-file-private.h"
30 #include <libpspp/alloc.h>
31 #include <libpspp/assertion.h>
32 #include <libpspp/message.h>
33 #include <libpspp/compiler.h>
34 #include <libpspp/magic.h>
35 #include <libpspp/misc.h>
36 #include <libpspp/pool.h>
37 #include <libpspp/str.h>
38 #include <libpspp/hash.h>
39 #include <libpspp/array.h>
42 #include "dictionary.h"
43 #include "file-handle-def.h"
44 #include "file-name.h"
46 #include "missing-values.h"
47 #include "value-labels.h"
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
/* Per-file reader state.  sfm_open_reader and sfm_read_case arm
   `bail_out' with setjmp, and sfm_close_reader destroys `pool'.
   NOTE(review): the line that opens this struct is not shown in this
   listing; the rest of the file refers to it as struct sfm_reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
/* File state. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
/* File format, as determined while reading the dictionary. */
74 enum integer_format integer_format; /* On-disk integer format. */
75 enum float_format float_format; /* On-disk floating point format. */
76 int value_cnt; /* Number of 8-byte units per case. */
77 struct sfm_var *vars; /* Variables. */
78 size_t var_cnt; /* Number of variables. */
79 bool has_long_var_names; /* File has a long variable name map */
80 bool has_vls; /* File has one or more very long strings? */
/* Decompression state, used only when `compressed' is set. */
83 bool compressed; /* File is compressed? */
84 double bias; /* Compression bias, usually 100.0. */
85 uint8_t opcodes[8]; /* Current block of opcodes. */
86 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
89 /* A variable in a system file. */
/* sfm_open_reader fills in one of these per dictionary variable so
   that sfm_read_case can work from the reader's own copy of each
   width and case index instead of the caller-owned dictionary. */
92 int width; /* 0=numeric, otherwise string width. */
93 int case_index; /* Index into case. */
96 static struct variable **make_var_by_value_idx (struct sfm_reader *,
98 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
102 static void sys_warn (struct sfm_reader *, const char *, ...)
103 PRINTF_FORMAT (2, 3);
105 static void sys_error (struct sfm_reader *, const char *, ...)
109 static void read_bytes (struct sfm_reader *, void *, size_t);
110 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
111 static int32_t read_int32 (struct sfm_reader *);
112 static double read_flt64 (struct sfm_reader *);
113 static void read_string (struct sfm_reader *, char *, size_t);
114 static void skip_bytes (struct sfm_reader *, size_t);
116 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
117 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
119 static struct variable_to_value_map *open_variable_to_value_map (
120 struct sfm_reader *, size_t size);
121 static void close_variable_to_value_map (struct sfm_reader *r,
122 struct variable_to_value_map *);
123 static bool read_variable_to_value_map (struct sfm_reader *,
125 struct variable_to_value_map *,
126 struct variable **var, char **value,
129 /* Dictionary reader. */
137 static void read_header (struct sfm_reader *, struct dictionary *,
138 int *weight_idx, int *claimed_value_cnt,
139 struct sfm_read_info *);
140 static void read_variable_record (struct sfm_reader *, struct dictionary *,
141 int *format_warning_cnt);
142 static void parse_format_spec (struct sfm_reader *, uint32_t,
143 enum which_format, struct variable *,
144 int *format_warning_cnt);
145 static void setup_weight (struct sfm_reader *, int weight_idx,
146 struct variable **var_by_value_idx,
147 struct dictionary *);
148 static void read_documents (struct sfm_reader *, struct dictionary *);
149 static void read_value_labels (struct sfm_reader *, struct dictionary *,
150 struct variable **var_by_value_idx);
152 static void read_extension_record (struct sfm_reader *, struct dictionary *);
153 static void read_machine_int32_info (struct sfm_reader *,
154 size_t size, size_t count);
155 static void read_machine_flt64_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
/* sfm_open_reader: opens FH, creates *DICT, and reads dictionary
   records into it until record type 999 is seen, then records each
   variable's width and case index in the reader for sfm_read_case.
   Fatal problems arrive via longjmp at the setjmp below, whose
   handler closes the reader and destroys *DICT.
   NOTE(review): this listing omits lines (return type, braces, the
   record-type dispatch labels, return statements), so the values
   returned on success and failure cannot be confirmed from here. */
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *info)
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 int format_warning_cnt = 0;
180 int claimed_value_cnt;
184 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
187 *dict = dict_create ();
189 /* Create and initialize reader. */
190 r = pool_create_container (struct sfm_reader, pool);
192 r->file = fn_open (fh_get_file_name (fh), "rb");
196 r->has_long_var_names = false;
/* An index equal to the buffer size means no opcodes are buffered. */
197 r->opcode_idx = sizeof r->opcodes;
/* Arm the error target.  A nonzero return means something below
   called longjmp (for example the open-failure path). */
199 if (setjmp (r->bail_out))
201 sfm_close_reader (r);
202 dict_destroy (*dict);
209 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
210 fh_get_file_name (r->fh), strerror (errno));
211 longjmp (r->bail_out, 1);
215 read_header (r, *dict, &weight_idx, &claimed_value_cnt, info);
217 /* Read all the variable definition records. */
218 rec_type = read_int32 (r);
219 while (rec_type == 2)
221 read_variable_record (r, *dict, &format_warning_cnt);
222 rec_type = read_int32 (r);
225 /* Figure out the case format. */
226 var_by_value_idx = make_var_by_value_idx (r, *dict);
227 setup_weight (r, weight_idx, var_by_value_idx, *dict);
229 /* Read all the rest of the dictionary records. */
230 while (rec_type != 999)
235 read_value_labels (r, *dict, var_by_value_idx);
239 sys_error (r, _("Misplaced type 4 record."));
242 read_documents (r, *dict);
246 read_extension_record (r, *dict);
250 sys_error (r, _("Unrecognized record type %d."), rec_type);
252 rec_type = read_int32 (r);
/* No long variable name map was read: give every variable a
   lower-cased copy of its short name as its long name. */
256 if ( ! r->has_long_var_names )
259 for (i = 0; i < dict_get_var_cnt (*dict); i++)
261 struct variable *var = dict_get_var (*dict, i);
262 char short_name [SHORT_NAME_LEN + 1];
263 char long_name [SHORT_NAME_LEN + 1];
264 char *s = short_name;
267 strcpy (short_name, var_get_name (var));
269 strcpy (long_name, short_name);
270 str_lowercase (long_name);
272 /* Set long name. Renaming a variable may clear the short
273 name, but we want to retain it, so re-set it
275 dict_rename_var (*dict, var, long_name);
276 var_set_short_name (var, short_name);
279 r->has_long_var_names = true;
282 /* Read record 999 data, which is just filler. */
/* read_header stores -1 when the header's count was out of range;
   in that case there is nothing to compare against. */
285 if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt)
286 sys_warn (r, _("File header claims %d variable positions but "
287 "%d were read from file."),
288 claimed_value_cnt, r->value_cnt);
290 /* Create an index of dictionary variable widths for
291 sfm_read_case to use. We cannot use the `struct variable's
292 from the dictionary we created, because the caller owns the
293 dictionary and may destroy or modify its variables. */
294 r->var_cnt = dict_get_var_cnt (*dict);
295 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
296 for (i = 0; i < r->var_cnt; i++)
298 struct variable *v = dict_get_var (*dict, i);
299 struct sfm_var *sv = &r->vars[i];
300 sv->width = var_get_width (v);
301 sv->case_index = var_get_case_index (v);
/* The value-index table is only used while reading the dictionary
   (by setup_weight and read_value_labels above). */
304 pool_free (r->pool, var_by_value_idx);
308 /* Closes a system file after we're done with it. */
/* A failure to close the stream is reported with msg but does not
   stop the cleanup.  The file handle is released with the same
   "system file"/"rs" strings that sfm_open_reader passes to fh_open.
   Destroying the pool comes last; presumably that also frees R
   itself, which sfm_open_reader creates with pool_create_container
   (verify against the pool implementation).
   NOTE(review): lines between the signature and the fn_close call
   are not shown in this listing. */
310 sfm_close_reader (struct sfm_reader *r)
317 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
318 msg (ME, _("Error closing system file \"%s\": %s."),
319 fh_get_file_name (r->fh), strerror (errno));
324 fh_close (r->fh, "system file", "rs");
326 pool_destroy (r->pool);
/* sfm_read_error: accessor that returns READER's `error' flag
   unchanged; it does not clear or otherwise modify the reader. */
329 /* Returns true if an I/O error has occurred on READER, false
332 sfm_read_error (const struct sfm_reader *reader)
334 return reader->error;
/* sfm_detect: reads 4 bytes from FILE's current position and
   compares them with the "$FL2" signature.  A short read takes the
   fread failure branch.
   NOTE(review): the declaration of rec_type and the statement that
   null-terminates it before strcmp are not shown in this listing;
   confirm the buffer holds at least 5 bytes and is terminated. */
337 /* Returns true if FILE is an SPSS system file,
340 sfm_detect (FILE *file)
344 if (fread (rec_type, 4, 1, file) != 1)
348 return !strcmp ("$FL2", rec_type);
/* read_header: parses the file header record in field order:
   signature and eye-catcher, layout code, claimed value count,
   compression flag, weight index, case count, compression bias,
   creation date and time, and file label.  The signature and layout
   code are validated with fatal errors; the counts and the bias are
   only sanity-checked.
   NOTE(review): several declarations (rec_type, raw_bias,
   file_label, case_cnt) and a few statements are not shown in this
   listing. */
351 /* Reads the global header of the system file.
352 Sets DICT's file label to the system file's label.
353 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
354 or to the value index of the weight variable otherwise.
355 Sets *CLAIMED_VALUE_CNT to the number of values that the file
356 claims to have (although it is not always correct).
357 If INFO is non-null, initializes *INFO with header
360 read_header (struct sfm_reader *r, struct dictionary *dict,
361 int *weight_idx, int *claimed_value_cnt,
362 struct sfm_read_info *info)
365 char eye_catcher[61];
366 uint8_t raw_layout_code[4];
369 char creation_date[10];
370 char creation_time[9];
372 struct substring file_label_ss;
374 read_string (r, rec_type, sizeof rec_type);
375 read_string (r, eye_catcher, sizeof eye_catcher);
377 if (strcmp ("$FL2", rec_type) != 0)
378 sys_error (r, _("This is not an SPSS system file."));
380 /* Identify integer format. */
381 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
/* The layout code must decode as 2 or 3, and the byte order found
   must be plain big- or little-endian. */
382 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
384 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
386 || (r->integer_format != INTEGER_MSB_FIRST
387 && r->integer_format != INTEGER_LSB_FIRST))
388 sys_error (r, _("This is not an SPSS system file."));
390 *claimed_value_cnt = read_int32 (r);
/* An implausible count is remembered as -1; sfm_open_reader skips
   its consistency warning for that value. */
391 if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16)
392 *claimed_value_cnt = -1;
394 r->compressed = read_int32 (r) != 0;
396 *weight_idx = read_int32 (r);
398 case_cnt = read_int32 (r);
399 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
402 /* Identify floating-point format and obtain compression bias. */
403 read_bytes (r, raw_bias, sizeof raw_bias);
404 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
406 sys_warn (r, _("Compression bias (%g) is not the usual "
407 "value of 100, or system file uses unrecognized "
408 "floating-point format."),
/* The bias did not identify a format: assume IEEE double in the
   same byte order as the integers. */
410 if (r->integer_format == INTEGER_MSB_FIRST)
411 r->float_format = FLOAT_IEEE_DOUBLE_BE;
413 r->float_format = FLOAT_IEEE_DOUBLE_LE;
415 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
417 read_string (r, creation_date, sizeof creation_date);
418 read_string (r, creation_time, sizeof creation_time);
419 read_string (r, file_label, sizeof file_label);
422 file_label_ss = ss_cstr (file_label);
423 ss_trim (&file_label_ss, ss_cstr (" "));
424 if (!ss_is_empty (file_label_ss))
/* Terminate the label at its trimmed length so it can be passed on
   as a C string. */
426 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
427 dict_set_label (dict, ss_data (file_label_ss));
432 struct substring product;
434 strcpy (info->creation_date, creation_date);
435 strcpy (info->creation_time, creation_time);
436 info->integer_format = r->integer_format;
437 info->float_format = r->float_format;
438 info->compressed = r->compressed;
439 info->case_cnt = case_cnt;
/* The product name is the eye-catcher with the fixed prefix and any
   surrounding spaces removed, truncated to fit info->product. */
441 product = ss_cstr (eye_catcher);
442 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
443 ss_trim (&product, ss_cstr (" "));
444 str_copy_buf_trunc (info->product, sizeof info->product,
445 ss_data (product), ss_length (product));
/* read_variable_record: reads the body of one type 2 record (the
   caller has already consumed the record type) and creates the
   corresponding variable in DICT.

   R is the reader, used for all input and for errors and warnings.
   DICT receives the new variable.
   FORMAT_WARNING_CNT is passed through to parse_format_spec for the
   print and write formats.

   In order: width, label flag, missing-value code, print format,
   write format and name are read; the name and width are validated;
   the optional label and missing values are read; then the
   continuation records that follow a string wider than 8 bytes are
   consumed.  All validation failures are fatal via sys_error.

   Change: the "Variable name begins with invalid character" message
   is now marked with _() like every other message in this function,
   so that it is translated.

   NOTE(review): this listing omits lines (return type, braces,
   several declarations, some argument lines); only the visible
   lines are reproduced. */
449 /* Reads a variable (type 2) record from R and adds the
450 corresponding variable to DICT.
451 Also skips past additional variable records for long string
454 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
455 int *format_warning_cnt)
458 int has_variable_label;
459 int missing_value_code;
464 struct variable *var;
467 width = read_int32 (r);
468 has_variable_label = read_int32 (r);
469 missing_value_code = read_int32 (r);
470 print_format = read_int32 (r);
471 write_format = read_int32 (r);
472 read_string (r, name, sizeof name);
473 name[strcspn (name, " ")] = '\0';
475 /* Check variable name. */
476 if (name[0] == '$' || name[0] == '#')
477 sys_error (r, _("Variable name begins with invalid character `%c'."),
479 if (!var_is_plausible_name (name, false))
480 sys_error (r, _("Invalid variable name `%s'."), name);
482 /* Create variable. */
483 if (width < 0 || width > 255)
484 sys_error (r, _("Bad variable width %d."), width);
485 var = dict_create_var (dict, name, width);
488 _("Duplicate variable name `%s' within system file."),
491 /* Set the short name the same as the long name */
492 var_set_short_name (var, var_get_name (var));
494 /* Get variable label, if any. */
495 if (has_variable_label != 0 && has_variable_label != 1)
496 sys_error (r, _("Variable label indicator field is not 0 or 1."));
497 if (has_variable_label == 1)
502 len = read_int32 (r);
503 if (len >= sizeof label)
504 sys_error (r, _("Variable %s has label of invalid length %u."),
505 name, (unsigned int) len);
506 read_string (r, label, len + 1);
507 var_set_label (var, label);
509 skip_bytes (r, ROUND_UP (len, 4) - len);
512 /* Set missing values. */
513 if (missing_value_code < -3 || missing_value_code > 3
514 || missing_value_code == -1)
515 sys_error (r, _("Missing value indicator field is not "
516 "-3, -2, 0, 1, 2, or 3."));
517 if (missing_value_code != 0)
519 struct missing_values mv;
520 mv_init (&mv, var_get_width (var));
521 if (var_is_numeric (var))
523 if (missing_value_code > 0)
526 for (i = 0; i < missing_value_code; i++)
527 mv_add_num (&mv, read_flt64 (r));
531 double low = read_flt64 (r);
532 double high = read_flt64 (r);
533 mv_add_num_range (&mv, low, high);
534 if (missing_value_code == -3)
535 mv_add_num (&mv, read_flt64 (r));
538 else if (var_get_width (var) <= MAX_SHORT_STRING)
540 if (missing_value_code > 0)
543 for (i = 0; i < missing_value_code; i++)
546 read_string (r, string, sizeof string);
547 mv_add_str (&mv, string);
551 sys_error (r, _("String variable %s may not have missing "
552 "values specified as a range."),
555 else /* var->width > MAX_SHORT_STRING */
556 sys_error (r, _("Long string variable %s may not have missing "
559 var_set_missing_values (var, &mv);
563 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
564 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
566 /* Account for values.
567 Skip long string continuation records, if any. */
568 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
574 for (i = 1; i < nv; i++)
576 /* Check for record type 2 and width -1. */
577 if (read_int32 (r) != 2 || read_int32 (r) != -1)
578 sys_error (r, _("Missing string continuation record."));
580 /* Skip and ignore remaining continuation data. */
581 has_variable_label = read_int32 (r);
582 missing_value_code = read_int32 (r);
583 print_format = read_int32 (r);
584 write_format = read_int32 (r);
585 read_string (r, name, sizeof name);
587 /* Variable label fields on continuation records have
588 been spotted in system files created by "SPSS Power
589 Macintosh Release 6.1". */
590 if (has_variable_label)
591 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
/* parse_format_spec: decodes the packed format word S and, if it is
   valid for V, installs it as V's print or write format.

   R is the reader, used for errors and warnings.
   S is the raw format word from the variable record; bits 16-23
   hold the format type code.
   WHICH selects the print format (PRINT_FORMAT) or the write format.
   V is the variable being set up.
   FORMAT_WARNING_CNT points at a counter shared by all calls made
   for one file.  Only the first max_format_warnings invalid formats
   are reported, and the last of those is followed by a notice that
   further warnings are suppressed.

   An unknown type code is fatal.  A known but unusable format only
   produces a (rate-limited) warning, and V's format is left alone.

   Fix: the counter used to be advanced with
   `*++format_warning_cnt', which increments the POINTER and then
   reads the object after the caller's single int (undefined
   behavior), so the count never grew.  It is now
   `++*format_warning_cnt', which increments the pointed-to count.

   NOTE(review): this listing omits lines (return type, braces,
   declarations and the decoding of the width and decimals fields);
   only the visible lines are reproduced. */
596 /* Translates the format spec from sysfile format to internal
599 parse_format_spec (struct sfm_reader *r, uint32_t s,
600 enum which_format which, struct variable *v,
601 int *format_warning_cnt)
603 const int max_format_warnings = 8;
605 uint8_t raw_type = s >> 16;
611 if (!fmt_from_io (raw_type, &f.type))
612 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
617 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
622 if (which == PRINT_FORMAT)
623 var_set_print_format (v, &f);
625 var_set_write_format (v, &f);
627 else if (++*format_warning_cnt <= max_format_warnings)
629 char fmt_string[FMT_STRING_LEN_MAX + 1];
630 sys_warn (r, _("%s variable %s has invalid %s format %s."),
631 var_is_numeric (v) ? _("Numeric") : _("String"),
633 which == PRINT_FORMAT ? _("print") : _("write"),
634 fmt_to_string (&f, fmt_string));
636 if (*format_warning_cnt == max_format_warnings)
637 sys_warn (r, _("Suppressing further invalid format warnings."));
/* setup_weight: resolves WEIGHT_IDX (called VALUE_IDX in the
   original comment below) through VAR_BY_VALUE_IDX and installs the
   resulting variable as DICT's weight.  A non-numeric target is a
   fatal sys_error.
   NOTE(review): read_header documents a weight index of 0 as
   "unweighted"; the guard that skips the lookup in that case is not
   shown in this listing. */
641 /* Sets the weighting variable in DICT to the variable
642 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
645 setup_weight (struct sfm_reader *r, int weight_idx,
646 struct variable **var_by_value_idx, struct dictionary *dict)
650 struct variable *weight_var
651 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
652 if (var_is_numeric (weight_var))
653 dict_set_weight (dict, weight_var);
655 sys_error (r, _("Weighting variable must be numeric."));
/* read_documents: reads the line count, then 80 bytes per line, and
   hands the text to dict_set_documents.  A second document record
   in the same file is fatal.
   The buffer is pool-allocated with room for one extra 80-byte line
   and freed right after dict_set_documents; presumably the
   dictionary keeps its own copy (verify).
   NOTE(review): `80 * line_cnt + 1' is computed in int.  Only the
   message for a non-positive count is visible in this listing, so
   confirm that very large counts are rejected before the
   multiplication can overflow. */
659 /* Reads a document record, type 6, from system file R, and sets up
660 the documents and n_documents fields in the associated
663 read_documents (struct sfm_reader *r, struct dictionary *dict)
668 if (dict_get_documents (dict) != NULL)
669 sys_error (r, _("Multiple type 6 (document) records."));
671 line_cnt = read_int32 (r);
673 sys_error (r, _("Number of document lines (%d) "
674 "must be greater than 0."), line_cnt);
676 documents = pool_nmalloc (r->pool, line_cnt + 1, 80);
677 read_string (r, documents, 80 * line_cnt + 1);
678 dict_set_documents (dict, documents);
679 pool_free (r->pool, documents);
/* read_extension_record: reads the subtype, element size and
   element count of a type 7 record, then either passes the record
   to a subtype-specific reader or skips SIZE * COUNT bytes.

   R is the reader.  DICT is handed to the subtype readers that
   modify the dictionary (display parameters, long variable names,
   very long strings).

   A record whose SIZE * COUNT + 1 would overflow size_t is fatal.
   An unrecognized subtype only produces a warning.

   Change: the "too large" message is now marked with _() like the
   other messages in this function, so that it is translated.

   NOTE(review): this listing omits lines (return type, braces, the
   switch and its case labels), so which subtype number selects
   which reader cannot be confirmed from here. */
682 /* Read a type 7 extension record. */
684 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
686 int subtype = read_int32 (r);
687 size_t size = read_int32 (r);
688 size_t count = read_int32 (r);
689 size_t bytes = size * count;
691 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
692 allows an extra byte for a null terminator, used by some
693 extension processing routines. */
694 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
695 sys_error (r, _("Record type 7 subtype %d too large."), subtype);
700 read_machine_int32_info (r, size, count);
704 read_machine_flt64_info (r, size, count);
708 /* Variable sets information. We don't use these yet.
709 They only apply to GUIs; see VARSETS on the APPLY
710 DICTIONARY command in SPSS documentation. */
714 /* DATE variable information. We don't use it yet, but we
719 /* Unknown purpose. */
723 read_display_parameters (r, size, count, dict);
727 read_long_var_name_map (r, size, count, dict);
731 read_long_string_map (r, size, count, dict);
735 /* New in SPSS v14? Unknown purpose. */
739 /* Text field that defines variable attributes. New in
744 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
748 skip_bytes (r, bytes);
751 /* Read record type 7, subtype 3. */
/* read_machine_int32_info: reads eight int32 fields describing the
   machine that wrote the file and cross-checks two of them against
   the formats already detected from the file header.

   R is the reader.  SIZE and COUNT are the element size and count
   from the extension record header; they must be 4 and 8.

   A floating-point representation code that disagrees with
   r->float_format is fatal.  An integer representation code that
   disagrees with r->integer_format only produces a warning.  The
   other six fields are read and ignored.

   Fix: the floating-point mismatch message announces the value
   "indicated by system file" but used to print r->float_format,
   the reader's internal enum.  It now prints float_representation,
   the code actually read from the record.

   NOTE(review): all eight fields are read before SIZE and COUNT are
   validated.  This listing omits lines (return type, braces, the
   final `else' branches); only the visible lines are reproduced. */
753 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
755 int version_major UNUSED = read_int32 (r);
756 int version_minor UNUSED = read_int32 (r);
757 int version_revision UNUSED = read_int32 (r);
758 int machine_code UNUSED = read_int32 (r);
759 int float_representation = read_int32 (r);
760 int compression_code UNUSED = read_int32 (r);
761 int integer_representation = read_int32 (r);
762 int character_code UNUSED = read_int32 (r);
764 int expected_float_format;
765 int expected_integer_format;
767 if (size != 4 || count != 8)
768 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
770 (unsigned int) size, (unsigned int) count);
772 /* Check floating point format. */
773 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
774 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
775 expected_float_format = 1;
776 else if (r->float_format == FLOAT_Z_LONG)
777 expected_float_format = 2;
778 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
779 expected_float_format = 3;
782 if (float_representation != expected_float_format)
783 sys_error (r, _("Floating-point representation indicated by "
784 "system file (%d) differs from expected (%d)."),
785 float_representation, expected_float_format);
787 /* Check integer format. */
788 if (r->integer_format == INTEGER_MSB_FIRST)
789 expected_integer_format = 1;
790 else if (r->integer_format == INTEGER_LSB_FIRST)
791 expected_integer_format = 2;
794 if (integer_representation != expected_integer_format)
/* Code 1 means big-endian (it is what MSB-first maps to above); any
   other code is described as little-endian in the message. */
796 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
797 sys_warn (r, _("Integer format indicated by system file (%s) "
798 "differs from expected (%s)."),
799 gettext (endian[integer_representation == 1]),
800 gettext (endian[expected_integer_format == 1]));
804 /* Read record type 7, subtype 4. */
/* Reads the file's SYSMIS, HIGHEST and LOWEST values and warns
   about each one that differs from this program's own constant.
   The values read are not stored anywhere.  SIZE must be 8 and
   COUNT must be 3, otherwise the record is rejected with sys_error.
   NOTE(review): the three values are read before SIZE and COUNT are
   validated. */
806 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
808 double sysmis = read_flt64 (r);
809 double highest = read_flt64 (r);
810 double lowest = read_flt64 (r);
812 if (size != 8 || count != 3)
813 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
814 (unsigned int) size, (unsigned int) count);
816 if (sysmis != SYSMIS)
817 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
818 if (highest != HIGHEST)
819 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
820 if (lowest != LOWEST)
821 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
824 /* Read record type 7, subtype 11, which specifies how variables
825 should be displayed in GUI environments. */
/* The record holds three int32 values per dictionary variable, in
   dictionary order: measurement level, display width, alignment.
   COUNT must therefore be exactly three times the number of
   variables.  SIZE appears in the error message, but the visible
   code checks only COUNT.
   NOTE(review): this listing omits lines (return type, braces, the
   remainder of the invalid-parameter branch and the final arms of
   the two conditional expressions). */
827 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
828 struct dictionary *dict)
830 const size_t n_vars = count / 3 ;
834 if (count % 3 || n_vars != dict_get_var_cnt (dict))
835 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
836 (unsigned int) size, (unsigned int) count);
838 for (i = 0; i < n_vars; ++i)
840 int measure = read_int32 (r);
841 int width = read_int32 (r);
842 int align = read_int32 (r);
843 struct variable *v = dict_get_var (dict, i);
845 /* spss v14 sometimes seems to set string variables' measure to zero */
846 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
/* Valid codes are 1..3 for measure and 0..2 for alignment. */
849 if (measure < 1 || measure > 3 || align < 0 || align > 2)
852 sys_warn (r, _("Invalid variable display parameters. "
853 "Default parameters substituted."));
858 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
859 : measure == 2 ? MEASURE_ORDINAL
861 var_set_display_width (v, width);
862 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
863 : align == 1 ? ALIGN_RIGHT
868 /* Reads record type 7, subtype 13, which gives the long name
869 that corresponds to each short name. Modifies variable names
870 in DICT accordingly. */
/* Each entry yielded by read_variable_to_value_map pairs a variable
   with its long name.  Invalid or duplicate long names produce a
   warning.  A valid name is applied with dict_rename_var, after
   which the short name is restored.  Finally the reader is flagged
   so that sfm_open_reader does not synthesize long names itself.
   NOTE(review): this listing omits lines, including whatever follows
   each warning (presumably the entry is skipped; verify). */
872 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
873 struct dictionary *dict)
875 struct variable_to_value_map *map;
876 struct variable *var;
880 map = open_variable_to_value_map (r, size * count);
881 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
884 char short_name[SHORT_NAME_LEN + 1];
885 strcpy (short_name, var_get_short_name (var));
887 /* Validate long name. */
888 if (!var_is_valid_name (long_name, false))
890 sys_warn (r, _("Long variable mapping from %s to invalid "
891 "variable name `%s'."),
892 var_get_name (var), long_name);
896 /* Identify any duplicates. */
/* A long name equal to the short name ignoring case is exempt from
   the lookup, presumably because the lookup would find VAR itself. */
897 if (strcasecmp (short_name, long_name)
898 && dict_lookup_var (dict, long_name) != NULL)
900 sys_warn (r, _("Duplicate long variable name `%s' "
901 "within system file."), long_name);
905 /* Set long name. Renaming a variable may clear the short
906 name, but we want to retain it, so re-set it
908 dict_rename_var (dict, var, long_name);
909 var_set_short_name (var, short_name);
911 close_variable_to_value_map (r, map);
912 r->has_long_var_names = true;
915 /* Reads record type 7, subtype 14, which gives the real length
916 of each very long string. Rearranges DICT accordingly. */
/* Each entry pairs a variable with its true length, given as a
   decimal string.  Starting at VAR, consecutive dictionary variables
   are consumed until their combined widths cover that length.  All
   of them except VAR are then deleted and VAR is widened to the
   full length.  The dictionary's values are compacted once, after
   every entry has been processed.
   NOTE(review): this listing omits lines (return type, braces, some
   declarations and what follows the warning). */
918 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
919 struct dictionary *dict)
921 struct variable_to_value_map *map;
922 struct variable *var;
928 map = open_variable_to_value_map (r, size * count);
929 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
932 long length, remaining_length;
936 length = strtol (length_s, NULL, 10);
/* strtol returns LONG_MAX when the value is out of range. */
937 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
939 sys_warn (r, _("%s listed as string of length %s "
941 var_get_name (var), length_s);
945 /* Group multiple variables into single variable
946 and delete all but the first. */
947 remaining_length = length;
/* Each segment contributes at most EFFECTIVE_LONG_STRING_LENGTH
   bytes.  Running past the last dictionary variable is fatal. */
948 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
949 if (idx < dict_get_var_cnt (dict))
950 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
951 EFFECTIVE_LONG_STRING_LENGTH);
953 sys_error (r, _("Very long string %s overflows dictionary."),
955 dict_delete_consecutive_vars (dict,
956 var_get_dict_index (var) + 1,
957 idx - var_get_dict_index (var) - 1);
959 /* Assign all the length to the first variable. */
960 var_set_width (var, length);
962 close_variable_to_value_map (r, map);
963 dict_compact_values (dict);
966 /* Reads value labels from sysfile H and inserts them into the
967 associated dictionary. */
/* Works in two phases.  The type 3 record supplies value/label
   pairs, which are kept as raw 8-byte values because their type is
   not yet known.  The type 4 record that must follow supplies the
   variables the labels apply to; once the first variable's type is
   known the raw values are converted and each label is added to
   every listed variable.  Temporary storage lives in a subpool of
   R's pool that is destroyed at the end.
   (The reader parameter is R; the original comment above calls it
   H.)
   NOTE(review): this listing omits lines (return type, braces, the
   opening of the local `struct label', some declarations and
   argument lines). */
969 read_value_labels (struct sfm_reader *r,
970 struct dictionary *dict, struct variable **var_by_value_idx)
972 struct pool *subpool;
976 char raw_value[8]; /* Value as uninterpreted bytes. */
977 union value value; /* Value. */
978 char *label; /* Null-terminated label string. */
981 struct label *labels = NULL;
982 int label_cnt; /* Number of labels. */
984 struct variable **var = NULL; /* Associated variables. */
985 int var_cnt; /* Number of associated variables. */
989 subpool = pool_create_subpool (r->pool);
991 /* Read the type 3 record and record its contents. We can't do
992 much with the data yet because we don't know whether it is
993 of numeric or string type. */
995 /* Read number of labels. */
996 label_cnt = read_int32 (r);
/* The right-hand side has an unsigned type (it involves sizeof), so
   on typical platforms a negative label_cnt converts to a very large
   value and is caught by this test as well. */
998 if (label_cnt >= INT32_MAX / sizeof *labels)
1000 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1005 /* Read each value/label tuple into labels[]. */
1006 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1007 for (i = 0; i < label_cnt; i++)
1009 struct label *label = labels + i;
1010 unsigned char label_len;
1014 read_bytes (r, label->raw_value, sizeof label->raw_value);
1016 /* Read label length. */
1017 read_bytes (r, &label_len, sizeof label_len);
/* The length byte plus the label text occupy a multiple of 8 bytes,
   so padded_len - 1 bytes remain after the length byte. */
1018 padded_len = ROUND_UP (label_len + 1, 8);
1020 /* Read label, padding. */
1021 label->label = pool_alloc (subpool, padded_len + 1);
1022 read_bytes (r, label->label, padded_len - 1);
1023 label->label[label_len] = 0;
1026 /* Now, read the type 4 record that has the list of variables
1027 to which the value labels are to be applied. */
1029 /* Read record type of type 4 record. */
1030 if (read_int32 (r) != 4)
1031 sys_error (r, _("Variable index record (type 4) does not immediately "
1032 "follow value label record (type 3) as it should."));
1034 /* Read number of variables associated with value label from type 4
1036 var_cnt = read_int32 (r);
1037 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1038 sys_error (r, _("Number of variables associated with a value label (%d) "
1039 "is not between 1 and the number of variables (%u)."),
1040 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1042 /* Read the list of variables. */
1043 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1044 for (i = 0; i < var_cnt; i++)
1046 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1047 if (var_is_long_string (var[i]))
1048 sys_error (r, _("Value labels are not allowed on long string "
1049 "variables (%s)."), var_get_name (var[i]));
1052 /* Type check the variables. */
/* var[0] decides how the raw values are interpreted below, so every
   other variable must have the same type. */
1053 for (i = 1; i < var_cnt; i++)
1054 if (var_get_type (var[i]) != var_get_type (var[0]))
1055 sys_error (r, _("Variables associated with value label are not all of "
1056 "identical type. Variable %s is %s, but variable "
1058 var_get_name (var[0]),
1059 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1060 var_get_name (var[i]),
1061 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1063 /* Fill in labels[].value, now that we know the desired type. */
1064 for (i = 0; i < label_cnt; i++)
1066 struct label *label = labels + i;
1068 if (var_is_alpha (var[0]))
1069 buf_copy_rpad (label->value.s, sizeof label->value.s,
1070 label->raw_value, sizeof label->raw_value);
1072 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1075 /* Assign the `value_label's to each variable. */
1076 for (i = 0; i < var_cnt; i++)
1078 struct variable *v = var[i];
1081 /* Add each label to the variable. */
1082 for (j = 0; j < label_cnt; j++)
1084 struct label *label = &labels[j];
/* A false return is reported below as a duplicate label. */
1085 if (!var_add_value_label (v, &label->value, label->label))
1087 if (var_is_numeric (var[0]))
1088 sys_warn (r, _("Duplicate value label for %g on %s."),
1089 label->value.f, var_get_name (v));
1091 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1092 var_get_width (v), label->value.s,
1098 pool_destroy (subpool);
1103 static void partial_record (struct sfm_reader *r)
1105 static bool read_case_number (struct sfm_reader *, double *);
1106 static bool read_case_string (struct sfm_reader *, char *, size_t);
1107 static int read_opcode (struct sfm_reader *);
1108 static bool read_compressed_number (struct sfm_reader *, double *);
1109 static bool read_compressed_string (struct sfm_reader *, char *);
1110 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1112 /* Reads one case from READER's file into C. Returns nonzero
1113 only if successful. */
/* (The reader parameter is R; the original comment above calls it
   READER.)  Two strategies are used.  The fast path reads the whole
   case in a single call and then converts numeric values in place
   if the file's float format is not the native one.  The slow path
   reads each variable in turn through read_case_number and
   read_case_string.  Errors raised with longjmp land at the setjmp
   at the top.
   NOTE(review): this listing omits lines (return type, braces, every
   return statement, the numeric/string test in the slow path and
   the declaration of `buffer'). */
1115 sfm_read_case (struct sfm_reader *r, struct ccase *c)
1120 if (setjmp (r->bail_out))
/* The fast path is limited to uncompressed files without very long
   strings, on hosts where double is 8 bytes. */
1123 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1125 /* Fast path. Read the whole case directly. */
1126 if (!try_read_bytes (r, case_data_all_rw (c),
1127 sizeof (union value) * r->value_cnt))
1130 /* Convert floating point numbers to native format if needed. */
1131 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1135 for (i = 0; i < r->var_cnt; i++)
/* Width 0 marks a numeric variable. */
1136 if (r->vars[i].width == 0)
1138 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1139 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1146 /* Slow path. Convert from external to internal format. */
1149 for (i = 0; i < r->var_cnt; i++)
1151 struct sfm_var *sv = &r->vars[i];
1152 union value *v = case_data_rw_idx (c, sv->case_index);
1156 if (!read_case_number (r, &v->f))
1161 /* Read the string data in segments up to 255 bytes
1162 at a time, packed into 8-byte units. */
1163 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1164 int ofs, chunk_size;
1165 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1167 chunk_size = MIN (max_chunk, sv->width - ofs);
1168 if (!read_case_string (r, v->s + ofs, chunk_size))
1176 /* Very long strings have trailing wasted space
1177 that we must skip. */
1178 if (sv->width >= MIN_VERY_LONG_STRING)
/* Bytes consumed by the loop above: 256 per full max_chunk segment,
   plus the final partial segment rounded up to a multiple of 8. */
1180 int bytes_read = (sv->width / max_chunk * 256
1181 + ROUND_UP (sv->width % max_chunk, 8));
1182 int total_bytes = sfm_width_to_bytes (sv->width);
1183 int excess_bytes = total_bytes - bytes_read;
1185 while (excess_bytes > 0)
1188 size_t chunk = MIN (sizeof buffer, excess_bytes);
1189 if (!read_whole_strings (r, buffer, chunk))
1191 excess_bytes -= chunk;
1205 /* Issues an error that R ends in a partial record. */
/* Thin wrapper that raises the fixed "File ends in partial case."
   message through sys_error. */
1207 partial_record (struct sfm_reader *r)
1209 sys_error (r, _("File ends in partial case."));
1212 /* Reads a number from R and stores its value in *D.
1213 If R is compressed, reads a compressed number;
1214 otherwise, reads a number in the regular way.
1215 Returns true if successful, false if end of file is
1216 reached immediately. */
/* The uncompressed branch reads 8 raw bytes and converts them with
   flt64_to_double.  The compressed branch delegates to
   read_compressed_number.
   NOTE(review): the test that selects the branch, the declaration
   of flt64 and the return statements of the first branch are not
   shown in this listing. */
1218 read_case_number (struct sfm_reader *r, double *d)
1223 if (!try_read_bytes (r, flt64, sizeof flt64))
1225 *d = flt64_to_double (r, flt64);
1229 return read_compressed_number (r, d);
1232 /* Reads LENGTH string bytes from R into S.
1233 Always reads a multiple of 8 bytes; if LENGTH is not a
1234 multiple of 8, then extra bytes are read and discarded without
being stored in S.
1236 Reads compressed strings if R is compressed.
1237 Returns true if successful, false if end of file is
1238 reached immediately. */
/* R: reader.  S: destination for exactly LENGTH bytes.
   The request is split into a part made of complete 8-byte units and a
   trailing part shorter than 8 bytes. */
1240 read_case_string (struct sfm_reader *r, char *s, size_t length)
/* WHOLE is LENGTH rounded down to a multiple of 8; PARTIAL is the
   remainder (0 to 7 bytes). */
1242 size_t whole = ROUND_DOWN (length, 8);
1243 size_t partial = length % 8;
/* The complete units can be read straight into S.
   NOTE(review): any guards around the two reads (for example skipping
   one when its length is zero) are not visible in this listing. */
1247 if (!read_whole_strings (r, s, whole))
/* The trailing unit is read in full into the scratch buffer BOUNCE,
   whose declaration is not visible here... */
1254 if (!read_whole_strings (r, bounce, sizeof bounce))
/* ...and only its first PARTIAL bytes are copied to S, so S is never
   written past LENGTH bytes. */
1260 memcpy (s + whole, bounce, partial);
1266 /* Reads and returns the next compression opcode from R. */
/* R must be a compressed file (asserted below).  Opcodes are taken
   from the r->opcodes buffer, which is refilled from the file once
   r->opcode_idx has reached its end. */
1268 read_opcode (struct sfm_reader *r)
1270 assert (r->compressed);
/* Buffer exhausted: load the next sizeof r->opcodes opcode bytes.
   NOTE(review): the handling of a failed refill and the reset of
   r->opcode_idx are not visible in this listing. */
1274 if (r->opcode_idx >= sizeof r->opcodes)
1276 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
/* Take the next opcode and advance the cursor. */
1280 opcode = r->opcodes[r->opcode_idx++];
1287 /* Reads a compressed number from R and stores its value in D.
1288 Returns true if successful, false if end of file is
1289 reached immediately. */
/* R: compressed reader.  D: receives the decoded number.
   NOTE(review): the statements below belong to different opcode cases,
   but the switch and its case labels are not visible in this listing;
   confirm which opcode selects which statement. */
1291 read_compressed_number (struct sfm_reader *r, double *d)
/* Every compressed value starts with one opcode. */
1293 int opcode = read_opcode (r);
/* Value stored literally: the next 8 bytes of the file hold the
   number.  read_flt64() aborts rather than returning on end-of-file. */
1301 *d = read_flt64 (r);
/* Opcode that is not acceptable where a number is expected. */
1305 sys_error (r, _("Compressed data is corrupt."));
/* Value carried by the opcode itself, offset by the file's bias. */
1312 *d = opcode - r->bias;
1319 /* Reads a compressed 8-byte string segment from R and stores it
in DST.
1321 Returns true if successful, false if end of file is
1322 reached immediately. */
/* R: compressed reader.  DST: receives exactly 8 bytes.
   NOTE(review): the case labels of the switch are not visible in this
   listing; confirm which opcode selects which statement. */
1324 read_compressed_string (struct sfm_reader *r, char *dst)
/* One opcode describes how the next 8-byte segment is stored. */
1326 switch (read_opcode (r))
/* Segment stored literally: copy the next 8 bytes of the file.
   read_bytes() aborts on end-of-file. */
1333 read_bytes (r, dst, 8);
/* Segment consisting entirely of spaces: nothing is read from the
   file for it. */
1337 memset (dst, ' ', 8);
/* Opcode that is not acceptable where string data is expected. */
1341 sys_error (r, _("Compressed data is corrupt."));
1347 /* Reads LENGTH string bytes from R into S.
1348 LENGTH must be a multiple of 8.
1349 Reads compressed strings if R is compressed.
1350 Returns true if successful, false if end of file is
1351 reached immediately. */
/* R: reader.  S: destination for LENGTH bytes.
   NOTE(review): the test that selects between the direct read and the
   segment loop is not visible here; presumably r->compressed --
   confirm. */
1353 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
/* Callers must pass whole 8-byte units only. */
1355 assert (length % 8 == 0);
/* Uncompressed data: one plain read of LENGTH bytes. */
1357 return try_read_bytes (r, s, length);
/* Compressed data: decode one 8-byte segment at a time into
   consecutive positions of S. */
1361 for (ofs = 0; ofs < length; ofs += 8)
1362 if (!read_compressed_string (r, s + ofs))
1372 /* Creates and returns a table that can be used for translating a value
1373 index into a case to a "struct variable *" for DICT. Multiple
1374 system file fields reference variables this way.
1376 This table must be created before processing the very long
1377 string extension record, because that record causes some
1378 values to be deleted from the case and the dictionary to be
changed to match. */
/* R supplies the pool and the expected number of value slots
   (r->value_cnt); DICT supplies the variables.  The returned table is
   allocated from r->pool. */
1380 static struct variable **
1381 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1383 struct variable **var_by_value_idx;
/* One table entry per 8-byte value slot in a case. */
1387 var_by_value_idx = pool_nmalloc (r->pool,
1388 r->value_cnt, sizeof *var_by_value_idx);
1389 for (i = 0; i < dict_get_var_cnt (dict); i++)
1391 struct variable *v = dict_get_var (dict, i);
/* NV is the number of slots V occupies: one for a numeric variable,
   otherwise its width divided by 8, rounded up. */
1392 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* The first slot of a variable maps to the variable itself; every
   further slot of a long string maps to NULL. */
1395 var_by_value_idx[value_idx++] = v;
1396 for (j = 1; j < nv; j++)
1397 var_by_value_idx[value_idx++] = NULL;
/* The slots handed out must add up to exactly r->value_cnt.
   NOTE(review): this is checked only after the table has been filled,
   so a dictionary needing more slots than r->value_cnt would already
   have written past the allocation -- confirm that callers rule this
   out. */
1399 assert (value_idx == r->value_cnt);
1401 return var_by_value_idx;
1404 /* Returns the "struct variable" corresponding to the given
1405 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
is within the valid range. */
/* R: reader, used for r->value_cnt and for error reporting.
   VAR_BY_VALUE_IDX: table indexed by 0-based value slot.
   VALUE_IDX: 1-based slot number as found in the file. */
1407 static struct variable *
1408 lookup_var_by_value_idx (struct sfm_reader *r,
1409 struct variable **var_by_value_idx, int value_idx)
1411 struct variable *var;
/* Reject indexes outside 1...r->value_cnt; sys_error() does not
   return. */
1413 if (value_idx < 1 || value_idx > r->value_cnt)
1414 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1415 value_idx, r->value_cnt);
/* Convert the 1-based file index into a 0-based table index. */
1417 var = var_by_value_idx[value_idx - 1];
/* NOTE(review): the condition guarding this error and the rest of the
   call are not visible in this listing; presumably it fires when VAR
   is NULL, i.e. the slot is a later part of a long string -- confirm. */
1419 sys_error (r, _("Variable index %d refers to long string "
1426 /* Returns the variable in D with the given SHORT_NAME,
1427 or a null pointer if there is none. */
/* D: dictionary to search.  SHORT_NAME: name to look for; short names
   are compared without regard to case (strcasecmp). */
1428 static struct variable *
1429 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1431 struct variable *var;
1435 /* First try looking up by full name. This often succeeds. */
1436 var = dict_lookup_var (d, short_name);
/* A hit by full name only counts if that variable's short name
   matches as well. */
1437 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1440 /* Iterate through the whole dictionary as a fallback. */
1441 var_cnt = dict_get_var_cnt (d);
1442 for (i = 0; i < var_cnt; i++)
1444 var = dict_get_var (d, i);
/* Linear scan: test each variable's short name in dictionary order. */
1445 if (!strcasecmp (var_get_short_name (var), short_name))
1452 /* Helpers for reading records that contain "variable=value" pairs. */
/* Parsing state for one record: the record's text together with a
   cursor marking how far it has been consumed. */
1456 struct variable_to_value_map
1458 struct substring buffer; /* Record contents. */
1459 size_t pos; /* Current position in buffer. */
1462 /* Reads SIZE bytes into a "variable=value" map for R,
1463 and returns the map. */
/* R: reader positioned at the start of the record data.
   SIZE: number of bytes of record data to read.
   Both the map and its buffer are allocated from r->pool. */
1464 static struct variable_to_value_map *
1465 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1467 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated.  Presumably this leaves room
   for the null terminator that read_variable_to_value_map() writes
   just past the end of a token, which for the last token can fall one
   byte beyond the record data -- confirm. */
1468 char *buffer = pool_malloc (r->pool, size + 1);
/* read_bytes() aborts if the file ends before SIZE bytes are read. */
1469 read_bytes (r, buffer, size);
/* The substring covers only the SIZE bytes actually read. */
1470 map->buffer = ss_buffer (buffer, size);
1475 /* Closes MAP and frees its storage.
1476 Not really needed, because the pool will free the map anyway,
1477 but can be used to free it earlier. */
/* R: reader whose pool owns MAP.  MAP: map to close. */
1479 close_variable_to_value_map (struct sfm_reader *r,
1480 struct variable_to_value_map *map)
/* Hand the record buffer back to the pool.
   NOTE(review): no statement releasing MAP itself is visible here, so
   the struct presumably stays allocated until the pool is destroyed. */
1482 pool_free (r->pool, ss_data (map->buffer));
1485 /* Reads the next variable=value pair from MAP.
1486 Looks up the variable in DICT and stores it into *VAR.
1487 Stores a null-terminated value into *VALUE. */
/* R: reader, used for warnings.  DICT: dictionary to look names up in.
   MAP: parsing state; its buffer is modified in place.
   VAR, VALUE: outputs for the next pair.
   NOTE(review): the end of the parameter list (including the
   declaration of the WARNING_CNT counter used below), the return
   statements and the loop structure are not visible in this listing. */
1489 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1490 struct variable_to_value_map *map,
1491 struct variable **var, char **value,
/* Upper limit on the number of "unknown variable" warnings. */
1494 int max_warnings = 5;
1498 struct substring short_name_ss, value_ss;
/* Split off the name (up to '=') and then the value (up to a tab or a
   null byte; the delimiter set is given with explicit length 2 so that
   the null byte is part of it). */
1500 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1501 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
/* Report how many warnings went unprinted because of the limit;
   presumably reached once no further pair can be tokenized --
   confirm. */
1504 if (*warning_cnt > max_warnings)
1505 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1506 *warning_cnt - max_warnings);
/* Step over any run of further tab or null delimiters so the next
   call starts at the next name. */
1510 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1511 ss_buffer ("\t\0", 2));
/* Null-terminate the name inside the record buffer so it can be used
   as a C string. */
1513 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1514 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* NOTE(review): the literal 5 duplicates max_warnings above; the two
   must be kept in sync. */
1517 if (++*warning_cnt <= 5)
1518 sys_warn (r, _("Variable map refers to unknown variable %s."),
1519 ss_data (short_name_ss));
/* Null-terminate the value in place and hand out a pointer into the
   record buffer; the caller does not get a separate copy. */
1523 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1524 *value = ss_data (value_ss);
1532 /* Displays a corruption message. */
/* R: reader the message is about.  CLASS: message class, mapped below
   to a category and a severity.  FORMAT, ARGS: printf-style text. */
1534 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1539 ds_init_empty (&text);
/* Prefix: quoted file name plus the stream's current offset in hex.
   NOTE(review): ftell() returns -1 on failure, which this cast would
   print as a very large offset. */
1540 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1541 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
/* Then the caller's formatted text. */
1542 ds_put_vformat (&text, format, args);
1544 m.category = msg_class_to_category (class);
1545 m.severity = msg_class_to_severity (class);
/* No source file name or line number is attached; the position is
   part of the text instead. */
1546 m.where.file_name = NULL;
1547 m.where.line_number = 0;
/* NOTE(review): how M is emitted is not visible in this listing. */
1548 m.text = ds_cstr (&text);
1553 /* Displays a warning for the current file position. */
/* R: reader the warning is about.  FORMAT and the variable arguments
   are printf-style. */
1555 sys_warn (struct sfm_reader *r, const char *format, ...)
1559 va_start (args, format);
/* MW is the message class passed on to sys_msg(). */
1560 sys_msg (r, MW, format, args);
1564 /* Displays an error for the current file position,
1565 marks it as in an error state,
1566 and aborts reading it using longjmp. */
/* R: reader the error is about.  FORMAT and the variable arguments are
   printf-style.  Does not return to the caller. */
1568 sys_error (struct sfm_reader *r, const char *format, ...)
1572 va_start (args, format);
/* ME is the message class passed on to sys_msg(). */
1573 sys_msg (r, ME, format, args);
/* NOTE(review): the statement that marks R as being in an error state
   is not visible in this listing. */
/* Unwind to the setjmp() point stored in r->bail_out, delivering the
   value 1 there. */
1577 longjmp (r->bail_out, 1);
1580 /* Reads BYTE_CNT bytes into BUF.
1581 Returns true if exactly BYTE_CNT bytes are successfully read.
1582 Aborts if an I/O error or a partial read occurs.
1583 If EOF_IS_OK, then an immediate end-of-file causes false to be
1584 returned; otherwise, immediate end-of-file causes an abort
as well. */
/* R: reader.  EOF_IS_OK: whether an immediate end-of-file is tolerated.
   BUF: destination.  BYTE_CNT: number of bytes wanted. */
1587 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1588 void *buf, size_t byte_cnt)
1590 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
/* Everything requested arrived. */
1591 if (bytes_read == byte_cnt)
/* Short read caused by a stream error: report errno's text. */
1593 else if (ferror (r->file))
1594 sys_error (r, _("System error: %s."), strerror (errno));
/* Short read without a stream error, i.e. end of file.  This is an
   error unless the caller tolerates end-of-file and not a single byte
   was read. */
1595 else if (!eof_is_ok || bytes_read != 0)
1596 sys_error (r, _("Unexpected end of file."));
1601 /* Reads BYTE_CNT into BUF.
1602 Aborts upon I/O error or if end-of-file is encountered. */
/* R: reader.  BUF: destination.  BYTE_CNT: number of bytes wanted. */
1604 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* EOF_IS_OK is false, so even an immediate end-of-file aborts; the
   result is not used. */
1606 read_bytes_internal (r, false, buf, byte_cnt);
1609 /* Reads BYTE_CNT bytes into BUF.
1610 Returns true if exactly BYTE_CNT bytes are successfully read.
1611 Returns false if an immediate end-of-file is encountered.
1612 Aborts if an I/O error or a partial read occurs. */
/* R: reader.  BUF: destination.  BYTE_CNT: number of bytes wanted. */
1614 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* EOF_IS_OK is true, so an immediate end-of-file is reported to the
   caller as false instead of aborting. */
1616 return read_bytes_internal (r, true, buf, byte_cnt);
1619 /* Reads a 32-bit signed integer from R and returns its value in host format. */
/* R: reader to read from.  Aborts through read_bytes() if the file
   ends first. */
1622 read_int32 (struct sfm_reader *r)
/* Read the raw on-disk bytes; the declaration of INT32 is not visible
   in this listing. */
1625 read_bytes (r, int32, sizeof int32);
/* Convert from R's integer format. */
1626 return int32_to_native (r, int32);
1629 /* Reads a 64-bit floating-point number from R and returns its
1630 value in host format. */
/* R: reader to read from.  Aborts through read_bytes() if the file
   ends first. */
1632 read_flt64 (struct sfm_reader *r)
/* Read the raw on-disk bytes; the declaration of FLT64 is not visible
   in this listing. */
1635 read_bytes (r, flt64, sizeof flt64);
/* Convert from R's floating-point format. */
1636 return flt64_to_double (r, flt64);
1639 /* Reads exactly SIZE - 1 bytes into BUFFER
1640 and stores a null byte into BUFFER[SIZE - 1]. */
/* R: reader.  BUFFER: destination with room for SIZE bytes.
   SIZE: buffer size including the terminating null byte.
   NOTE(review): SIZE must be at least 1, because SIZE - 1 wraps around
   for 0; any guard for this is not visible in this listing. */
1642 read_string (struct sfm_reader *r, char *buffer, size_t size)
1645 read_bytes (r, buffer, size - 1);
/* Terminate so that BUFFER can be used as a C string. */
1646 buffer[size - 1] = '\0';
1649 /* Skips BYTES bytes forward in R. */
/* R: reader.  BYTES: number of bytes to discard.
   Skipping is done by reading into a scratch buffer, not by seeking.
   NOTE(review): the declaration of BUFFER and the enclosing loop are
   not visible in this listing. */
1651 skip_bytes (struct sfm_reader *r, size_t bytes)
/* Never read more than the scratch buffer holds at once. */
1656 size_t chunk = MIN (sizeof buffer, bytes);
/* read_bytes() aborts if the file ends before CHUNK bytes are read. */
1657 read_bytes (r, buffer, chunk);
1662 /* Returns the value of the 32-bit signed integer at INT32,
1663 converted from the format used by R to the host format. */
/* R: reader, consulted only for r->integer_format.
   INT32: the 4 raw bytes as stored in the file. */
1665 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* Same format as the host: a plain byte copy is enough. */
1668 if (r->integer_format == INTEGER_NATIVE)
1669 memcpy (&x, int32, sizeof x);
/* Otherwise decode the bytes according to the file's integer format. */
1671 x = integer_get (r->integer_format, int32, sizeof x);
1675 /* Returns the value of the 64-bit floating point number at
1676 FLT64, converted from the format used by R to the host format. */
/* R: reader, consulted only for r->float_format.
   FLT64: the 8 raw bytes as stored in the file. */
1679 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
/* Same format as the host double: a plain byte copy is enough. */
1682 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1683 memcpy (&x, flt64, sizeof x);
/* Otherwise convert from the file's floating-point format. */
1685 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);