1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include "sys-file-reader.h"
22 #include "sys-file-private.h"
30 #include <libpspp/alloc.h>
31 #include <libpspp/assertion.h>
32 #include <libpspp/message.h>
33 #include <libpspp/compiler.h>
34 #include <libpspp/magic.h>
35 #include <libpspp/misc.h>
36 #include <libpspp/pool.h>
37 #include <libpspp/str.h>
38 #include <libpspp/hash.h>
39 #include <libpspp/array.h>
42 #include "dictionary.h"
43 #include "file-handle-def.h"
44 #include "file-name.h"
46 #include "missing-values.h"
47 #include "value-labels.h"
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
/* File being read. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
/* On-disk data formats and the layout of each case. */
74 enum integer_format integer_format; /* On-disk integer format. */
75 enum float_format float_format; /* On-disk floating point format. */
76 int value_cnt; /* Number of 8-byte units per case. */
77 struct sfm_var *vars; /* Variables. */
78 size_t var_cnt; /* Number of variables. */
79 bool has_long_var_names; /* File has a long variable name map */
80 bool has_vls; /* File has one or more very long strings? */
/* Decompression state; opcodes are consumed in blocks of 8. */
83 bool compressed; /* File is compressed? */
84 double bias; /* Compression bias, usually 100.0. */
85 uint8_t opcodes[8]; /* Current block of opcodes. */
86 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
89 /* A variable in a system file. */
/* Holds only what sfm_read_case needs: the reader keeps its own copy
   because the caller owns the dictionary and may change or destroy
   its variables. */
92 int width; /* 0=numeric, otherwise string width. */
93 int case_index; /* Index into case. */
96 static struct variable **make_var_by_value_idx (struct sfm_reader *,
98 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
102 static void sys_warn (struct sfm_reader *, const char *, ...)
103 PRINTF_FORMAT (2, 3);
105 static void sys_error (struct sfm_reader *, const char *, ...)
109 static void read_bytes (struct sfm_reader *, void *, size_t);
110 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
111 static int32_t read_int32 (struct sfm_reader *);
112 static double read_flt64 (struct sfm_reader *);
113 static void read_string (struct sfm_reader *, char *, size_t);
114 static void skip_bytes (struct sfm_reader *, size_t);
116 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
117 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
119 static struct variable_to_value_map *open_variable_to_value_map (
120 struct sfm_reader *, size_t size);
121 static void close_variable_to_value_map (struct sfm_reader *r,
122 struct variable_to_value_map *);
123 static bool read_variable_to_value_map (struct sfm_reader *,
125 struct variable_to_value_map *,
126 struct variable **var, char **value,
129 /* Dictionary reader. */
137 static void read_header (struct sfm_reader *, struct dictionary *,
138 int *weight_idx, int *claimed_value_cnt,
139 struct sfm_read_info *);
140 static void read_variable_record (struct sfm_reader *, struct dictionary *,
141 int *format_warning_cnt);
142 static void parse_format_spec (struct sfm_reader *, uint32_t,
143 enum which_format, struct variable *,
144 int *format_warning_cnt);
145 static void setup_weight (struct sfm_reader *, int weight_idx,
146 struct variable **var_by_value_idx,
147 struct dictionary *);
148 static void read_documents (struct sfm_reader *, struct dictionary *);
149 static void read_value_labels (struct sfm_reader *, struct dictionary *,
150 struct variable **var_by_value_idx);
152 static void read_extension_record (struct sfm_reader *, struct dictionary *);
153 static void read_machine_int32_info (struct sfm_reader *,
154 size_t size, size_t count);
155 static void read_machine_flt64_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *info)
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 int format_warning_cnt = 0;
180 int claimed_value_cnt;
/* Claim the file handle for reading as a system file before any
   other state is created. */
184 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
187 *dict = dict_create ();
189 /* Create and initialize reader. */
190 r = pool_create_container (struct sfm_reader, pool);
192 r->file = fn_open (fh_get_file_name (fh), "rb");
196 r->has_long_var_names = false;
/* Start with no opcodes buffered (index 8 means "none left"), so the
   first compressed read fetches a fresh block. */
197 r->opcode_idx = sizeof r->opcodes;
/* Error recovery point.  bail_out is the longjmp() target for error
   handling, so a nonzero return from setjmp() means that reading
   failed: release the reader and the partially built dictionary. */
199 if (setjmp (r->bail_out))
201 sfm_close_reader (r);
202 dict_destroy (*dict);
/* Report the failure to open the file, then take the same recovery
   path as any other error. */
209 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
210 fh_get_file_name (r->fh), strerror (errno));
211 longjmp (r->bail_out, 1);
215 read_header (r, *dict, &weight_idx, &claimed_value_cnt, info);
217 /* Read all the variable definition records. */
218 rec_type = read_int32 (r);
219 while (rec_type == 2)
221 read_variable_record (r, *dict, &format_warning_cnt);
222 rec_type = read_int32 (r);
225 /* Figure out the case format. */
226 var_by_value_idx = make_var_by_value_idx (r, *dict);
227 setup_weight (r, weight_idx, var_by_value_idx, *dict);
229 /* Read all the rest of the dictionary records. */
/* Record type 999 terminates the dictionary. */
230 while (rec_type != 999)
235 read_value_labels (r, *dict, var_by_value_idx);
239 sys_error (r, _("Misplaced type 4 record."));
242 read_documents (r, *dict);
246 read_extension_record (r, *dict);
250 sys_error (r, _("Unrecognized record type %d."), rec_type);
252 rec_type = read_int32 (r);
/* No long variable name map was read: derive each variable's long
   name by lowercasing its short name. */
256 if ( ! r->has_long_var_names )
259 for (i = 0; i < dict_get_var_cnt (*dict); i++)
261 struct variable *var = dict_get_var (*dict, i);
262 char short_name [SHORT_NAME_LEN + 1];
263 char long_name [SHORT_NAME_LEN + 1];
265 strcpy (short_name, var_get_name (var));
267 strcpy (long_name, short_name);
268 str_lowercase (long_name);
270 /* Set long name. Renaming a variable may clear the short
271 name, but we want to retain it, so re-set it
273 dict_rename_var (*dict, var, long_name);
274 var_set_short_name (var, short_name);
277 r->has_long_var_names = true;
280 /* Read record 999 data, which is just filler. */
/* read_header stores -1 when the header's count is implausible, in
   which case there is nothing meaningful to compare. */
283 if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt)
284 sys_warn (r, _("File header claims %d variable positions but "
285 "%d were read from file."),
286 claimed_value_cnt, r->value_cnt);
288 /* Create an index of dictionary variable widths for
289 sfm_read_case to use. We cannot use the `struct variable's
290 from the dictionary we created, because the caller owns the
291 dictionary and may destroy or modify its variables. */
292 r->var_cnt = dict_get_var_cnt (*dict);
293 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
294 for (i = 0; i < r->var_cnt; i++)
296 struct variable *v = dict_get_var (*dict, i);
297 struct sfm_var *sv = &r->vars[i];
298 sv->width = var_get_width (v);
299 sv->case_index = var_get_case_index (v);
/* Release the value-index lookup table built above. */
302 pool_free (r->pool, var_by_value_idx);
306 /* Closes a system file after we're done with it. */
308 sfm_close_reader (struct sfm_reader *r)
/* A failure to close the stream is reported but does not stop the
   rest of the cleanup. */
315 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
316 msg (ME, _("Error closing system file \"%s\": %s."),
317 fh_get_file_name (r->fh), strerror (errno));
/* Release the claim taken on the file handle by sfm_open_reader. */
322 fh_close (r->fh, "system file", "rs");
/* The reader is allocated as the container of its own pool, so
   destroying the pool releases all system file state. */
324 pool_destroy (r->pool);
327 /* Returns true if an I/O error has occurred on READER, false
330 sfm_read_error (const struct sfm_reader *reader)
332 return reader->error;
/* Checks whether FILE looks like an SPSS system file by reading
   its first four bytes and comparing them with the "$FL2" record
   type.  Returns false if fewer than four bytes can be read.
   The stream position is advanced past whatever was read. */
bool
sfm_detect (FILE *file)
{
  static const char magic[4] = {'$', 'F', 'L', '2'};
  char head[sizeof magic];

  if (fread (head, sizeof head, 1, file) != 1)
    return false;

  return memcmp (head, magic, sizeof magic) == 0;
}
349 /* Reads the global header of the system file.
350 Sets DICT's file label to the system file's label.
351 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
352 or to the value index of the weight variable otherwise.
353 Sets *CLAIMED_VALUE_CNT to the number of values that the file
354 claims to have (although it is not always correct).
355 If INFO is non-null, initializes *INFO with header
358 read_header (struct sfm_reader *r, struct dictionary *dict,
359 int *weight_idx, int *claimed_value_cnt,
360 struct sfm_read_info *info)
363 char eye_catcher[61];
364 uint8_t raw_layout_code[4];
367 char creation_date[10];
368 char creation_time[9];
370 struct substring file_label_ss;
372 read_string (r, rec_type, sizeof rec_type);
373 read_string (r, eye_catcher, sizeof eye_catcher);
/* The file must begin with the "$FL2" record type. */
375 if (strcmp ("$FL2", rec_type) != 0)
376 sys_error (r, _("This is not an SPSS system file."));
378 /* Identify integer format. */
379 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
/* The layout code is tried as the integer 2 and then as 3; only
   plain big-endian and little-endian results are accepted. */
380 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
382 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
384 || (r->integer_format != INTEGER_MSB_FIRST
385 && r->integer_format != INTEGER_LSB_FIRST))
386 sys_error (r, _("This is not an SPSS system file."));
388 *claimed_value_cnt = read_int32 (r);
/* An implausible count is recorded as -1, meaning "unknown". */
389 if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16)
390 *claimed_value_cnt = -1;
392 r->compressed = read_int32 (r) != 0;
394 *weight_idx = read_int32 (r);
396 case_cnt = read_int32 (r);
397 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
400 /* Identify floating-point format and obtain compression bias. */
401 read_bytes (r, raw_bias, sizeof raw_bias);
/* The raw bias is matched against the expected value 100.0 to
   identify the floating-point format. */
402 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
404 sys_warn (r, _("Compression bias (%g) is not the usual "
405 "value of 100, or system file uses unrecognized "
406 "floating-point format."),
/* Fall back to IEEE double in the byte order that matches the
   integer format. */
408 if (r->integer_format == INTEGER_MSB_FIRST)
409 r->float_format = FLOAT_IEEE_DOUBLE_BE;
411 r->float_format = FLOAT_IEEE_DOUBLE_LE;
413 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
415 read_string (r, creation_date, sizeof creation_date);
416 read_string (r, creation_time, sizeof creation_time);
417 read_string (r, file_label, sizeof file_label);
/* Trim spaces from the file label and set it on the dictionary only
   if something remains. */
420 file_label_ss = ss_cstr (file_label);
421 ss_trim (&file_label_ss, ss_cstr (" "));
422 if (!ss_is_empty (file_label_ss))
424 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
425 dict_set_label (dict, ss_data (file_label_ss));
430 struct substring product;
432 strcpy (info->creation_date, creation_date);
433 strcpy (info->creation_time, creation_time);
434 info->integer_format = r->integer_format;
435 info->float_format = r->float_format;
436 info->compressed = r->compressed;
437 info->case_cnt = case_cnt;
/* The product name is the eye-catcher with the standard prefix, if
   present, and surrounding spaces removed; it is truncated to fit. */
439 product = ss_cstr (eye_catcher);
440 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
441 ss_trim (&product, ss_cstr (" "));
442 str_copy_buf_trunc (info->product, sizeof info->product,
443 ss_data (product), ss_length (product));
447 /* Reads a variable (type 2) record from R and adds the
448 corresponding variable to DICT.
449 Also skips past additional variable records for long string
452 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
453 int *format_warning_cnt)
456 int has_variable_label;
457 int missing_value_code;
462 struct variable *var;
/* Fixed part of the record: width, label flag, missing value code,
   print and write formats, then the name. */
465 width = read_int32 (r);
466 has_variable_label = read_int32 (r);
467 missing_value_code = read_int32 (r);
468 print_format = read_int32 (r);
469 write_format = read_int32 (r);
470 read_string (r, name, sizeof name);
/* Cut the name at its first space. */
471 name[strcspn (name, " ")] = '\0';
473 /* Check variable name. */
474 if (name[0] == '$' || name[0] == '#')
475 sys_error (r, "Variable name begins with invalid character `%c'.",
477 if (!var_is_plausible_name (name, false))
478 sys_error (r, _("Invalid variable name `%s'."), name);
480 /* Create variable. */
481 if (width < 0 || width > 255)
482 sys_error (r, _("Bad variable width %d."), width);
483 var = dict_create_var (dict, name, width);
486 _("Duplicate variable name `%s' within system file."),
489 /* Set the short name the same as the long name */
490 var_set_short_name (var, var_get_name (var));
492 /* Get variable label, if any. */
493 if (has_variable_label != 0 && has_variable_label != 1)
494 sys_error (r, _("Variable label indicator field is not 0 or 1."));
495 if (has_variable_label == 1)
500 len = read_int32 (r);
501 if (len >= sizeof label)
502 sys_error (r, _("Variable %s has label of invalid length %u."),
503 name, (unsigned int) len);
504 read_string (r, label, len + 1);
505 var_set_label (var, label);
/* The label is followed by padding up to a multiple of 4 bytes. */
507 skip_bytes (r, ROUND_UP (len, 4) - len);
510 /* Set missing values. */
511 if (missing_value_code < -3 || missing_value_code > 3
512 || missing_value_code == -1)
513 sys_error (r, _("Missing value indicator field is not "
514 "-3, -2, 0, 1, 2, or 3."));
515 if (missing_value_code != 0)
517 struct missing_values mv;
518 mv_init (&mv, var_get_width (var));
519 if (var_is_numeric (var))
/* A positive code is a count of discrete values.  Otherwise a
   low/high range is read, and code -3 adds one discrete value. */
521 if (missing_value_code > 0)
524 for (i = 0; i < missing_value_code; i++)
525 mv_add_num (&mv, read_flt64 (r));
529 double low = read_flt64 (r);
530 double high = read_flt64 (r);
531 mv_add_num_range (&mv, low, high);
532 if (missing_value_code == -3)
533 mv_add_num (&mv, read_flt64 (r));
536 else if (var_get_width (var) <= MAX_SHORT_STRING)
538 if (missing_value_code > 0)
541 for (i = 0; i < missing_value_code; i++)
544 read_string (r, string, sizeof string);
545 mv_add_str (&mv, string);
549 sys_error (r, _("String variable %s may not have missing "
550 "values specified as a range."),
553 else /* var->width > MAX_SHORT_STRING */
554 sys_error (r, _("Long string variable %s may not have missing "
557 var_set_missing_values (var, &mv);
561 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
562 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
564 /* Account for values.
565 Skip long string continuation records, if any. */
/* A numeric variable takes one value; a string takes one value per
   8 bytes of width, rounded up. */
566 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
/* Each value after the first has its own continuation record. */
572 for (i = 1; i < nv; i++)
574 /* Check for record type 2 and width -1. */
575 if (read_int32 (r) != 2 || read_int32 (r) != -1)
576 sys_error (r, _("Missing string continuation record."));
578 /* Skip and ignore remaining continuation data. */
579 has_variable_label = read_int32 (r);
580 missing_value_code = read_int32 (r);
581 print_format = read_int32 (r);
582 write_format = read_int32 (r);
583 read_string (r, name, sizeof name);
585 /* Variable label fields on continuation records have
586 been spotted in system files created by "SPSS Power
587 Macintosh Release 6.1". */
588 if (has_variable_label)
589 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
594 /* Translates the format spec from sysfile format to internal
597 parse_format_spec (struct sfm_reader *r, uint32_t s,
598 enum which_format which, struct variable *v,
599 int *format_warning_cnt)
601 const int max_format_warnings = 8;
603 uint8_t raw_type = s >> 16;
609 if (!fmt_from_io (raw_type, &f.type))
610 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
615 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
620 if (which == PRINT_FORMAT)
621 var_set_print_format (v, &f);
623 var_set_write_format (v, &f);
625 else if (*++format_warning_cnt <= max_format_warnings)
627 char fmt_string[FMT_STRING_LEN_MAX + 1];
628 sys_warn (r, _("%s variable %s has invalid %s format %s."),
629 var_is_numeric (v) ? _("Numeric") : _("String"),
631 which == PRINT_FORMAT ? _("print") : _("write"),
632 fmt_to_string (&f, fmt_string));
634 if (*format_warning_cnt == max_format_warnings)
635 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable at 1-based value index WEIGHT_IDX the
   weighting variable of DICT.  A WEIGHT_IDX of 0 means that the
   file is unweighted, in which case nothing is done.
   VAR_BY_VALUE_IDX is the value-index lookup table for R.
   Reports an error through sys_error() if the designated
   variable is not numeric. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *candidate;

  if (weight_idx == 0)
    return;

  candidate = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (!var_is_numeric (candidate))
    sys_error (r, _("Weighting variable must be numeric."));
  else
    dict_set_weight (dict, candidate);
}
657 /* Reads a document record, type 6, from system file R, and sets up
658 the documents and n_documents fields in the associated
661 read_documents (struct sfm_reader *r, struct dictionary *dict)
666 if (dict_get_documents (dict) != NULL)
667 sys_error (r, _("Multiple type 6 (document) records."));
669 line_cnt = read_int32 (r);
671 sys_error (r, _("Number of document lines (%d) "
672 "must be greater than 0."), line_cnt);
674 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
675 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
676 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
677 dict_set_documents (dict, documents);
679 sys_error (r, _("Document line contains null byte."));
680 pool_free (r->pool, documents);
683 /* Read a type 7 extension record. */
685 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
/* Record header: the subtype, then the size of each element and the
   number of elements.  The payload is SIZE * COUNT bytes.
   NOTE(review): a negative SIZE or COUNT read from the file converts
   to a very large size_t here -- confirm the overflow check below is
   the intended guard. */
687 int subtype = read_int32 (r);
688 size_t size = read_int32 (r);
689 size_t count = read_int32 (r);
690 size_t bytes = size * count;
692 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
693 allows an extra byte for a null terminator, used by some
694 extension processing routines. */
695 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
696 sys_error (r, "Record type 7 subtype %d too large.", subtype);
701 read_machine_int32_info (r, size, count);
705 read_machine_flt64_info (r, size, count);
709 /* Variable sets information. We don't use these yet.
710 They only apply to GUIs; see VARSETS on the APPLY
711 DICTIONARY command in SPSS documentation. */
715 /* DATE variable information. We don't use it yet, but we
720 /* Unknown purpose. */
724 read_display_parameters (r, size, count, dict);
728 read_long_var_name_map (r, size, count, dict);
732 read_long_string_map (r, size, count, dict);
736 /* New in SPSS v14? Unknown purpose. */
740 /* Text field that defines variable attributes. New in
745 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
749 skip_bytes (r, bytes);
752 /* Read record type 7, subtype 3. */
754 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
756 int version_major UNUSED = read_int32 (r);
757 int version_minor UNUSED = read_int32 (r);
758 int version_revision UNUSED = read_int32 (r);
759 int machine_code UNUSED = read_int32 (r);
760 int float_representation = read_int32 (r);
761 int compression_code UNUSED = read_int32 (r);
762 int integer_representation = read_int32 (r);
763 int character_code UNUSED = read_int32 (r);
765 int expected_float_format;
766 int expected_integer_format;
768 if (size != 4 || count != 8)
769 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
771 (unsigned int) size, (unsigned int) count);
773 /* Check floating point format. */
774 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
775 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
776 expected_float_format = 1;
777 else if (r->float_format == FLOAT_Z_LONG)
778 expected_float_format = 2;
779 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
780 expected_float_format = 3;
783 if (float_representation != expected_float_format)
784 sys_error (r, _("Floating-point representation indicated by "
785 "system file (%d) differs from expected (%d)."),
786 r->float_format, expected_float_format);
788 /* Check integer format. */
789 if (r->integer_format == INTEGER_MSB_FIRST)
790 expected_integer_format = 1;
791 else if (r->integer_format == INTEGER_LSB_FIRST)
792 expected_integer_format = 2;
795 if (integer_representation != expected_integer_format)
797 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
798 sys_warn (r, _("Integer format indicated by system file (%s) "
799 "differs from expected (%s)."),
800 gettext (endian[integer_representation == 1]),
801 gettext (endian[expected_integer_format == 1]));
805 /* Read record type 7, subtype 4. */
807 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
809 double sysmis = read_flt64 (r);
810 double highest = read_flt64 (r);
811 double lowest = read_flt64 (r);
813 if (size != 8 || count != 3)
814 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
815 (unsigned int) size, (unsigned int) count);
817 if (sysmis != SYSMIS)
818 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
819 if (highest != HIGHEST)
820 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
821 if (lowest != LOWEST)
822 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
825 /* Read record type 7, subtype 11, which specifies how variables
826 should be displayed in GUI environments. */
828 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
829 struct dictionary *dict)
/* Each variable has three int32 fields (measure, width, alignment),
   so COUNT must be exactly three times the number of variables. */
831 const size_t n_vars = count / 3 ;
835 if (count % 3 || n_vars != dict_get_var_cnt (dict))
836 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
837 (unsigned int) size, (unsigned int) count);
839 for (i = 0; i < n_vars; ++i)
841 int measure = read_int32 (r);
842 int width = read_int32 (r);
843 int align = read_int32 (r);
844 struct variable *v = dict_get_var (dict, i);
846 /* spss v14 sometimes seems to set string variables' measure to zero */
847 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
/* Valid codes are 1 to 3 for the measure and 0 to 2 for the
   alignment. */
850 if (measure < 1 || measure > 3 || align < 0 || align > 2)
853 sys_warn (r, _("Invalid variable display parameters. "
854 "Default parameters substituted."));
859 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
860 : measure == 2 ? MEASURE_ORDINAL
862 var_set_display_width (v, width);
863 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
864 : align == 1 ? ALIGN_RIGHT
869 /* Reads record type 7, subtype 13, which gives the long name
870 that corresponds to each short name. Modifies variable names
871 in DICT accordingly. */
873 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
874 struct dictionary *dict)
876 struct variable_to_value_map *map;
877 struct variable *var;
/* The record payload, SIZE * COUNT bytes, is parsed as a
   variable-to-value map whose values are the long names. */
881 map = open_variable_to_value_map (r, size * count);
882 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
/* Save the short name now, because it is restored after the rename
   below. */
885 char short_name[SHORT_NAME_LEN + 1];
886 strcpy (short_name, var_get_short_name (var));
888 /* Validate long name. */
889 if (!var_is_valid_name (long_name, false))
891 sys_warn (r, _("Long variable mapping from %s to invalid "
892 "variable name `%s'."),
893 var_get_name (var), long_name);
897 /* Identify any duplicates. */
/* The dictionary lookup is skipped when the long name equals the
   short name ignoring case. */
898 if (strcasecmp (short_name, long_name)
899 && dict_lookup_var (dict, long_name) != NULL)
901 sys_warn (r, _("Duplicate long variable name `%s' "
902 "within system file."), long_name);
906 /* Set long name. Renaming a variable may clear the short
907 name, but we want to retain it, so re-set it
909 dict_rename_var (dict, var, long_name);
910 var_set_short_name (var, short_name);
912 close_variable_to_value_map (r, map);
913 r->has_long_var_names = true;
916 /* Reads record type 7, subtype 14, which gives the real length
917 of each very long string. Rearranges DICT accordingly. */
919 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
920 struct dictionary *dict)
922 struct variable_to_value_map *map;
923 struct variable *var;
/* The record payload, SIZE * COUNT bytes, is parsed as a
   variable-to-value map whose values are string lengths in decimal. */
929 map = open_variable_to_value_map (r, size * count);
930 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
933 long length, remaining_length;
/* Reject lengths below MIN_VERY_LONG_STRING, and LONG_MAX, which is
   what strtol() returns on overflow. */
937 length = strtol (length_s, NULL, 10);
938 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
940 sys_warn (r, _("%s listed as string of length %s "
942 var_get_name (var), length_s);
946 /* Group multiple variables into single variable
947 and delete all but the first. */
/* Walk the variables starting at VAR, subtracting what each segment
   contributes (at most EFFECTIVE_LONG_STRING_LENGTH), until LENGTH
   is accounted for. */
948 remaining_length = length;
949 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
950 if (idx < dict_get_var_cnt (dict))
951 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
952 EFFECTIVE_LONG_STRING_LENGTH);
954 sys_error (r, _("Very long string %s overflows dictionary."),
/* IDX is now one past the last segment; delete every segment after
   the first. */
956 dict_delete_consecutive_vars (dict,
957 var_get_dict_index (var) + 1,
958 idx - var_get_dict_index (var) - 1);
960 /* Assign all the length to the first variable. */
961 var_set_width (var, length);
963 close_variable_to_value_map (r, map);
964 dict_compact_values (dict);
967 /* Reads value labels from sysfile R and inserts them into the
968 associated dictionary. */
970 read_value_labels (struct sfm_reader *r,
971 struct dictionary *dict, struct variable **var_by_value_idx)
/* Temporary allocations go into a subpool that is destroyed at the
   end of this function. */
973 struct pool *subpool;
977 char raw_value[8]; /* Value as uninterpreted bytes. */
978 union value value; /* Value. */
979 char *label; /* Null-terminated label string. */
982 struct label *labels = NULL;
983 int label_cnt; /* Number of labels. */
985 struct variable **var = NULL; /* Associated variables. */
986 int var_cnt; /* Number of associated variables. */
990 subpool = pool_create_subpool (r->pool);
992 /* Read the type 3 record and record its contents. We can't do
993 much with the data yet because we don't know whether it is
994 of numeric or string type. */
996 /* Read number of labels. */
997 label_cnt = read_int32 (r);
/* The right-hand side has type size_t, so a negative LABEL_CNT
   converts to a huge unsigned value and is rejected here as well. */
999 if (label_cnt >= INT32_MAX / sizeof *labels)
1001 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1006 /* Read each value/label tuple into labels[]. */
1007 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1008 for (i = 0; i < label_cnt; i++)
1010 struct label *label = labels + i;
1011 unsigned char label_len;
1015 read_bytes (r, label->raw_value, sizeof label->raw_value);
1017 /* Read label length. */
1018 read_bytes (r, &label_len, sizeof label_len);
/* The length byte and the label together are padded to a multiple
   of 8 bytes; the length byte has already been read, hence the
   PADDED_LEN - 1 below. */
1019 padded_len = ROUND_UP (label_len + 1, 8);
1021 /* Read label, padding. */
1022 label->label = pool_alloc (subpool, padded_len + 1);
1023 read_bytes (r, label->label, padded_len - 1);
1024 label->label[label_len] = 0;
1027 /* Now, read the type 4 record that has the list of variables
1028 to which the value labels are to be applied. */
1030 /* Read record type of type 4 record. */
1031 if (read_int32 (r) != 4)
1032 sys_error (r, _("Variable index record (type 4) does not immediately "
1033 "follow value label record (type 3) as it should."));
1035 /* Read number of variables associated with value label from type 4
1037 var_cnt = read_int32 (r);
1038 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1039 sys_error (r, _("Number of variables associated with a value label (%d) "
1040 "is not between 1 and the number of variables (%u)."),
1041 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1043 /* Read the list of variables. */
1044 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1045 for (i = 0; i < var_cnt; i++)
1047 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1048 if (var_is_long_string (var[i]))
1049 sys_error (r, _("Value labels are not allowed on long string "
1050 "variables (%s)."), var_get_name (var[i]));
1053 /* Type check the variables. */
1054 for (i = 1; i < var_cnt; i++)
1055 if (var_get_type (var[i]) != var_get_type (var[0]))
1056 sys_error (r, _("Variables associated with value label are not all of "
1057 "identical type. Variable %s is %s, but variable "
1059 var_get_name (var[0]),
1060 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1061 var_get_name (var[i]),
1062 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1064 /* Fill in labels[].value, now that we know the desired type. */
1065 for (i = 0; i < label_cnt; i++)
1067 struct label *label = labels + i;
/* All the variables have the same type (checked above), so var[0]
   decides how the raw bytes are interpreted. */
1069 if (var_is_alpha (var[0]))
1070 buf_copy_rpad (label->value.s, sizeof label->value.s,
1071 label->raw_value, sizeof label->raw_value);
1073 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1076 /* Assign the `value_label's to each variable. */
1077 for (i = 0; i < var_cnt; i++)
1079 struct variable *v = var[i];
1082 /* Add each label to the variable. */
1083 for (j = 0; j < label_cnt; j++)
1085 struct label *label = &labels[j];
/* A false return is reported as a duplicate value label; it is a
   warning only. */
1086 if (!var_add_value_label (v, &label->value, label->label))
1088 if (var_is_numeric (var[0]))
1089 sys_warn (r, _("Duplicate value label for %g on %s."),
1090 label->value.f, var_get_name (v));
1092 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1093 var_get_width (v), label->value.s,
1099 pool_destroy (subpool);
1104 static void partial_record (struct sfm_reader *r)
1106 static bool read_case_number (struct sfm_reader *, double *);
1107 static bool read_case_string (struct sfm_reader *, char *, size_t);
1108 static int read_opcode (struct sfm_reader *);
1109 static bool read_compressed_number (struct sfm_reader *, double *);
1110 static bool read_compressed_string (struct sfm_reader *, char *);
1111 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1113 /* Reads one case from READER's file into C. Returns nonzero
1114 only if successful. */
1116 sfm_read_case (struct sfm_reader *r, struct ccase *c)
/* bail_out is the longjmp() target for error handling, so a nonzero
   return from setjmp() means that reading this case failed. */
1121 if (setjmp (r->bail_out))
/* The fast path needs uncompressed data, an 8-byte double, and no
   very long strings. */
1124 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1126 /* Fast path. Read the whole case directly. */
1127 if (!try_read_bytes (r, case_data_all_rw (c),
1128 sizeof (union value) * r->value_cnt))
1131 /* Convert floating point numbers to native format if needed. */
1132 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1136 for (i = 0; i < r->var_cnt; i++)
/* Width 0 marks a numeric variable; strings need no conversion. */
1137 if (r->vars[i].width == 0)
1139 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1140 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1147 /* Slow path. Convert from external to internal format. */
1150 for (i = 0; i < r->var_cnt; i++)
1152 struct sfm_var *sv = &r->vars[i];
1153 union value *v = case_data_rw_idx (c, sv->case_index);
1157 if (!read_case_number (r, &v->f))
1162 /* Read the string data in segments up to 255 bytes
1163 at a time, packed into 8-byte units. */
1164 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1165 int ofs, chunk_size;
1166 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1168 chunk_size = MIN (max_chunk, sv->width - ofs);
1169 if (!read_case_string (r, v->s + ofs, chunk_size))
1177 /* Very long strings have trailing wasted space
1178 that we must skip. */
1179 if (sv->width >= MIN_VERY_LONG_STRING)
/* Each full segment of MAX_CHUNK bytes occupies 256 bytes in the
   file; the final partial segment is rounded up to a multiple of 8.
   Whatever sfm_width_to_bytes() reports beyond that is padding. */
1181 int bytes_read = (sv->width / max_chunk * 256
1182 + ROUND_UP (sv->width % max_chunk, 8));
1183 int total_bytes = sfm_width_to_bytes (sv->width);
1184 int excess_bytes = total_bytes - bytes_read;
1186 while (excess_bytes > 0)
1189 size_t chunk = MIN (sizeof buffer, excess_bytes);
1190 if (!read_whole_strings (r, buffer, chunk))
1192 excess_bytes -= chunk;
/* Reports, through sys_error(), that the data in R stops in the
   middle of a case. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1213 /* Reads a number from R and stores its value in *D.
1214 If R is compressed, reads a compressed number;
1215 otherwise, reads a number in the regular way.
1216 Returns true if successful, false if end of file is
1217 reached immediately. */
1219 read_case_number (struct sfm_reader *r, double *d)
/* Uncompressed data: fetch the raw bytes of the number.
   try_read_bytes() returns false only for an immediate end-of-file;
   any other failure aborts inside it. */
1224 if (!try_read_bytes (r, flt64, sizeof flt64))
/* Convert from the file's floating-point format to the host's. */
1226 *d = flt64_to_double (r, flt64);
/* Compressed data: the value is decoded from the opcode stream.
   NOTE(review): the test that selects between the two paths is not
   visible in this listing; presumably it checks r->compressed. */
1230 return read_compressed_number (r, d);
1233 /* Reads LENGTH string bytes from R into S.
1234 Always reads a multiple of 8 bytes; if LENGTH is not a
1235 multiple of 8, then extra bytes are read and discarded without being written to S.
1237 Reads compressed strings if R is compressed.
1238 Returns true if successful, false if end of file is
1239 reached immediately. */
1241 read_case_string (struct sfm_reader *r, char *s, size_t length)
/* Split LENGTH into the bytes that fill complete 8-byte units (WHOLE)
   and the leftover bytes of a final, partly used unit (PARTIAL). */
1243 size_t whole = ROUND_DOWN (length, 8);
1244 size_t partial = length % 8;
/* Complete units are read straight into S. */
1248 if (!read_whole_strings (r, s, whole))
/* The final unit is read in full into the scratch buffer `bounce'
   (read_whole_strings() only accepts multiples of 8 bytes) ... */
1255 if (!read_whole_strings (r, bounce, sizeof bounce))
/* ... and only its first PARTIAL bytes are copied into S, so S is never
   written beyond LENGTH bytes. */
1261 memcpy (s + whole, bounce, partial);
1267 /* Reads and returns the next compression opcode from R. */
1269 read_opcode (struct sfm_reader *r)
/* Opcodes are only read from compressed files. */
1271 assert (r->compressed);
/* r->opcodes buffers a block of opcode bytes read from the file in one
   call.  Once every buffered opcode has been handed out, refill it;
   try_read_bytes() returns false on an immediate end-of-file. */
1275 if (r->opcode_idx >= sizeof r->opcodes)
1277 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
/* Take the next buffered opcode and advance past it. */
1281 opcode = r->opcodes[r->opcode_idx++];
1288 /* Reads a compressed number from R and stores its value in D.
1289 Returns true if successful, false if end of file is
1290 reached immediately. */
1292 read_compressed_number (struct sfm_reader *r, double *d)
/* The opcode decides how the number is represented.  NOTE(review): the
   case labels are not visible in this listing, so the specific opcode
   values handled by each statement below cannot be confirmed here. */
1294 int opcode = read_opcode (r);
/* The number is stored in the data stream as a full 64-bit float. */
1302 *d = read_flt64 (r);
/* An opcode that is not acceptable for a number: report corruption.
   sys_error() does not return. */
1306 sys_error (r, _("Compressed data is corrupt."));
/* The number is encoded in the opcode itself, offset by the file's
   bias. */
1313 *d = opcode - r->bias;
1320 /* Reads a compressed 8-byte string segment from R and stores it in DST.
1322 Returns true if successful, false if end of file is
1323 reached immediately. */
1325 read_compressed_string (struct sfm_reader *r, char *dst)
/* Dispatch on the next opcode.  NOTE(review): the case labels are not
   visible in this listing. */
1327 switch (read_opcode (r))
/* The 8 bytes of the segment are stored literally in the data stream.
   read_bytes() aborts on end-of-file, so a segment cut short is an
   error. */
1334 read_bytes (r, dst, 8);
/* The segment is all spaces and takes no bytes in the data stream. */
1338 memset (dst, ' ', 8);
/* An opcode that is not acceptable for string data: report corruption.
   sys_error() does not return. */
1342 sys_error (r, _("Compressed data is corrupt."));
1348 /* Reads LENGTH string bytes from R into S.
1349 LENGTH must be a multiple of 8.
1350 Reads compressed strings if R is compressed.
1351 Returns true if successful, false if end of file is
1352 reached immediately. */
1354 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
/* Callers must pass a whole number of 8-byte units; read_case_string()
   handles any remainder with a bounce buffer. */
1356 assert (length % 8 == 0);
/* Uncompressed data: all LENGTH bytes are read in a single call.
   NOTE(review): the test selecting this path is not visible in this
   listing; presumably it checks r->compressed. */
1358 return try_read_bytes (r, s, length);
/* Compressed data: decode one 8-byte segment at a time into S. */
1362 for (ofs = 0; ofs < length; ofs += 8)
1363 if (!read_compressed_string (r, s + ofs))
1373 /* Creates and returns a table that can be used for translating a value
1374 index into a case to a "struct variable *" for DICT. Multiple
1375 system file fields reference variables this way.
1377 This table must be created before processing the very long
1378 string extension record, because that record causes some
1379 values to be deleted from the case and the dictionary to be compacted. */
1381 static struct variable **
1382 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1384 struct variable **var_by_value_idx;
/* One table slot per value in a case (r->value_cnt of them).  The table
   is allocated from R's pool. */
1388 var_by_value_idx = pool_nmalloc (r->pool,
1389 r->value_cnt, sizeof *var_by_value_idx);
1390 for (i = 0; i < dict_get_var_cnt (dict); i++)
1392 struct variable *v = dict_get_var (dict, i);
/* Number of slots V occupies: 1 for a numeric variable, otherwise its
   string width divided by 8, rounded up. */
1393 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* The first slot of a variable points at the variable itself; the
   remaining slots of a multi-value string are NULL, which is how
   lookup_var_by_value_idx() can tell a continuation from a start. */
1396 var_by_value_idx[value_idx++] = v;
1397 for (j = 1; j < nv; j++)
1398 var_by_value_idx[value_idx++] = NULL;
/* The variables' slots must add up to exactly the table size. */
1400 assert (value_idx == r->value_cnt);
1402 return var_by_value_idx;
1405 /* Returns the "struct variable" corresponding to the given
1406 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index is valid. */
1408 static struct variable *
1409 lookup_var_by_value_idx (struct sfm_reader *r,
1410 struct variable **var_by_value_idx, int value_idx)
1412 struct variable *var;
/* Reject indexes outside 1...r->value_cnt.  sys_error() does not
   return, so the table access below is always in bounds. */
1414 if (value_idx < 1 || value_idx > r->value_cnt)
1415 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1416 value_idx, r->value_cnt);
/* VALUE_IDX is 1-based; the table is 0-based. */
1418 var = var_by_value_idx[value_idx - 1];
/* NOTE(review): the condition guarding this error is not visible in
   this listing.  Presumably it is a NULL entry, which
   make_var_by_value_idx() stores for every slot of a string variable
   after its first. */
1420 sys_error (r, _("Variable index %d refers to long string "
1427 /* Returns the variable in D with the given SHORT_NAME,
1428 or a null pointer if there is none. */
1429 static struct variable *
1430 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1432 struct variable *var;
1436 /* First try looking up by full name. This often succeeds. */
1437 var = dict_lookup_var (d, short_name);
/* A hit on the full name only counts if that variable's short name also
   equals SHORT_NAME, compared case-insensitively. */
1438 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1441 /* Iterate through the whole dictionary as a fallback. */
1442 var_cnt = dict_get_var_cnt (d);
1443 for (i = 0; i < var_cnt; i++)
1445 var = dict_get_var (d, i);
/* Same case-insensitive comparison on each variable's short name. */
1446 if (!strcasecmp (var_get_short_name (var), short_name))
1453 /* Helpers for reading records that contain "variable=value" pairs. */
/* Parsing state for one record: the record's bytes plus a cursor.
   open_variable_to_value_map() fills in BUFFER, and
   read_variable_to_value_map() advances POS as it consumes pairs. */
1457 struct variable_to_value_map
1459 struct substring buffer; /* Record contents. */
1460 size_t pos; /* Current position in buffer. */
1463 /* Reads SIZE bytes into a "variable=value" map for R,
1464 and returns the map. */
1465 static struct variable_to_value_map *
1466 open_variable_to_value_map (struct sfm_reader *r, size_t size)
/* Both the map and its buffer are allocated from R's pool. */
1468 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated: read_variable_to_value_map()
   stores a null terminator just past the end of each token, which for a
   token ending at the end of the record is buffer[size]. */
1469 char *buffer = pool_malloc (r->pool, size + 1);
/* read_bytes() aborts on a short read, so the whole record is present
   once it returns. */
1470 read_bytes (r, buffer, size);
/* The substring covers only the SIZE bytes read, not the spare byte. */
1471 map->buffer = ss_buffer (buffer, size);
1476 /* Closes MAP and frees its storage.
1477 Not really needed, because the pool will free the map anyway,
1478 but can be used to free it earlier. */
1480 close_variable_to_value_map (struct sfm_reader *r,
1481 struct variable_to_value_map *map)
/* Return the record buffer to R's pool.  NOTE(review): the visible code
   releases only the buffer, not MAP itself. */
1483 pool_free (r->pool, ss_data (map->buffer));
1486 /* Reads the next variable=value pair from MAP.
1487 Looks up the variable in DICT and stores it into *VAR.
1488 Stores a null-terminated value into *VALUE. */
1490 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1491 struct variable_to_value_map *map,
1492 struct variable **var, char **value,
/* At most this many "unknown variable" warnings are issued one by one.
   *WARNING_CNT counts every occurrence, and the excess is reported as a
   single summary when the record is exhausted. */
1495 int max_warnings = 5;
1499 struct substring short_name_ss, value_ss;
/* Take the short name up to '=' and then the value up to a tab or null
   byte, advancing map->pos.  If either token cannot be obtained, the
   record is exhausted, and the warnings that were counted but not
   displayed are summarized. */
1501 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1502 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1505 if (*warning_cnt > max_warnings)
1506 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1507 *warning_cnt - max_warnings);
/* Move past any further tab/null separator bytes so that the next call
   starts at the next pair (presumably what ss_span() measures --
   confirm against its definition). */
1511 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1512 ss_buffer ("\t\0", 2));
/* Null-terminate the name in place inside the record buffer so that it
   can be used as a C string for the lookup. */
1514 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1515 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Unknown variable: count it, but display a warning only for the first
   max_warnings occurrences.  The limit is max_warnings rather than a
   repeated literal so that it cannot drift from the summary above. */
1518 if (++*warning_cnt <= max_warnings)
1519 sys_warn (r, _("Variable map refers to unknown variable %s."),
1520 ss_data (short_name_ss));
/* Null-terminate the value in place and hand it to the caller; *VALUE
   points into the map's buffer. */
1524 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1525 *value = ss_data (value_ss);
1533 /* Displays a corruption message. */
1535 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
/* Build the message text: the file's name and the stream's current
   offset (from ftell()) first, then the caller's formatted message. */
1540 ds_init_empty (&text);
1541 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1542 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1543 ds_put_vformat (&text, format, args);
/* Fill in the message descriptor.  Category and severity are derived
   from CLASS; no file name or line number is attached as a location. */
1545 m.category = msg_class_to_category (class);
1546 m.severity = msg_class_to_severity (class);
1547 m.where.file_name = NULL;
1548 m.where.line_number = 0;
1549 m.text = ds_cstr (&text);
1554 /* Displays a warning for the current file position. */
1556 sys_warn (struct sfm_reader *r, const char *format, ...)
/* Forward the printf-style arguments to sys_msg() with message class
   MW. */
1560 va_start (args, format);
1561 sys_msg (r, MW, format, args);
1565 /* Displays an error for the current file position,
1566 marks it as in an error state,
1567 and aborts reading it using longjmp. */
1569 sys_error (struct sfm_reader *r, const char *format, ...)
/* Forward the printf-style arguments to sys_msg() with message class
   ME. */
1573 va_start (args, format);
1574 sys_msg (r, ME, format, args);
/* Transfer control to wherever r->bail_out was set up; this function
   never returns to its caller. */
1578 longjmp (r->bail_out, 1);
1581 /* Reads BYTE_CNT bytes into BUF.
1582 Returns true if exactly BYTE_CNT bytes are successfully read.
1583 Aborts if an I/O error or a partial read occurs.
1584 If EOF_IS_OK, then an immediate end-of-file causes false to be
1585 returned; otherwise, immediate end-of-file causes an abort too. */
1588 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1589 void *buf, size_t byte_cnt)
1591 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
/* Everything requested was read. */
1592 if (bytes_read == byte_cnt)
/* Short read caused by a stream error: report the system's message for
   errno.  sys_error() does not return. */
1594 else if (ferror (r->file))
1595 sys_error (r, _("System error: %s."), strerror (errno));
/* Short read at end-of-file.  This is tolerated only when the caller
   allows EOF and not a single byte was read; an item that was started
   but not completed is always an error. */
1596 else if (!eof_is_ok || bytes_read != 0)
1597 sys_error (r, _("Unexpected end of file."));
1602 /* Reads BYTE_CNT into BUF.
1603 Aborts upon I/O error or if end-of-file is encountered. */
1605 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* eof_is_ok = false: even an immediate end-of-file aborts through
   sys_error(), so when this call returns BUF holds BYTE_CNT bytes. */
1607 read_bytes_internal (r, false, buf, byte_cnt);
1610 /* Reads BYTE_CNT bytes into BUF.
1611 Returns true if exactly BYTE_CNT bytes are successfully read.
1612 Returns false if an immediate end-of-file is encountered.
1613 Aborts if an I/O error or a partial read occurs. */
1615 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* eof_is_ok = true: an end-of-file before any byte is read is reported
   through the return value instead of aborting. */
1617 return read_bytes_internal (r, true, buf, byte_cnt);
1620 /* Reads a 32-bit signed integer from R and returns its value in host format. */
1623 read_int32 (struct sfm_reader *r)
/* read_bytes() aborts on end-of-file or I/O error, so there is no
   failure return to check here. */
1626 read_bytes (r, int32, sizeof int32);
/* Convert from the file's integer format to the host's. */
1627 return int32_to_native (r, int32);
1630 /* Reads a 64-bit floating-point number from R and returns its
1631 value in host format. */
1633 read_flt64 (struct sfm_reader *r)
/* read_bytes() aborts on end-of-file or I/O error, so there is no
   failure return to check here. */
1636 read_bytes (r, flt64, sizeof flt64);
/* Convert from the file's floating-point format to the host's. */
1637 return flt64_to_double (r, flt64);
1640 /* Reads exactly SIZE - 1 bytes into BUFFER
1641 and stores a null byte into BUFFER[SIZE - 1]. */
1643 read_string (struct sfm_reader *r, char *buffer, size_t size)
/* BUFFER must have room for SIZE bytes: SIZE - 1 bytes from the file
   plus the terminator.  SIZE must be at least 1, since SIZE - 1 would
   otherwise wrap around.  NOTE(review): a check for that may sit on a
   line not visible in this listing. */
1646 read_bytes (r, buffer, size - 1);
1647 buffer[size - 1] = '\0';
1650 /* Skips BYTES bytes forward in R. */
1652 skip_bytes (struct sfm_reader *r, size_t bytes)
/* Skipping is done by reading into a scratch buffer and discarding the
   data, at most sizeof buffer bytes per read.  read_bytes() aborts if
   the file ends first. */
1657 size_t chunk = MIN (sizeof buffer, bytes);
1658 read_bytes (r, buffer, chunk);
1663 /* Returns the value of the 32-bit signed integer at INT32,
1664 converted from the format used by R to the host format. */
1666 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* The file already uses the host's integer format: copy the bytes
   unchanged. */
1669 if (r->integer_format == INTEGER_NATIVE)
1670 memcpy (&x, int32, sizeof x);
/* Otherwise decode the bytes according to the file's integer format. */
1672 x = integer_get (r->integer_format, int32, sizeof x);
1676 /* Returns the value of the 64-bit floating point number at
1677 FLT64, converted from the format used by R to the host format. */
1680 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
1683 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1684 memcpy (&x, flt64, sizeof x);
1686 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);