1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA. */
22 #include "sys-file-reader.h"
23 #include "sys-file-private.h"
31 #include <libpspp/alloc.h>
32 #include <libpspp/assertion.h>
33 #include <libpspp/message.h>
34 #include <libpspp/compiler.h>
35 #include <libpspp/magic.h>
36 #include <libpspp/misc.h>
37 #include <libpspp/pool.h>
38 #include <libpspp/str.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/array.h>
43 #include "dictionary.h"
44 #include "file-handle-def.h"
45 #include "file-name.h"
47 #include "missing-values.h"
48 #include "value-labels.h"
55 #include "unlocked-io.h"
59 #define _(msgid) gettext (msgid)
60 #define N_(msgid) (msgid)
62 /* System file reader. */
/* State kept for one open system file.  The members fall into four
   groups: resource tracking, file state, file format, and
   decompression. */
65 /* Resource tracking. */
66 struct pool *pool; /* All system file state. */
67 jmp_buf bail_out; /* longjmp() target for error handling. */
/* File state.  The error flag is what sfm_read_error() reports. */
70 struct file_handle *fh; /* File handle. */
71 FILE *file; /* File stream. */
72 bool error; /* I/O or corruption error? */
/* File format, as detected by read_header() and the dictionary
   records.  has_vls disables the direct-read path in
   sfm_read_case(). */
75 enum integer_format integer_format; /* On-disk integer format. */
76 enum float_format float_format; /* On-disk floating point format. */
77 int value_cnt; /* Number of 8-byte units per case. */
78 struct sfm_var *vars; /* Variables. */
79 size_t var_cnt; /* Number of variables. */
80 bool has_vls; /* File has one or more very long strings? */
/* Decompression.  opcode_idx starts out equal to sizeof opcodes (see
   sfm_open_reader()), i.e. "no opcodes left". */
83 bool compressed; /* File is compressed? */
84 double bias; /* Compression bias, usually 100.0. */
85 uint8_t opcodes[8]; /* Current block of opcodes. */
86 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
89 /* A variable in a system file. */
/* Only the width and the case index are kept.  sfm_open_reader() copies
   them out of the dictionary so that reading cases does not depend on
   dictionary variables that the caller may later modify or destroy. */
92 int width; /* 0=numeric, otherwise string width. */
93 int case_index; /* Index into case. */
/* Translation between value indexes and dictionary variables. */
96 static struct variable **make_var_by_value_idx (struct sfm_reader *,
98 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
/* Diagnostics.  For sys_warn, PRINTF_FORMAT (2, 3) names argument 2 as
   the format string and argument 3 as the first variadic argument. */
102 static void sys_warn (struct sfm_reader *, const char *, ...)
103 PRINTF_FORMAT (2, 3);
105 static void sys_error (struct sfm_reader *, const char *, ...)
/* Low-level input helpers. */
109 static void read_bytes (struct sfm_reader *, void *, size_t);
110 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
111 static int32_t read_int32 (struct sfm_reader *);
112 static double read_flt64 (struct sfm_reader *);
113 static void read_string (struct sfm_reader *, char *, size_t);
114 static void skip_bytes (struct sfm_reader *, size_t);
/* Conversion of raw 4- and 8-byte fields, using the formats recorded in
   the reader. */
116 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
117 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
/* Variable-to-value maps, used by read_long_var_name_map() and
   read_long_string_map(). */
119 static struct variable_to_value_map *open_variable_to_value_map (
120 struct sfm_reader *, size_t size);
121 static void close_variable_to_value_map (struct sfm_reader *r,
122 struct variable_to_value_map *);
123 static bool read_variable_to_value_map (struct sfm_reader *,
125 struct variable_to_value_map *,
126 struct variable **var, char **value,
129 /* Dictionary reader. */
/* One reader per dictionary record kind; all are called, directly or
   indirectly, from sfm_open_reader(). */
137 static void read_header (struct sfm_reader *, struct dictionary *,
138 int *weight_idx, int *claimed_value_cnt,
139 struct sfm_read_info *);
140 static void read_variable_record (struct sfm_reader *, struct dictionary *,
141 int *format_warning_cnt);
142 static void parse_format_spec (struct sfm_reader *, uint32_t,
143 enum which_format, struct variable *,
144 int *format_warning_cnt);
145 static void setup_weight (struct sfm_reader *, int weight_idx,
146 struct variable **var_by_value_idx,
147 struct dictionary *);
148 static void read_documents (struct sfm_reader *, struct dictionary *);
149 static void read_value_labels (struct sfm_reader *, struct dictionary *,
150 struct variable **var_by_value_idx);
/* Type 7 (extension) records: read_extension_record() dispatches on the
   subtype to the readers declared after it. */
152 static void read_extension_record (struct sfm_reader *, struct dictionary *);
153 static void read_machine_int32_info (struct sfm_reader *,
154 size_t size, size_t count);
155 static void read_machine_flt64_info (struct sfm_reader *,
156 size_t size, size_t count);
157 static void read_display_parameters (struct sfm_reader *,
158 size_t size, size_t count,
159 struct dictionary *);
160 static void read_long_var_name_map (struct sfm_reader *,
161 size_t size, size_t count,
162 struct dictionary *);
163 static void read_long_string_map (struct sfm_reader *,
164 size_t size, size_t count,
165 struct dictionary *);
168 /* Opens the system file designated by file handle FH for
169 reading. Reads the system file's dictionary into *DICT.
170 If INFO is non-null, then it receives additional info about the
   system file. */
173 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
174 struct sfm_read_info *info)
/* R is volatile-qualified: it is assigned before setjmp() below and is
   used again on the path taken when setjmp() returns via longjmp(). */
176 struct sfm_reader *volatile r = NULL;
177 struct variable **var_by_value_idx;
178 int format_warning_cnt = 0;
180 int claimed_value_cnt;
184 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
187 *dict = dict_create ();
189 /* Create and initialize reader. */
190 r = pool_create_container (struct sfm_reader, pool);
192 r->file = fn_open (fh_get_file_name (fh), "rb");
/* Start with the opcode block marked as used up, so that read_opcode()
   fetches a fresh block before interpreting anything. */
196 r->opcode_idx = sizeof r->opcodes;
/* Error landing pad: a longjmp() to r->bail_out arrives here, where the
   reader is closed and the new dictionary is destroyed. */
198 if (setjmp (r->bail_out))
200 sfm_close_reader (r);
201 dict_destroy (*dict);
/* Report the open failure, then leave through the landing pad above. */
208 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
209 fh_get_file_name (r->fh), strerror (errno));
210 longjmp (r->bail_out, 1);
214 read_header (r, *dict, &weight_idx, &claimed_value_cnt, info);
216 /* Read all the variable definition records. */
217 rec_type = read_int32 (r);
218 while (rec_type == 2)
220 read_variable_record (r, *dict, &format_warning_cnt);
221 rec_type = read_int32 (r);
224 /* Figure out the case format. */
225 var_by_value_idx = make_var_by_value_idx (r, *dict);
226 setup_weight (r, weight_idx, var_by_value_idx, *dict);
228 /* Read all the rest of the dictionary records. */
229 while (rec_type != 999)
234 read_value_labels (r, *dict, var_by_value_idx);
238 sys_error (r, _("Misplaced type 4 record."));
241 read_documents (r, *dict);
245 read_extension_record (r, *dict);
249 sys_error (r, _("Unrecognized record type %d."), rec_type);
251 rec_type = read_int32 (r);
254 /* Read record 999 data, which is just filler. */
/* read_header() stores -1 when the header's claim is out of range; in
   that case there is nothing to compare against. */
257 if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt)
258 sys_warn (r, _("File header claims %d variable positions but "
259 "%d were read from file."),
260 claimed_value_cnt, r->value_cnt);
262 /* Create an index of dictionary variable widths for
263 sfm_read_case to use. We cannot use the `struct variable's
264 from the dictionary we created, because the caller owns the
265 dictionary and may destroy or modify its variables. */
266 r->var_cnt = dict_get_var_cnt (*dict);
267 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
268 for (i = 0; i < r->var_cnt; i++)
270 struct variable *v = dict_get_var (*dict, i);
271 struct sfm_var *sv = &r->vars[i];
272 sv->width = var_get_width (v);
273 sv->case_index = var_get_case_index (v);
/* The value-index table is only used while the dictionary is being
   read, so it is released here. */
276 pool_free (r->pool, var_by_value_idx);
280 /* Closes a system file after we're done with it. */
282 sfm_close_reader (struct sfm_reader *r)
/* A failure to close the stream is reported as an error message that
   includes the strerror() text for errno. */
289 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
290 msg (ME, _("Error closing system file \"%s\": %s."),
291 fh_get_file_name (r->fh), strerror (errno));
/* Release the file handle with the same type string and mode that
   sfm_open_reader() passed to fh_open(). */
296 fh_close (r->fh, "system file", "rs");
/* NOTE(review): R was created with pool_create_container(), so
   destroying the pool presumably frees R itself as well -- confirm
   against the pool implementation. */
298 pool_destroy (r->pool);
301 /* Returns true if an I/O error has occurred on READER, false
   otherwise. */
304 sfm_read_error (const struct sfm_reader *reader)
/* Pure accessor: the flag is returned as stored and is not cleared. */
306 return reader->error;
309 /* Returns true if FILE is an SPSS system file,
   false otherwise. */
312 sfm_detect (FILE *file)
/* Reads 4 bytes from FILE's current position; a short read means this
   cannot be a system file. */
316 if (fread (rec_type, 4, 1, file) != 1)
/* The same "$FL2" signature is checked by read_header(). */
320 return !strcmp ("$FL2", rec_type);
323 /* Reads the global header of the system file.
324 Sets DICT's file label to the system file's label.
325 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
326 or to the value index of the weight variable otherwise.
327 Sets *CLAIMED_VALUE_CNT to the number of values that the file
328 claims to have (although it is not always correct).
329 If INFO is non-null, initializes *INFO with header
   information. */
332 read_header (struct sfm_reader *r, struct dictionary *dict,
333 int *weight_idx, int *claimed_value_cnt,
334 struct sfm_read_info *info)
337 char eye_catcher[61];
338 uint8_t raw_layout_code[4];
341 char creation_date[10];
342 char creation_time[9];
344 struct substring file_label_ss;
/* The header begins with the record signature followed by the product
   identification ("eye catcher") string. */
346 read_string (r, rec_type, sizeof rec_type);
347 read_string (r, eye_catcher, sizeof eye_catcher);
349 if (strcmp ("$FL2", rec_type) != 0)
350 sys_error (r, _("This is not an SPSS system file."));
352 /* Identify integer format. */
353 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
/* The layout code must decode as 2 or 3, and the resulting byte order
   must be one of the two supported ones. */
354 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
356 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
358 || (r->integer_format != INTEGER_MSB_FIRST
359 && r->integer_format != INTEGER_LSB_FIRST))
360 sys_error (r, _("This is not an SPSS system file."));
/* An out-of-range claim is replaced by -1; sfm_open_reader() skips its
   consistency check for that value. */
362 *claimed_value_cnt = read_int32 (r);
363 if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16)
364 *claimed_value_cnt = -1;
366 r->compressed = read_int32 (r) != 0;
368 *weight_idx = read_int32 (r);
370 case_cnt = read_int32 (r);
371 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
374 /* Identify floating-point format and obtain compression bias. */
375 read_bytes (r, raw_bias, sizeof raw_bias);
376 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
378 sys_warn (r, _("Compression bias (%g) is not the usual "
379 "value of 100, or system file uses unrecognized "
380 "floating-point format."),
/* Fall back to IEEE double in the byte order already detected for
   integers. */
382 if (r->integer_format == INTEGER_MSB_FIRST)
383 r->float_format = FLOAT_IEEE_DOUBLE_BE;
385 r->float_format = FLOAT_IEEE_DOUBLE_LE;
387 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
389 read_string (r, creation_date, sizeof creation_date);
390 read_string (r, creation_time, sizeof creation_time);
391 read_string (r, file_label, sizeof file_label);
/* Trim spaces from the label; a label that is empty after trimming is
   not stored. */
394 file_label_ss = ss_cstr (file_label);
395 ss_trim (&file_label_ss, ss_cstr (" "));
396 if (!ss_is_empty (file_label_ss))
/* Terminate the trimmed label in place so that it can be passed on as a
   C string. */
398 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
399 dict_set_label (dict, ss_data (file_label_ss));
404 struct substring product;
406 strcpy (info->creation_date, creation_date);
407 strcpy (info->creation_time, creation_time);
408 info->integer_format = r->integer_format;
409 info->float_format = r->float_format;
410 info->compressed = r->compressed;
411 info->case_cnt = case_cnt;
/* The product name is the eye catcher with the fixed
   "@(#) SPSS DATA FILE" prefix and surrounding spaces removed,
   truncated to fit info->product. */
413 product = ss_cstr (eye_catcher);
414 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
415 ss_trim (&product, ss_cstr (" "));
416 str_copy_buf_trunc (info->product, sizeof info->product,
417 ss_data (product), ss_length (product));
421 /* Reads a variable (type 2) record from R and adds the
422 corresponding variable to DICT.
423 Also skips past additional variable records for long string
   variables. */
426 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
427 int *format_warning_cnt)
430 int has_variable_label;
431 int missing_value_code;
436 struct variable *var;
/* Fixed part of the record: width, label flag, missing-value code, the
   two format words, and the name. */
439 width = read_int32 (r);
440 has_variable_label = read_int32 (r);
441 missing_value_code = read_int32 (r);
442 print_format = read_int32 (r);
443 write_format = read_int32 (r);
444 read_string (r, name, sizeof name);
/* Cut the name at its first space. */
445 name[strcspn (name, " ")] = '\0';
447 /* Check variable name. */
448 if (name[0] == '$' || name[0] == '#')
449 sys_error (r, "Variable name begins with invalid character `%c'.",
451 if (!var_is_plausible_name (name, false))
452 sys_error (r, _("Invalid variable name `%s'."), name);
454 /* Create variable. */
455 if (width < 0 || width > 255)
456 sys_error (r, _("Bad variable width %d."), width);
457 var = dict_create_var (dict, name, width);
460 _("Duplicate variable name `%s' within system file."),
463 /* Set the short name the same as the long name */
464 var_set_short_name (var, var_get_name (var));
466 /* Get variable label, if any. */
467 if (has_variable_label != 0 && has_variable_label != 1)
468 sys_error (r, _("Variable label indicator field is not 0 or 1."));
469 if (has_variable_label == 1)
474 len = read_int32 (r);
/* LEN + 1 bytes are handed to read_string() below, so LEN must be
   strictly smaller than the label buffer. */
475 if (len >= sizeof label)
476 sys_error (r, _("Variable %s has label of invalid length %d."),
478 read_string (r, label, len + 1);
479 var_set_label (var, label);
/* Skip the padding that rounds the label up to a multiple of 4 bytes. */
481 skip_bytes (r, ROUND_UP (len, 4) - len);
484 /* Set missing values. */
/* A positive code is a count of discrete values (1 to 3); -2 is a
   range; -3 is a range followed by one discrete value. */
485 if (missing_value_code < -3 || missing_value_code > 3
486 || missing_value_code == -1)
487 sys_error (r, _("Missing value indicator field is not "
488 "-3, -2, 0, 1, 2, or 3."));
489 if (missing_value_code != 0)
491 struct missing_values mv;
492 mv_init (&mv, var_get_width (var));
493 if (var_is_numeric (var))
495 if (missing_value_code > 0)
498 for (i = 0; i < missing_value_code; i++)
499 mv_add_num (&mv, read_flt64 (r));
503 double low = read_flt64 (r);
504 double high = read_flt64 (r);
505 mv_add_num_range (&mv, low, high);
506 if (missing_value_code == -3)
507 mv_add_num (&mv, read_flt64 (r));
510 else if (var_get_width (var) <= MAX_SHORT_STRING)
512 if (missing_value_code > 0)
515 for (i = 0; i < missing_value_code; i++)
518 read_string (r, string, sizeof string);
519 mv_add_str (&mv, string);
523 sys_error (r, _("String variable %s may not have missing "
524 "values specified as a range."),
527 else /* var->width > MAX_SHORT_STRING */
528 sys_error (r, _("Long string variable %s may not have missing "
531 var_set_missing_values (var, &mv);
/* Both format words are validated against the variable just created;
   FORMAT_WARNING_CNT is shared across all variable records. */
535 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
536 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
538 /* Account for values.
539 Skip long string continuation records, if any. */
/* A numeric variable takes one 8-byte value; a string takes one value
   per 8 bytes of width, rounded up. */
540 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
546 for (i = 1; i < nv; i++)
548 /* Check for record type 2 and width -1. */
549 if (read_int32 (r) != 2 || read_int32 (r) != -1)
550 sys_error (r, _("Missing string continuation record."));
552 /* Skip and ignore remaining continuation data. */
553 has_variable_label = read_int32 (r);
554 missing_value_code = read_int32 (r);
555 print_format = read_int32 (r);
556 write_format = read_int32 (r);
557 read_string (r, name, sizeof name);
559 /* Variable label fields on continuation records have
560 been spotted in system files created by "SPSS Power
561 Macintosh Release 6.1". */
562 if (has_variable_label)
563 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
568 /* Translates the format spec from sysfile format to internal
571 parse_format_spec (struct sfm_reader *r, uint32_t s,
572 enum which_format which, struct variable *v,
573 int *format_warning_cnt)
575 const int max_format_warnings = 8;
577 uint8_t raw_type = s >> 16;
583 if (!fmt_from_io (raw_type, &f.type))
584 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
589 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
594 if (which == PRINT_FORMAT)
595 var_set_print_format (v, &f);
597 var_set_write_format (v, &f);
599 else if (*++format_warning_cnt <= max_format_warnings)
601 char fmt_string[FMT_STRING_LEN_MAX + 1];
602 sys_warn (r, _("%s variable %s has invalid %s format %s."),
603 var_is_numeric (v) ? _("Numeric") : _("String"),
605 which == PRINT_FORMAT ? _("print") : _("write"),
606 fmt_to_string (&f, fmt_string));
608 if (*format_warning_cnt == max_format_warnings)
609 sys_warn (r, _("Suppressing further invalid format warnings."));
613 /* Sets the weighting variable in DICT to the variable
614 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
   nonzero. */
617 setup_weight (struct sfm_reader *r, int weight_idx,
618 struct variable **var_by_value_idx, struct dictionary *dict)
/* WEIGHT_IDX is a value index, not a dictionary index, so it is
   translated through VAR_BY_VALUE_IDX. */
622 struct variable *weight_var
623 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
/* Only a numeric variable is accepted as the weight. */
624 if (var_is_numeric (weight_var))
625 dict_set_weight (dict, weight_var);
627 sys_error (r, _("Weighting variable must be numeric."));
631 /* Reads a document record, type 6, from system file R, and sets up
632 the documents and n_documents fields in the associated
635 read_documents (struct sfm_reader *r, struct dictionary *dict)
640 if (dict_get_documents (dict) != NULL)
641 sys_error (r, _("Multiple type 6 (document) records."));
643 line_cnt = read_int32 (r);
645 sys_error (r, _("Number of document lines (%d) "
646 "must be greater than 0."), line_cnt);
648 documents = pool_nmalloc (r->pool, line_cnt + 1, 80);
649 read_string (r, documents, 80 * line_cnt + 1);
650 dict_set_documents (dict, documents);
651 pool_free (r->pool, documents);
654 /* Read a type 7 extension record. */
/* Every extension record starts with a subtype, an element size and an
   element count; the payload is SIZE * COUNT bytes.  Subtypes with a
   dedicated reader are passed SIZE and COUNT; for the remaining ones
   the payload is skipped, with a warning if the subtype is not
   recognized. */
656 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
658 int subtype = read_int32 (r);
659 size_t size = read_int32 (r);
660 size_t count = read_int32 (r);
661 size_t bytes = size * count;
663 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
664 allows an extra byte for a null terminator, used by some
665 extension processing routines. */
666 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
667 sys_error (r, "Record type 7 subtype %d too large.", subtype);
672 read_machine_int32_info (r, size, count);
676 read_machine_flt64_info (r, size, count);
680 /* Variable sets information. We don't use these yet.
681 They only apply to GUIs; see VARSETS on the APPLY
682 DICTIONARY command in SPSS documentation. */
686 /* DATE variable information. We don't use it yet, but we
691 /* Unknown purpose. */
695 read_display_parameters (r, size, count, dict);
699 read_long_var_name_map (r, size, count, dict);
703 read_long_string_map (r, size, count, dict);
707 /* New in SPSS v14? Unknown purpose. */
711 /* Text field that defines variable attributes. New in
716 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
720 skip_bytes (r, bytes);
723 /* Read record type 7, subtype 3. */
725 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
727 int version_major UNUSED = read_int32 (r);
728 int version_minor UNUSED = read_int32 (r);
729 int version_revision UNUSED = read_int32 (r);
730 int machine_code UNUSED = read_int32 (r);
731 int float_representation = read_int32 (r);
732 int compression_code UNUSED = read_int32 (r);
733 int integer_representation = read_int32 (r);
734 int character_code UNUSED = read_int32 (r);
736 int expected_float_format;
737 int expected_integer_format;
739 if (size != 4 || count != 8)
740 sys_error (r, _("Bad size (%d) or count (%d) field on record type 7, "
744 /* Check floating point format. */
745 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
746 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
747 expected_float_format = 1;
748 else if (r->float_format == FLOAT_Z_LONG)
749 expected_float_format = 2;
750 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
751 expected_float_format = 3;
754 if (float_representation != expected_float_format)
755 sys_error (r, _("Floating-point representation indicated by "
756 "system file (%d) differs from expected (%d)."),
757 r->float_format, expected_float_format);
759 /* Check integer format. */
760 if (r->integer_format == INTEGER_MSB_FIRST)
761 expected_integer_format = 1;
762 else if (r->integer_format == INTEGER_LSB_FIRST)
763 expected_integer_format = 2;
766 if (integer_representation != expected_integer_format)
768 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
769 sys_warn (r, _("Integer format indicated by system file (%s) "
770 "differs from expected (%s)."),
771 gettext (endian[integer_representation == 1]),
772 gettext (endian[expected_integer_format == 1]));
776 /* Read record type 7, subtype 4. */
778 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
780 double sysmis = read_flt64 (r);
781 double highest = read_flt64 (r);
782 double lowest = read_flt64 (r);
784 if (size != 8 || count != 3)
785 sys_error (r, _("Bad size (%d) or count (%d) on extension 4."),
788 if (sysmis != SYSMIS)
789 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
790 if (highest != HIGHEST)
791 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
792 if (lowest != LOWEST)
793 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
/* Read record type 7, subtype 11, which specifies how variables
   should be displayed in GUI environments.

   The record holds one (measure, width, alignment) triple of 32-bit
   integers per dictionary variable, in dictionary order.  An element
   size other than 4, or a COUNT that is not exactly three times the
   number of variables in DICT, is a fatal error.  A triple with an
   invalid measure or alignment is skipped, leaving that variable's
   parameters untouched; one warning is issued for the whole record. */
static void
read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
                         struct dictionary *dict)
{
  const size_t n_vars = count / 3 ;
  bool warned = false;
  size_t i;

  /* The loop below reads int32 fields, so any other element size would
     desynchronize the record stream.  SIZE and COUNT are size_t but the
     message uses %d, so convert explicitly. */
  if (size != 4 || count % 3 || n_vars != dict_get_var_cnt (dict))
    sys_error (r, _("Bad size (%d) or count (%d) on extension 11."),
               (int) size, (int) count);

  for (i = 0; i < n_vars; ++i)
    {
      int measure = read_int32 (r);
      int width = read_int32 (r);
      int align = read_int32 (r);
      struct variable *v;

      if (!measure_is_valid (measure) || !alignment_is_valid (align))
        {
          if (!warned)
            sys_warn (r, _("Invalid variable display parameters. "
                           "Default parameters substituted."));
          warned = true;
          continue;
        }

      v = dict_get_var (dict, i);
      var_set_measure (v, measure);
      var_set_display_width (v, width);
      var_set_alignment (v, align);
    }
}
833 /* Reads record type 7, subtype 13, which gives the long name
834 that corresponds to each short name. Modifies variable names
835 in DICT accordingly. */
/* Each map entry pairs a variable with a long name.  Entries with an
   invalid long name, or with a long name that already belongs to
   another variable, draw a warning. */
837 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
838 struct dictionary *dict)
840 struct variable_to_value_map *map;
841 struct variable *var;
845 map = open_variable_to_value_map (r, size * count);
846 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
/* Keep a private copy of the short name so that it can be restored
   after the rename further down. */
849 char short_name[SHORT_NAME_LEN + 1];
850 strcpy (short_name, var_get_short_name (var));
852 /* Validate long name. */
853 if (!var_is_valid_name (long_name, false))
855 sys_warn (r, _("Long variable mapping from %s to invalid "
856 "variable name `%s'."),
857 var_get_name (var), long_name);
861 /* Identify any duplicates. */
862 if (strcasecmp (short_name, long_name)
863 && dict_lookup_var (dict, long_name) != NULL)
865 sys_warn (r, _("Duplicate long variable name `%s' "
866 "within system file."), long_name);
870 /* Set long name. Renaming a variable may clear the short
871 name, but we want to retain it, so re-set it
873 dict_rename_var (dict, var, long_name);
874 var_set_short_name (var, short_name);
876 close_variable_to_value_map (r, map);
879 /* Reads record type 7, subtype 14, which gives the real length
880 of each very long string. Rearranges DICT accordingly. */
882 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
883 struct dictionary *dict)
885 struct variable_to_value_map *map;
886 struct variable *var;
892 map = open_variable_to_value_map (r, size * count);
893 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
896 long length, remaining_length;
/* strtol() returns LONG_MAX on overflow; that value is rejected along
   with lengths below MIN_VERY_LONG_STRING. */
900 length = strtol (length_s, NULL, 10);
901 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
903 sys_warn (r, _("%s listed as string of length %s "
905 var_get_name (var), length_s);
909 /* Group multiple variables into single variable
910 and delete all but the first. */
/* Walk forward from VAR, letting each segment variable account for at
   most EFFECTIVE_LONG_STRING_LENGTH bytes, until LENGTH is covered.
   Running off the end of the dictionary first is a fatal error. */
911 remaining_length = length;
912 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
913 if (idx < dict_get_var_cnt (dict))
914 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
915 EFFECTIVE_LONG_STRING_LENGTH);
917 sys_error (r, _("Very long string %s overflows dictionary."),
/* IDX is now one past the last segment; every segment after VAR itself
   is deleted. */
919 dict_delete_consecutive_vars (dict,
920 var_get_dict_index (var) + 1,
921 idx - var_get_dict_index (var) - 1);
923 /* Assign all the length to the first variable. */
924 var_set_width (var, length);
926 close_variable_to_value_map (r, map);
927 dict_compact_values (dict);
930 /* Reads value labels from system file R and inserts them into the
931 associated dictionary. */
/* Works in two steps: the type 3 record supplies raw (value, label)
   pairs, and the type 4 record that must follow names the variables
   they apply to.  The raw values can only be interpreted once the
   variables' type is known.  All temporary storage comes from a subpool
   that is destroyed at the end. */
933 read_value_labels (struct sfm_reader *r,
934 struct dictionary *dict, struct variable **var_by_value_idx)
936 struct pool *subpool;
940 char raw_value[8]; /* Value as uninterpreted bytes. */
941 union value value; /* Value. */
942 char *label; /* Null-terminated label string. */
945 struct label *labels = NULL;
946 int label_cnt; /* Number of labels. */
948 struct variable **var = NULL; /* Associated variables. */
949 int var_cnt; /* Number of associated variables. */
953 subpool = pool_create_subpool (r->pool);
955 /* Read the type 3 record and record its contents. We can't do
956 much with the data yet because we don't know whether it is
957 of numeric or string type. */
959 /* Read number of labels. */
960 label_cnt = read_int32 (r);
/* label_cnt is converted to the unsigned type of the right-hand side
   for this comparison, so a negative count is caught here too. */
962 if (label_cnt >= INT32_MAX / sizeof *labels)
964 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
969 /* Read each value/label tuple into labels[]. */
970 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
971 for (i = 0; i < label_cnt; i++)
973 struct label *label = labels + i;
974 unsigned char label_len;
978 read_bytes (r, label->raw_value, sizeof label->raw_value);
980 /* Read label length. */
981 read_bytes (r, &label_len, sizeof label_len);
/* The length byte and the label text together are padded to a multiple
   of 8 bytes. */
982 padded_len = ROUND_UP (label_len + 1, 8);
984 /* Read label, padding. */
985 label->label = pool_alloc (subpool, padded_len + 1);
986 read_bytes (r, label->label, padded_len - 1);
987 label->label[label_len] = 0;
990 /* Now, read the type 4 record that has the list of variables
991 to which the value labels are to be applied. */
993 /* Read record type of type 4 record. */
994 if (read_int32 (r) != 4)
995 sys_error (r, _("Variable index record (type 4) does not immediately "
996 "follow value label record (type 3) as it should."));
998 /* Read number of variables associated with value label from type 4
1000 var_cnt = read_int32 (r);
1001 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1002 sys_error (r, _("Number of variables associated with a value label (%d) "
1003 "is not between 1 and the number of variables (%d)."),
1004 var_cnt, dict_get_var_cnt (dict));
1006 /* Read the list of variables. */
1007 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1008 for (i = 0; i < var_cnt; i++)
1010 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1011 if (var_is_long_string (var[i]))
1012 sys_error (r, _("Value labels are not allowed on long string "
1013 "variables (%s)."), var_get_name (var[i]));
1016 /* Type check the variables. */
1017 for (i = 1; i < var_cnt; i++)
1018 if (var_get_type (var[i]) != var_get_type (var[0]))
1019 sys_error (r, _("Variables associated with value label are not all of "
1020 "identical type. Variable %s is %s, but variable "
1022 var_get_name (var[0]),
1023 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1024 var_get_name (var[i]),
1025 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1027 /* Fill in labels[].value, now that we know the desired type. */
1028 for (i = 0; i < label_cnt; i++)
1030 struct label *label = labels + i;
/* String values are copied with right padding; numeric values are
   converted from the file's floating-point format. */
1032 if (var_is_alpha (var[0]))
1033 buf_copy_rpad (label->value.s, sizeof label->value.s,
1034 label->raw_value, sizeof label->raw_value);
1036 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1039 /* Assign the `value_label's to each variable. */
1040 for (i = 0; i < var_cnt; i++)
1042 struct variable *v = var[i];
1045 /* Add each label to the variable. */
1046 for (j = 0; j < label_cnt; j++)
1048 struct label *label = &labels[j];
1049 if (!var_add_value_label (v, &label->value, label->label))
/* var[0] stands in for the whole group: the type check above rejects
   groups of mixed type. */
1051 if (var_is_numeric (var[0]))
1052 sys_warn (r, _("Duplicate value label for %g on %s."),
1053 label->value.f, var_get_name (v));
1055 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1056 var_get_width (v), label->value.s,
1062 pool_destroy (subpool);
/* Helpers used by sfm_read_case().  The bool-returning ones return
   false when end of file is reached immediately, as described in the
   comment above each definition. */
1067 static void partial_record (struct sfm_reader *r)
1069 static bool read_case_number (struct sfm_reader *, double *);
1070 static bool read_case_string (struct sfm_reader *, char *, size_t);
1071 static int read_opcode (struct sfm_reader *);
1072 static bool read_compressed_number (struct sfm_reader *, double *);
1073 static bool read_compressed_string (struct sfm_reader *, char *);
1074 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1076 /* Reads one case from READER's file into C. Returns nonzero
1077 only if successful. */
1079 sfm_read_case (struct sfm_reader *r, struct ccase *c)
/* Re-arm the bail-out target so that a longjmp() taken while reading
   this case returns here. */
1084 if (setjmp (r->bail_out))
/* The direct read is attempted only for uncompressed files without very
   long strings, and only where double is 8 bytes wide. */
1087 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1089 /* Fast path. Read the whole case directly. */
1090 if (!try_read_bytes (r, case_data_all_rw (c),
1091 sizeof (union value) * r->value_cnt))
1094 /* Convert floating point numbers to native format if needed. */
1095 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1099 for (i = 0; i < r->var_cnt; i++)
/* Width 0 marks a numeric variable; string data needs no conversion. */
1100 if (r->vars[i].width == 0)
1102 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1103 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1110 /* Slow path. Convert from external to internal format. */
1113 for (i = 0; i < r->var_cnt; i++)
1115 struct sfm_var *sv = &r->vars[i];
1116 union value *v = case_data_rw_idx (c, sv->case_index);
1120 if (!read_case_number (r, &v->f))
1125 /* Read the string data in segments up to 255 bytes
1126 at a time, packed into 8-byte units. */
1127 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1128 int ofs, chunk_size;
1129 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1131 chunk_size = MIN (max_chunk, sv->width - ofs);
1132 if (!read_case_string (r, v->s + ofs, chunk_size))
1140 /* Very long strings have trailing wasted space
1141 that we must skip. */
1142 if (sv->width >= MIN_VERY_LONG_STRING)
/* Each full chunk of max_chunk bytes counts as 256 bytes read; the
   final partial chunk counts as its size rounded up to a multiple of
   8.  Whatever sfm_width_to_bytes() reports beyond that is skipped. */
1144 int bytes_read = (sv->width / max_chunk * 256
1145 + ROUND_UP (sv->width % max_chunk, 8));
1146 int total_bytes = sfm_width_to_bytes (sv->width);
1147 int excess_bytes = total_bytes - bytes_read;
1149 while (excess_bytes > 0)
1152 size_t chunk = MIN (sizeof buffer, excess_bytes);
1153 if (!read_whole_strings (r, buffer, chunk))
1155 excess_bytes -= chunk;
1169 /* Issues an error that R ends in a partial record. */
/* Thin wrapper so that every call site reports the same message through
   sys_error(). */
1171 partial_record (struct sfm_reader *r)
1173 sys_error (r, _("File ends in partial case."));
1176 /* Reads a number from R and stores its value in *D.
1177 If R is compressed, reads a compressed number;
1178 otherwise, reads a number in the regular way.
1179 Returns true if successful, false if end of file is
1180 reached immediately. */
1182 read_case_number (struct sfm_reader *r, double *d)
/* Uncompressed data: read the raw 8 bytes and convert them from the
   file's floating-point format. */
1187 if (!try_read_bytes (r, flt64, sizeof flt64))
1189 *d = flt64_to_double (r, flt64);
/* Compressed data is decoded by the opcode interpreter. */
1193 return read_compressed_number (r, d);
1196 /* Reads LENGTH string bytes from R into S.
1197 Always reads a multiple of 8 bytes; if LENGTH is not a
1198 multiple of 8, then extra bytes are read and discarded without
   being written to S.
1200 Reads compressed strings if R is compressed.
1201 Returns true if successful, false if end of file is
1202 reached immediately. */
1204 read_case_string (struct sfm_reader *r, char *s, size_t length)
/* Split LENGTH into whole 8-byte units and a trailing remainder. */
1206 size_t whole = ROUND_DOWN (length, 8);
1207 size_t partial = length % 8;
/* Whole units go straight into S. */
1211 if (!read_whole_strings (r, s, whole))
/* The remainder is read as one full unit into a bounce buffer, and only
   its first PARTIAL bytes are copied out. */
1218 if (!read_whole_strings (r, bounce, sizeof bounce))
1224 memcpy (s + whole, bounce, partial);
1230 /* Reads and returns the next compression opcode from R. */
1232 read_opcode (struct sfm_reader *r)
1234 assert (r->compressed);
/* When the current block of opcodes is used up, fetch the next block
   from the file before taking an opcode from it. */
1238 if (r->opcode_idx >= sizeof r->opcodes)
1240 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1244 opcode = r->opcodes[r->opcode_idx++];
1251 /* Reads a compressed number from R and stores its value in D.
1252 Returns true if successful, false if end of file is
1253 reached immediately. */
1255 read_compressed_number (struct sfm_reader *r, double *d)
1257 int opcode = read_opcode (r);
/* For one opcode the number is stored literally in the data stream and
   is read from there. */
1265 *d = read_flt64 (r);
1269 sys_error (r, _("Compressed data is corrupt."));
/* Otherwise the opcode itself encodes the value, offset by the
   compression bias. */
1276 *d = opcode - r->bias;
1283 /* Reads a compressed 8-byte string segment from R and stores it
   in DST.
1285 Returns true if successful, false if end of file is
1286 reached immediately.
   NOTE(review): the case labels that select among the branches
   below are not visible in this listing. */
1288 read_compressed_string (struct sfm_reader *r, char *dst)
1290 switch (read_opcode (r))
/* The segment is stored literally: read its 8 bytes from the file. */
1297 read_bytes (r, dst, 8);
/* The segment is filled with 8 spaces. */
1301 memset (dst, ' ', 8);
1305 sys_error (r, _("Compressed data is corrupt."));
1311 /* Reads LENGTH string bytes from R into S.
1312 LENGTH must be a multiple of 8.
1313 Reads compressed strings if R is compressed.
1314 Returns true if successful, false if end of file is
1315 reached immediately. */
1317 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1319 assert (length % 8 == 0);
/* One path reads all LENGTH bytes with a single raw read. */
1321 return try_read_bytes (r, s, length);
/* The other path decodes the data one 8-byte segment at a time. */
1325 for (ofs = 0; ofs < length; ofs += 8)
1326 if (!read_compressed_string (r, s + ofs))
1336 /* Creates and returns a table that can be used for translating a value
1337 index into a case to a "struct variable *" for DICT. Multiple
1338 system file fields reference variables this way.
1340 This table must be created before processing the very long
1341 string extension record, because that record causes some
1342 values to be deleted from the case and the dictionary to be
   compacted. */
1344 static struct variable **
1345 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1347 struct variable **var_by_value_idx;
/* The table has one slot per value in a case (R's value_cnt) and
   is allocated from R's pool. */
1351 var_by_value_idx = pool_nmalloc (r->pool,
1352 r->value_cnt, sizeof *var_by_value_idx);
1353 for (i = 0; i < dict_get_var_cnt (dict); i++)
1355 struct variable *v = dict_get_var (dict, i);
/* A numeric variable takes one slot; a string variable takes one
   slot per 8 bytes of width, rounded up. */
1356 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* Only a variable's first slot points to it; the other slots of a
   multi-slot string are null. */
1359 var_by_value_idx[value_idx++] = v;
1360 for (j = 1; j < nv; j++)
1361 var_by_value_idx[value_idx++] = NULL;
/* The variables in DICT must account for exactly value_cnt slots. */
1363 assert (value_idx == r->value_cnt);
1365 return var_by_value_idx;
1368 /* Returns the "struct variable" corresponding to the given
1369 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
   is in range, reporting an error through sys_error () if not. */
1371 static struct variable *
1372 lookup_var_by_value_idx (struct sfm_reader *r,
1373 struct variable **var_by_value_idx, int value_idx)
1375 struct variable *var;
/* VALUE_IDX counts from 1, so the valid range is 1...value_cnt. */
1377 if (value_idx < 1 || value_idx > r->value_cnt)
1378 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1379 value_idx, r->value_cnt);
/* Convert to the table's 0-based index. */
1381 var = var_by_value_idx[value_idx - 1];
1383 sys_error (r, _("Variable index %d refers to long string "
1390 /* Returns the variable in D with the given SHORT_NAME,
1391 or a null pointer if there is none.
   Short names are compared without regard to case (strcasecmp). */
1392 static struct variable *
1393 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1395 struct variable *var;
1399 /* First try looking up by full name. This often succeeds. */
1400 var = dict_lookup_var (d, short_name);
/* A match by full name only counts if that variable's short name
   is also SHORT_NAME. */
1401 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1404 /* Iterate through the whole dictionary as a fallback. */
1405 var_cnt = dict_get_var_cnt (d);
1406 for (i = 0; i < var_cnt; i++)
1408 var = dict_get_var (d, i);
1409 if (!strcasecmp (var_get_short_name (var), short_name))
1416 /* Helpers for reading records that contain "variable=value" pairs. */
1420 struct variable_to_value_map
1422 struct substring buffer; /* Record contents, as read from the file. */
1423 size_t pos; /* Current position in buffer: where the next pair is tokenized from. */
1426 /* Reads SIZE bytes into a "variable=value" map for R,
1427 and returns the map.
   Both the map and its buffer are allocated from R's pool. */
1428 static struct variable_to_value_map *
1429 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1431 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated, presumably so that
   read_variable_to_value_map () can store a null terminator just
   past a value that ends at the very end of the record. */
1432 char *buffer = pool_malloc (r->pool, size + 1);
1433 read_bytes (r, buffer, size);
1434 map->buffer = ss_buffer (buffer, size);
1439 /* Closes MAP and frees its storage.
1440 Not really needed, because the pool will free the map anyway,
1441 but can be used to free it earlier.
   Only the record buffer is released here; the map structure
   itself stays in R's pool. */
1443 close_variable_to_value_map (struct sfm_reader *r,
1444 struct variable_to_value_map *map)
/* Return the record buffer to R's pool. */
1446 pool_free (r->pool, ss_data (map->buffer));
1449 /* Reads the next variable=value pair from MAP.
1450 Looks up the variable in DICT and stores it into *VAR.
1451 Stores a null-terminated value into *VALUE.
   The name and value are null-terminated in place inside MAP's
   buffer, so *VALUE points into that buffer.
   A pair naming a variable not in DICT is counted in *WARNING_CNT;
   only the first MAX_WARNINGS such pairs are reported one by one,
   and the rest are summarized when MAP runs out of pairs. */
1453 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1454 struct variable_to_value_map *map,
1455 struct variable **var, char **value,
1458 int max_warnings = 5;
1462 struct substring short_name_ss, value_ss;
/* The name runs up to '='; the value runs up to a tab or null
   byte.  If either is missing, the map has no more pairs. */
1464 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1465 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1468 if (*warning_cnt > max_warnings)
1469 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1470 *warning_cnt - max_warnings);
/* Advance past the tab/null delimiters that follow the value
   (presumably ss_span counts a leading run of them). */
1474 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1475 ss_buffer ("\t\0", 2));
1477 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1478 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Use the same limit as the "Suppressed" summary above, so the
   two counts cannot drift apart. */
1481 if (++*warning_cnt <= max_warnings)
1482 sys_warn (r, _("Variable map refers to unknown variable %s."),
1483 ss_data (short_name_ss));
1487 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1488 *value = ss_data (value_ss);
1496 /* Displays a corruption message.
   The text is FORMAT expanded with ARGS, prefixed by R's file name
   and the current file offset (from ftell) in hexadecimal.
   CLASS determines the message's category and severity. */
1498 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1503 ds_init_empty (&text);
1504 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1505 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1506 ds_put_vformat (&text, format, args);
1508 m.category = msg_class_to_category (class);
1509 m.severity = msg_class_to_severity (class);
/* No source file or line location is attached; the position is
   already part of the text. */
1510 m.where.file_name = NULL;
1511 m.where.line_number = 0;
1512 m.text = ds_cstr (&text);
1517 /* Displays a warning for the current file position.
   FORMAT and the arguments after it are passed on to sys_msg ()
   with message class MW. */
1519 sys_warn (struct sfm_reader *r, const char *format, ...)
1523 va_start (args, format);
1524 sys_msg (r, MW, format, args);
1528 /* Displays an error for the current file position,
1529 marks it as in an error state,
1530 and aborts reading it using longjmp.
   FORMAT and the arguments after it are passed on to sys_msg ()
   with message class ME. */
1532 sys_error (struct sfm_reader *r, const char *format, ...)
1536 va_start (args, format);
1537 sys_msg (r, ME, format, args);
/* Control resumes at the setjmp point for R's bail_out, so this
   function does not return to its caller. */
1541 longjmp (r->bail_out, 1);
1544 /* Reads BYTE_CNT bytes into BUF.
1545 Returns true if exactly BYTE_CNT bytes are successfully read.
1546 Aborts if an I/O error or a partial read occurs.
1547 If EOF_IS_OK, then an immediate end-of-file causes false to be
1548 returned; otherwise, immediate end-of-file causes an abort
   too. */
1551 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1552 void *buf, size_t byte_cnt)
1554 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1555 if (bytes_read == byte_cnt)
/* Short read caused by an I/O error on the stream. */
1557 else if (ferror (r->file))
1558 sys_error (r, _("System error: %s."), strerror (errno));
/* Short read without a stream error.  It is tolerated only when
   the caller allows end of file and no bytes at all were read. */
1559 else if (!eof_is_ok || bytes_read != 0)
1560 sys_error (r, _("Unexpected end of file."));
1565 /* Reads BYTE_CNT bytes into BUF.
1566 Aborts upon I/O error or if end-of-file is encountered.
   The abort happens through sys_error (). */
1568 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* EOF_IS_OK is false: any short read is an error. */
1570 read_bytes_internal (r, false, buf, byte_cnt);
1573 /* Reads BYTE_CNT bytes into BUF.
1574 Returns true if exactly BYTE_CNT bytes are successfully read.
1575 Returns false if an immediate end-of-file is encountered.
1576 Aborts if an I/O error or a partial read occurs. */
1578 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* EOF_IS_OK is true: end of file before the first byte is reported
   to the caller instead of being treated as an error. */
1580 return read_bytes_internal (r, true, buf, byte_cnt);
1583 /* Reads a 32-bit signed integer from R and returns its value in
   host format. */
1586 read_int32 (struct sfm_reader *r)
/* Read the raw bytes (aborting on end of file), then convert them
   from R's integer format. */
1589 read_bytes (r, int32, sizeof int32);
1590 return int32_to_native (r, int32);
1593 /* Reads a 64-bit floating-point number from R and returns its
1594 value in host format.
   Aborts, through read_bytes (), on I/O error or end of file. */
1596 read_flt64 (struct sfm_reader *r)
/* Read the raw bytes, then convert them from R's float format. */
1599 read_bytes (r, flt64, sizeof flt64);
1600 return flt64_to_double (r, flt64);
1603 /* Reads exactly SIZE - 1 bytes into BUFFER
1604 and stores a null byte into BUFFER[SIZE - 1].
   SIZE must be at least 1, because SIZE - 1 is computed in size_t
   arithmetic and BUFFER must hold SIZE bytes. */
1606 read_string (struct sfm_reader *r, char *buffer, size_t size)
1609 read_bytes (r, buffer, size - 1);
1610 buffer[size - 1] = '\0';
1613 /* Skips BYTES bytes forward in R.
   The bytes are read into a scratch buffer and discarded, at most
   sizeof buffer at a time, rather than skipped by seeking. */
1615 skip_bytes (struct sfm_reader *r, size_t bytes)
1620 size_t chunk = MIN (sizeof buffer, bytes);
1621 read_bytes (r, buffer, chunk);
1626 /* Returns the value of the 32-bit signed integer at INT32,
1627 converted from the format used by R to the host format. */
1629 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* Data already in the host's integer format is copied as is. */
1632 if (r->integer_format == INTEGER_NATIVE)
1633 memcpy (&x, int32, sizeof x);
/* Anything else is decoded according to R's integer format. */
1635 x = integer_get (r->integer_format, int32, sizeof x);
1639 /* Returns the value of the 64-bit floating point number at
1640 FLT64, converted from the format used by R to the host
   format. */
1643 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
/* Data already in the host's double format is copied as is. */
1646 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1647 memcpy (&x, flt64, sizeof x);
/* Anything else is converted from R's float format to the host's. */
1649 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);