1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include "sys-file-reader.h"
22 #include "sys-file-private.h"
30 #include <libpspp/alloc.h>
31 #include <libpspp/assertion.h>
32 #include <libpspp/message.h>
33 #include <libpspp/compiler.h>
34 #include <libpspp/magic.h>
35 #include <libpspp/misc.h>
36 #include <libpspp/pool.h>
37 #include <libpspp/str.h>
38 #include <libpspp/hash.h>
39 #include <libpspp/array.h>
42 #include "dictionary.h"
43 #include "file-handle-def.h"
44 #include "file-name.h"
46 #include "missing-values.h"
47 #include "value-labels.h"
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
74 enum integer_format integer_format; /* On-disk integer format. */
75 enum float_format float_format; /* On-disk floating point format. */
76 int value_cnt; /* Number of 8-byte units per case. */
77 struct sfm_var *vars; /* Variables. */
78 size_t var_cnt; /* Number of variables. */
79 bool has_vls; /* File has one or more very long strings? */
82 bool compressed; /* File is compressed? */
83 double bias; /* Compression bias, usually 100.0. */
84 uint8_t opcodes[8]; /* Current block of opcodes. */
85 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
88 /* A variable in a system file. */
91 int width; /* 0=numeric, otherwise string width. */
92 int case_index; /* Index into case. */
95 static struct variable **make_var_by_value_idx (struct sfm_reader *,
97 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
101 static void sys_warn (struct sfm_reader *, const char *, ...)
102 PRINTF_FORMAT (2, 3);
104 static void sys_error (struct sfm_reader *, const char *, ...)
108 static void read_bytes (struct sfm_reader *, void *, size_t);
109 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
110 static int32_t read_int32 (struct sfm_reader *);
111 static double read_flt64 (struct sfm_reader *);
112 static void read_string (struct sfm_reader *, char *, size_t);
113 static void skip_bytes (struct sfm_reader *, size_t);
115 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
116 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
118 static struct variable_to_value_map *open_variable_to_value_map (
119 struct sfm_reader *, size_t size);
120 static void close_variable_to_value_map (struct sfm_reader *r,
121 struct variable_to_value_map *);
122 static bool read_variable_to_value_map (struct sfm_reader *,
124 struct variable_to_value_map *,
125 struct variable **var, char **value,
128 /* Dictionary reader. */
136 static void read_header (struct sfm_reader *, struct dictionary *,
137 int *weight_idx, int *claimed_value_cnt,
138 struct sfm_read_info *);
139 static void read_variable_record (struct sfm_reader *, struct dictionary *,
140 int *format_warning_cnt);
141 static void parse_format_spec (struct sfm_reader *, uint32_t,
142 enum which_format, struct variable *,
143 int *format_warning_cnt);
144 static void setup_weight (struct sfm_reader *, int weight_idx,
145 struct variable **var_by_value_idx,
146 struct dictionary *);
147 static void read_documents (struct sfm_reader *, struct dictionary *);
148 static void read_value_labels (struct sfm_reader *, struct dictionary *,
149 struct variable **var_by_value_idx);
151 static void read_extension_record (struct sfm_reader *, struct dictionary *);
152 static void read_machine_int32_info (struct sfm_reader *,
153 size_t size, size_t count);
154 static void read_machine_flt64_info (struct sfm_reader *,
155 size_t size, size_t count);
156 static void read_display_parameters (struct sfm_reader *,
157 size_t size, size_t count,
158 struct dictionary *);
159 static void read_long_var_name_map (struct sfm_reader *,
160 size_t size, size_t count,
161 struct dictionary *);
162 static void read_long_string_map (struct sfm_reader *,
163 size_t size, size_t count,
164 struct dictionary *);
167 /* Opens the system file designated by file handle FH for
168 reading. Reads the system file's dictionary into *DICT.
169 If INFO is non-null, then it receives additional info about the
172 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
173 struct sfm_read_info *info)
175 struct sfm_reader *volatile r = NULL;
176 struct variable **var_by_value_idx;
177 int format_warning_cnt = 0;
179 int claimed_value_cnt;
183 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
186 *dict = dict_create ();
188 /* Create and initialize reader. */
189 r = pool_create_container (struct sfm_reader, pool);
191 r->file = fn_open (fh_get_file_name (fh), "rb");
195 r->opcode_idx = sizeof r->opcodes;
197 if (setjmp (r->bail_out))
199 sfm_close_reader (r);
200 dict_destroy (*dict);
207 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
208 fh_get_file_name (r->fh), strerror (errno));
209 longjmp (r->bail_out, 1);
213 read_header (r, *dict, &weight_idx, &claimed_value_cnt, info);
215 /* Read all the variable definition records. */
216 rec_type = read_int32 (r);
217 while (rec_type == 2)
219 read_variable_record (r, *dict, &format_warning_cnt);
220 rec_type = read_int32 (r);
223 /* Figure out the case format. */
224 var_by_value_idx = make_var_by_value_idx (r, *dict);
225 setup_weight (r, weight_idx, var_by_value_idx, *dict);
227 /* Read all the rest of the dictionary records. */
228 while (rec_type != 999)
233 read_value_labels (r, *dict, var_by_value_idx);
237 sys_error (r, _("Misplaced type 4 record."));
240 read_documents (r, *dict);
244 read_extension_record (r, *dict);
248 sys_error (r, _("Unrecognized record type %d."), rec_type);
250 rec_type = read_int32 (r);
253 /* Read record 999 data, which is just filler. */
256 if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt)
257 sys_warn (r, _("File header claims %d variable positions but "
258 "%d were read from file."),
259 claimed_value_cnt, r->value_cnt);
261 /* Create an index of dictionary variable widths for
262 sfm_read_case to use. We cannot use the `struct variable's
263 from the dictionary we created, because the caller owns the
264 dictionary and may destroy or modify its variables. */
265 r->var_cnt = dict_get_var_cnt (*dict);
266 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
267 for (i = 0; i < r->var_cnt; i++)
269 struct variable *v = dict_get_var (*dict, i);
270 struct sfm_var *sv = &r->vars[i];
271 sv->width = var_get_width (v);
272 sv->case_index = var_get_case_index (v);
275 pool_free (r->pool, var_by_value_idx);
279 /* Closes a system file after we're done with it. */
281 sfm_close_reader (struct sfm_reader *r)
288 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
289 msg (ME, _("Error closing system file \"%s\": %s."),
290 fh_get_file_name (r->fh), strerror (errno));
295 fh_close (r->fh, "system file", "rs");
297 pool_destroy (r->pool);
300 /* Returns true if an I/O error has occurred on READER, false
303 sfm_read_error (const struct sfm_reader *reader)
305 return reader->error;
/* Returns true if FILE looks like an SPSS system file, that is,
   if the next four bytes read from FILE are the "$FL2"
   signature.  Returns false otherwise, including when four
   bytes cannot be read.  The bytes examined are consumed from
   FILE. */
bool
sfm_detect (FILE *file)
{
  static const char signature[4] = {'$', 'F', 'L', '2'};
  char magic[sizeof signature];

  if (fread (magic, sizeof magic, 1, file) != 1)
    return false;

  return memcmp (magic, signature, sizeof signature) == 0;
}
322 /* Reads the global header of the system file.
323 Sets DICT's file label to the system file's label.
324 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
325 or to the value index of the weight variable otherwise.
326 Sets *CLAIMED_VALUE_CNT to the number of values that the file
327 claims to have (although it is not always correct).
328 If INFO is non-null, initializes *INFO with header
331 read_header (struct sfm_reader *r, struct dictionary *dict,
332 int *weight_idx, int *claimed_value_cnt,
333 struct sfm_read_info *info)
336 char eye_catcher[61];
337 uint8_t raw_layout_code[4];
340 char creation_date[10];
341 char creation_time[9];
343 struct substring file_label_ss;
345 read_string (r, rec_type, sizeof rec_type);
346 read_string (r, eye_catcher, sizeof eye_catcher);
348 if (strcmp ("$FL2", rec_type) != 0)
349 sys_error (r, _("This is not an SPSS system file."));
351 /* Identify integer format. */
352 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
353 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
355 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
357 || (r->integer_format != INTEGER_MSB_FIRST
358 && r->integer_format != INTEGER_LSB_FIRST))
359 sys_error (r, _("This is not an SPSS system file."));
361 *claimed_value_cnt = read_int32 (r);
362 if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16)
363 *claimed_value_cnt = -1;
365 r->compressed = read_int32 (r) != 0;
367 *weight_idx = read_int32 (r);
369 case_cnt = read_int32 (r);
370 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
373 /* Identify floating-point format and obtain compression bias. */
374 read_bytes (r, raw_bias, sizeof raw_bias);
375 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
377 sys_warn (r, _("Compression bias (%g) is not the usual "
378 "value of 100, or system file uses unrecognized "
379 "floating-point format."),
381 if (r->integer_format == INTEGER_MSB_FIRST)
382 r->float_format = FLOAT_IEEE_DOUBLE_BE;
384 r->float_format = FLOAT_IEEE_DOUBLE_LE;
386 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
388 read_string (r, creation_date, sizeof creation_date);
389 read_string (r, creation_time, sizeof creation_time);
390 read_string (r, file_label, sizeof file_label);
393 file_label_ss = ss_cstr (file_label);
394 ss_trim (&file_label_ss, ss_cstr (" "));
395 if (!ss_is_empty (file_label_ss))
397 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
398 dict_set_label (dict, ss_data (file_label_ss));
403 struct substring product;
405 strcpy (info->creation_date, creation_date);
406 strcpy (info->creation_time, creation_time);
407 info->integer_format = r->integer_format;
408 info->float_format = r->float_format;
409 info->compressed = r->compressed;
410 info->case_cnt = case_cnt;
412 product = ss_cstr (eye_catcher);
413 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
414 ss_trim (&product, ss_cstr (" "));
415 str_copy_buf_trunc (info->product, sizeof info->product,
416 ss_data (product), ss_length (product));
420 /* Reads a variable (type 2) record from R and adds the
421 corresponding variable to DICT.
422 Also skips past additional variable records for long string
425 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
426 int *format_warning_cnt)
429 int has_variable_label;
430 int missing_value_code;
435 struct variable *var;
438 width = read_int32 (r);
439 has_variable_label = read_int32 (r);
440 missing_value_code = read_int32 (r);
441 print_format = read_int32 (r);
442 write_format = read_int32 (r);
443 read_string (r, name, sizeof name);
444 name[strcspn (name, " ")] = '\0';
446 /* Check variable name. */
447 if (name[0] == '$' || name[0] == '#')
448 sys_error (r, "Variable name begins with invalid character `%c'.",
450 if (!var_is_plausible_name (name, false))
451 sys_error (r, _("Invalid variable name `%s'."), name);
453 /* Create variable. */
454 if (width < 0 || width > 255)
455 sys_error (r, _("Bad variable width %d."), width);
456 var = dict_create_var (dict, name, width);
459 _("Duplicate variable name `%s' within system file."),
462 /* Set the short name the same as the long name */
463 var_set_short_name (var, var_get_name (var));
465 /* Get variable label, if any. */
466 if (has_variable_label != 0 && has_variable_label != 1)
467 sys_error (r, _("Variable label indicator field is not 0 or 1."));
468 if (has_variable_label == 1)
473 len = read_int32 (r);
474 if (len >= sizeof label)
475 sys_error (r, _("Variable %s has label of invalid length %u."),
476 name, (unsigned int) len);
477 read_string (r, label, len + 1);
478 var_set_label (var, label);
480 skip_bytes (r, ROUND_UP (len, 4) - len);
483 /* Set missing values. */
484 if (missing_value_code < -3 || missing_value_code > 3
485 || missing_value_code == -1)
486 sys_error (r, _("Missing value indicator field is not "
487 "-3, -2, 0, 1, 2, or 3."));
488 if (missing_value_code != 0)
490 struct missing_values mv;
491 mv_init (&mv, var_get_width (var));
492 if (var_is_numeric (var))
494 if (missing_value_code > 0)
497 for (i = 0; i < missing_value_code; i++)
498 mv_add_num (&mv, read_flt64 (r));
502 double low = read_flt64 (r);
503 double high = read_flt64 (r);
504 mv_add_num_range (&mv, low, high);
505 if (missing_value_code == -3)
506 mv_add_num (&mv, read_flt64 (r));
509 else if (var_get_width (var) <= MAX_SHORT_STRING)
511 if (missing_value_code > 0)
514 for (i = 0; i < missing_value_code; i++)
517 read_string (r, string, sizeof string);
518 mv_add_str (&mv, string);
522 sys_error (r, _("String variable %s may not have missing "
523 "values specified as a range."),
526 else /* var->width > MAX_SHORT_STRING */
527 sys_error (r, _("Long string variable %s may not have missing "
530 var_set_missing_values (var, &mv);
534 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
535 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
537 /* Account for values.
538 Skip long string continuation records, if any. */
539 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
545 for (i = 1; i < nv; i++)
547 /* Check for record type 2 and width -1. */
548 if (read_int32 (r) != 2 || read_int32 (r) != -1)
549 sys_error (r, _("Missing string continuation record."));
551 /* Skip and ignore remaining continuation data. */
552 has_variable_label = read_int32 (r);
553 missing_value_code = read_int32 (r);
554 print_format = read_int32 (r);
555 write_format = read_int32 (r);
556 read_string (r, name, sizeof name);
558 /* Variable label fields on continuation records have
559 been spotted in system files created by "SPSS Power
560 Macintosh Release 6.1". */
561 if (has_variable_label)
562 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
567 /* Translates the format spec from sysfile format to internal
570 parse_format_spec (struct sfm_reader *r, uint32_t s,
571 enum which_format which, struct variable *v,
572 int *format_warning_cnt)
574 const int max_format_warnings = 8;
576 uint8_t raw_type = s >> 16;
582 if (!fmt_from_io (raw_type, &f.type))
583 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
588 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
593 if (which == PRINT_FORMAT)
594 var_set_print_format (v, &f);
596 var_set_write_format (v, &f);
598 else if (*++format_warning_cnt <= max_format_warnings)
600 char fmt_string[FMT_STRING_LEN_MAX + 1];
601 sys_warn (r, _("%s variable %s has invalid %s format %s."),
602 var_is_numeric (v) ? _("Numeric") : _("String"),
604 which == PRINT_FORMAT ? _("print") : _("write"),
605 fmt_to_string (&f, fmt_string));
607 if (*format_warning_cnt == max_format_warnings)
608 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable at 1-based value index WEIGHT_IDX the
   weighting variable of DICT.  A WEIGHT_IDX of zero means the
   system file is unweighted, in which case DICT is left
   untouched.  Reports an error through sys_error if the
   designated variable is not numeric. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *candidate;

  if (weight_idx == 0)
    return;

  candidate = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (var_is_numeric (candidate))
    dict_set_weight (dict, candidate);
  else
    sys_error (r, _("Weighting variable must be numeric."));
}
630 /* Reads a document record, type 6, from system file R, and sets up
631 the documents and n_documents fields in the associated
634 read_documents (struct sfm_reader *r, struct dictionary *dict)
639 if (dict_get_documents (dict) != NULL)
640 sys_error (r, _("Multiple type 6 (document) records."));
642 line_cnt = read_int32 (r);
644 sys_error (r, _("Number of document lines (%d) "
645 "must be greater than 0."), line_cnt);
647 documents = pool_nmalloc (r->pool, line_cnt + 1, 80);
648 read_string (r, documents, 80 * line_cnt + 1);
649 dict_set_documents (dict, documents);
650 pool_free (r->pool, documents);
653 /* Read a type 7 extension record. */
655 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
657 int subtype = read_int32 (r);
658 size_t size = read_int32 (r);
659 size_t count = read_int32 (r);
660 size_t bytes = size * count;
662 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
663 allows an extra byte for a null terminator, used by some
664 extension processing routines. */
665 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
666 sys_error (r, "Record type 7 subtype %d too large.", subtype);
671 read_machine_int32_info (r, size, count);
675 read_machine_flt64_info (r, size, count);
679 /* Variable sets information. We don't use these yet.
680 They only apply to GUIs; see VARSETS on the APPLY
681 DICTIONARY command in SPSS documentation. */
685 /* DATE variable information. We don't use it yet, but we
690 /* Unknown purpose. */
694 read_display_parameters (r, size, count, dict);
698 read_long_var_name_map (r, size, count, dict);
702 read_long_string_map (r, size, count, dict);
706 /* New in SPSS v14? Unknown purpose. */
710 /* Text field that defines variable attributes. New in
715 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
719 skip_bytes (r, bytes);
722 /* Read record type 7, subtype 3. */
724 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
726 int version_major UNUSED = read_int32 (r);
727 int version_minor UNUSED = read_int32 (r);
728 int version_revision UNUSED = read_int32 (r);
729 int machine_code UNUSED = read_int32 (r);
730 int float_representation = read_int32 (r);
731 int compression_code UNUSED = read_int32 (r);
732 int integer_representation = read_int32 (r);
733 int character_code UNUSED = read_int32 (r);
735 int expected_float_format;
736 int expected_integer_format;
738 if (size != 4 || count != 8)
739 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
741 (unsigned int) size, (unsigned int) count);
743 /* Check floating point format. */
744 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
745 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
746 expected_float_format = 1;
747 else if (r->float_format == FLOAT_Z_LONG)
748 expected_float_format = 2;
749 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
750 expected_float_format = 3;
753 if (float_representation != expected_float_format)
754 sys_error (r, _("Floating-point representation indicated by "
755 "system file (%d) differs from expected (%d)."),
756 r->float_format, expected_float_format);
758 /* Check integer format. */
759 if (r->integer_format == INTEGER_MSB_FIRST)
760 expected_integer_format = 1;
761 else if (r->integer_format == INTEGER_LSB_FIRST)
762 expected_integer_format = 2;
765 if (integer_representation != expected_integer_format)
767 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
768 sys_warn (r, _("Integer format indicated by system file (%s) "
769 "differs from expected (%s)."),
770 gettext (endian[integer_representation == 1]),
771 gettext (endian[expected_integer_format == 1]));
775 /* Read record type 7, subtype 4. */
777 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
779 double sysmis = read_flt64 (r);
780 double highest = read_flt64 (r);
781 double lowest = read_flt64 (r);
783 if (size != 8 || count != 3)
784 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
785 (unsigned int) size, (unsigned int) count);
787 if (sysmis != SYSMIS)
788 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
789 if (highest != HIGHEST)
790 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
791 if (lowest != LOWEST)
792 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
795 /* Read record type 7, subtype 11, which specifies how variables
796 should be displayed in GUI environments. */
798 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
799 struct dictionary *dict)
801 const size_t n_vars = count / 3 ;
805 if (count % 3 || n_vars != dict_get_var_cnt (dict))
806 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
807 (unsigned int) size, (unsigned int) count);
809 for (i = 0; i < n_vars; ++i)
811 int measure = read_int32 (r);
812 int width = read_int32 (r);
813 int align = read_int32 (r);
814 struct variable *v = dict_get_var (dict, i);
816 /* spss v14 sometimes seems to set string variables' measure to zero */
817 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
820 if (measure < 1 || measure > 3 || align < 0 || align > 2)
823 sys_warn (r, _("Invalid variable display parameters. "
824 "Default parameters substituted."));
829 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
830 : measure == 2 ? MEASURE_ORDINAL
832 var_set_display_width (v, width);
833 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
834 : align == 1 ? ALIGN_RIGHT
839 /* Reads record type 7, subtype 13, which gives the long name
840 that corresponds to each short name. Modifies variable names
841 in DICT accordingly. */
843 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
844 struct dictionary *dict)
846 struct variable_to_value_map *map;
847 struct variable *var;
851 map = open_variable_to_value_map (r, size * count);
852 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
855 char short_name[SHORT_NAME_LEN + 1];
856 strcpy (short_name, var_get_short_name (var));
858 /* Validate long name. */
859 if (!var_is_valid_name (long_name, false))
861 sys_warn (r, _("Long variable mapping from %s to invalid "
862 "variable name `%s'."),
863 var_get_name (var), long_name);
867 /* Identify any duplicates. */
868 if (strcasecmp (short_name, long_name)
869 && dict_lookup_var (dict, long_name) != NULL)
871 sys_warn (r, _("Duplicate long variable name `%s' "
872 "within system file."), long_name);
876 /* Set long name. Renaming a variable may clear the short
877 name, but we want to retain it, so re-set it
879 dict_rename_var (dict, var, long_name);
880 var_set_short_name (var, short_name);
882 close_variable_to_value_map (r, map);
885 /* Reads record type 7, subtype 14, which gives the real length
886 of each very long string. Rearranges DICT accordingly. */
888 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
889 struct dictionary *dict)
891 struct variable_to_value_map *map;
892 struct variable *var;
898 map = open_variable_to_value_map (r, size * count);
899 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
902 long length, remaining_length;
906 length = strtol (length_s, NULL, 10);
907 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
909 sys_warn (r, _("%s listed as string of length %s "
911 var_get_name (var), length_s);
915 /* Group multiple variables into single variable
916 and delete all but the first. */
917 remaining_length = length;
918 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
919 if (idx < dict_get_var_cnt (dict))
920 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
921 EFFECTIVE_LONG_STRING_LENGTH);
923 sys_error (r, _("Very long string %s overflows dictionary."),
925 dict_delete_consecutive_vars (dict,
926 var_get_dict_index (var) + 1,
927 idx - var_get_dict_index (var) - 1);
929 /* Assign all the length to the first variable. */
930 var_set_width (var, length);
932 close_variable_to_value_map (r, map);
933 dict_compact_values (dict);
936 /* Reads value labels from sysfile H and inserts them into the
937 associated dictionary. */
939 read_value_labels (struct sfm_reader *r,
940 struct dictionary *dict, struct variable **var_by_value_idx)
942 struct pool *subpool;
946 char raw_value[8]; /* Value as uninterpreted bytes. */
947 union value value; /* Value. */
948 char *label; /* Null-terminated label string. */
951 struct label *labels = NULL;
952 int label_cnt; /* Number of labels. */
954 struct variable **var = NULL; /* Associated variables. */
955 int var_cnt; /* Number of associated variables. */
959 subpool = pool_create_subpool (r->pool);
961 /* Read the type 3 record and record its contents. We can't do
962 much with the data yet because we don't know whether it is
963 of numeric or string type. */
965 /* Read number of labels. */
966 label_cnt = read_int32 (r);
968 if (label_cnt >= INT32_MAX / sizeof *labels)
970 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
975 /* Read each value/label tuple into labels[]. */
976 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
977 for (i = 0; i < label_cnt; i++)
979 struct label *label = labels + i;
980 unsigned char label_len;
984 read_bytes (r, label->raw_value, sizeof label->raw_value);
986 /* Read label length. */
987 read_bytes (r, &label_len, sizeof label_len);
988 padded_len = ROUND_UP (label_len + 1, 8);
990 /* Read label, padding. */
991 label->label = pool_alloc (subpool, padded_len + 1);
992 read_bytes (r, label->label, padded_len - 1);
993 label->label[label_len] = 0;
996 /* Now, read the type 4 record that has the list of variables
997 to which the value labels are to be applied. */
999 /* Read record type of type 4 record. */
1000 if (read_int32 (r) != 4)
1001 sys_error (r, _("Variable index record (type 4) does not immediately "
1002 "follow value label record (type 3) as it should."));
1004 /* Read number of variables associated with value label from type 4
1006 var_cnt = read_int32 (r);
1007 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1008 sys_error (r, _("Number of variables associated with a value label (%d) "
1009 "is not between 1 and the number of variables (%u)."),
1010 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1012 /* Read the list of variables. */
1013 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1014 for (i = 0; i < var_cnt; i++)
1016 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1017 if (var_is_long_string (var[i]))
1018 sys_error (r, _("Value labels are not allowed on long string "
1019 "variables (%s)."), var_get_name (var[i]));
1022 /* Type check the variables. */
1023 for (i = 1; i < var_cnt; i++)
1024 if (var_get_type (var[i]) != var_get_type (var[0]))
1025 sys_error (r, _("Variables associated with value label are not all of "
1026 "identical type. Variable %s is %s, but variable "
1028 var_get_name (var[0]),
1029 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1030 var_get_name (var[i]),
1031 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1033 /* Fill in labels[].value, now that we know the desired type. */
1034 for (i = 0; i < label_cnt; i++)
1036 struct label *label = labels + i;
1038 if (var_is_alpha (var[0]))
1039 buf_copy_rpad (label->value.s, sizeof label->value.s,
1040 label->raw_value, sizeof label->raw_value);
1042 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1045 /* Assign the `value_label's to each variable. */
1046 for (i = 0; i < var_cnt; i++)
1048 struct variable *v = var[i];
1051 /* Add each label to the variable. */
1052 for (j = 0; j < label_cnt; j++)
1054 struct label *label = &labels[j];
1055 if (!var_add_value_label (v, &label->value, label->label))
1057 if (var_is_numeric (var[0]))
1058 sys_warn (r, _("Duplicate value label for %g on %s."),
1059 label->value.f, var_get_name (v));
1061 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1062 var_get_width (v), label->value.s,
1068 pool_destroy (subpool);
1073 static void partial_record (struct sfm_reader *r)
1075 static bool read_case_number (struct sfm_reader *, double *);
1076 static bool read_case_string (struct sfm_reader *, char *, size_t);
1077 static int read_opcode (struct sfm_reader *);
1078 static bool read_compressed_number (struct sfm_reader *, double *);
1079 static bool read_compressed_string (struct sfm_reader *, char *);
1080 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1082 /* Reads one case from READER's file into C. Returns nonzero
1083 only if successful. */
1085 sfm_read_case (struct sfm_reader *r, struct ccase *c)
1090 if (setjmp (r->bail_out))
1093 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1095 /* Fast path. Read the whole case directly. */
1096 if (!try_read_bytes (r, case_data_all_rw (c),
1097 sizeof (union value) * r->value_cnt))
1100 /* Convert floating point numbers to native format if needed. */
1101 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1105 for (i = 0; i < r->var_cnt; i++)
1106 if (r->vars[i].width == 0)
1108 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1109 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1116 /* Slow path. Convert from external to internal format. */
1119 for (i = 0; i < r->var_cnt; i++)
1121 struct sfm_var *sv = &r->vars[i];
1122 union value *v = case_data_rw_idx (c, sv->case_index);
1126 if (!read_case_number (r, &v->f))
1131 /* Read the string data in segments up to 255 bytes
1132 at a time, packed into 8-byte units. */
1133 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1134 int ofs, chunk_size;
1135 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1137 chunk_size = MIN (max_chunk, sv->width - ofs);
1138 if (!read_case_string (r, v->s + ofs, chunk_size))
1146 /* Very long strings have trailing wasted space
1147 that we must skip. */
1148 if (sv->width >= MIN_VERY_LONG_STRING)
1150 int bytes_read = (sv->width / max_chunk * 256
1151 + ROUND_UP (sv->width % max_chunk, 8));
1152 int total_bytes = sfm_width_to_bytes (sv->width);
1153 int excess_bytes = total_bytes - bytes_read;
1155 while (excess_bytes > 0)
1158 size_t chunk = MIN (sizeof buffer, excess_bytes);
1159 if (!read_whole_strings (r, buffer, chunk))
1161 excess_bytes -= chunk;
/* Reports, through sys_error, that the data in R stops in the
   middle of a case. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1182 /* Reads a number from R and stores its value in *D.
1183 If R is compressed, reads a compressed number;
1184 otherwise, reads a number in the regular way.
1185 Returns true if successful, false if end of file is
1186 reached immediately. */
1188 read_case_number (struct sfm_reader *r, double *d)
1193 if (!try_read_bytes (r, flt64, sizeof flt64))
1195 *d = flt64_to_double (r, flt64);
1199 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.
   Data is always consumed in 8-byte units: if LENGTH is not a
   multiple of 8, the final unit is read into a scratch buffer
   and only its leading LENGTH % 8 bytes are copied into S.
   Reads compressed strings if R is compressed.
   Returns true if successful, false if end of file is reached
   immediately.  End of file after part of the string has
   already been read is reported through partial_record. */
static bool
read_case_string (struct sfm_reader *r, char *s, size_t length)
{
  const size_t tail = length % 8;
  const size_t body = length - tail;

  if (body != 0 && !read_whole_strings (r, s, body))
    return false;

  if (tail != 0)
    {
      char last_unit[8];

      if (!read_whole_strings (r, last_unit, sizeof last_unit))
        {
          if (body != 0)
            partial_record (r);
          return false;
        }
      memcpy (s + body, last_unit, tail);
    }

  return true;
}
1236 /* Reads and returns the next compression opcode from R. */
1238 read_opcode (struct sfm_reader *r)
/* This function must only be called for a compressed file. */
1240 assert (r->compressed);
/* Once the cursor has moved past the last byte of r->opcodes, fetch
   a fresh group of opcode bytes from the file.  try_read_bytes yields
   false on an immediate end of file. */
1244 if (r->opcode_idx >= sizeof r->opcodes)
1246 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
/* Take the opcode under the cursor and advance the cursor. */
1250 opcode = r->opcodes[r->opcode_idx++];
1257 /* Reads a compressed number from R and stores its value in D.
1258 Returns true if successful, false if end of file is
1259 reached immediately. */
1261 read_compressed_number (struct sfm_reader *r, double *d)
1263 int opcode = read_opcode (r);
/* NOTE(review): the labels that select among the statements below
   are not visible in this excerpt. */
/* The number is stored in the file as a literal 64-bit float. */
1271 *d = read_flt64 (r);
/* Report corruption; sys_error leaves through longjmp. */
1275 sys_error (r, _("Compressed data is corrupt."));
/* The opcode itself encodes the number, offset by r->bias. */
1282 *d = opcode - r->bias;
1289 /* Reads a compressed 8-byte string segment from R and stores it
1291 in DST.  Returns true if successful, false if end of file is
1292 reached immediately. */
1294 read_compressed_string (struct sfm_reader *r, char *dst)
/* Dispatch on the next opcode.  NOTE(review): the case labels are
   not visible in this excerpt. */
1296 switch (read_opcode (r))
/* The 8 bytes are taken literally from the file. */
1303 read_bytes (r, dst, 8);
/* The segment is filled with 8 spaces without reading any data. */
1307 memset (dst, ' ', 8);
/* Report corruption; sys_error leaves through longjmp. */
1311 sys_error (r, _("Compressed data is corrupt."));
1317 /* Reads LENGTH string bytes from R into S.
1318 LENGTH must be a multiple of 8.
1319 Reads compressed strings if R is compressed.
1320 Returns true if successful, false if end of file is
1321 reached immediately. */
1323 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
/* Callers must pass a multiple of 8. */
1325 assert (length % 8 == 0);
/* Direct read of LENGTH bytes.  NOTE(review): the condition guarding
   this path is not visible in this excerpt; presumably it is the
   uncompressed case -- confirm. */
1327 return try_read_bytes (r, s, length);
/* Otherwise decode one 8-byte segment at a time into consecutive
   positions of S. */
1331 for (ofs = 0; ofs < length; ofs += 8)
1332 if (!read_compressed_string (r, s + ofs))
1342 /* Creates and returns a table that can be used for translating a value
1343 index into a case to a "struct variable *" for DICT. Multiple
1344 system file fields reference variables this way.
1346 This table must be created before processing the very long
1347 string extension record, because that record causes some
1348 values to be deleted from the case and the dictionary to be compacted. */
1350 static struct variable **
1351 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1353 struct variable **var_by_value_idx;
/* The table has r->value_cnt slots and is allocated from the
   reader's pool. */
1357 var_by_value_idx = pool_nmalloc (r->pool,
1358 r->value_cnt, sizeof *var_by_value_idx);
1359 for (i = 0; i < dict_get_var_cnt (dict); i++)
1361 struct variable *v = dict_get_var (dict, i);
/* A numeric variable occupies one slot; a string occupies its width
   divided by 8 (presumably rounded up) slots. */
1362 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* The first slot points at the variable itself; every further slot
   of the same variable is set to NULL. */
1365 var_by_value_idx[value_idx++] = v;
1366 for (j = 1; j < nv; j++)
1367 var_by_value_idx[value_idx++] = NULL;
/* The variables must account for exactly r->value_cnt slots. */
1369 assert (value_idx == r->value_cnt);
1371 return var_by_value_idx;
1374 /* Returns the "struct variable" corresponding to the given
1375 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index is in range and does not refer to a long string continuation. */
1377 static struct variable *
1378 lookup_var_by_value_idx (struct sfm_reader *r,
1379 struct variable **var_by_value_idx, int value_idx)
1381 struct variable *var;
/* VALUE_IDX counts from 1, so anything outside 1 through
   r->value_cnt is rejected.  sys_error leaves through longjmp. */
1383 if (value_idx < 1 || value_idx > r->value_cnt)
1384 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1385 value_idx, r->value_cnt);
/* Convert to a 0-based table index. */
1387 var = var_by_value_idx[value_idx - 1];
/* Error for an index that refers to a long string continuation.
   NOTE(review): the guarding condition and the remainder of this
   call are not visible in this excerpt. */
1389 sys_error (r, _("Variable index %d refers to long string "
1396 /* Returns the variable in D with the given SHORT_NAME,
1397 or a null pointer if there is none. */
1398 static struct variable *
1399 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1401 struct variable *var;
1405 /* First try looking up by full name. This often succeeds. */
1406 var = dict_lookup_var (d, short_name);
/* The hit only counts if that variable's short name also matches
   SHORT_NAME, compared without regard to case. */
1407 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1410 /* Iterate through the whole dictionary as a fallback. */
1411 var_cnt = dict_get_var_cnt (d);
1412 for (i = 0; i < var_cnt; i++)
1414 var = dict_get_var (d, i);
/* Same case-insensitive comparison, applied to each variable in
   dictionary order. */
1415 if (!strcasecmp (var_get_short_name (var), short_name))
1422 /* Helpers for reading records that contain "variable=value" pairs. */
1426 struct variable_to_value_map
/* Parsing state for one record: the record text plus a cursor into
   it that read_variable_to_value_map advances. */
1428 struct substring buffer; /* Record contents. */
1429 size_t pos; /* Current position in buffer. */
1432 /* Reads SIZE bytes into a "variable=value" map for R,
1433 and returns the map. */
1434 static struct variable_to_value_map *
1435 open_variable_to_value_map (struct sfm_reader *r, size_t size)
/* Both the map and its buffer are allocated from the reader's
   pool. */
1437 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated although only SIZE bytes are
   read.  Presumably the spare byte leaves room for the null
   terminator that read_variable_to_value_map stores just past a
   token -- confirm. */
1438 char *buffer = pool_malloc (r->pool, size + 1);
1439 read_bytes (r, buffer, size);
/* The substring covers exactly the SIZE bytes that were read. */
1440 map->buffer = ss_buffer (buffer, size);
1445 /* Closes MAP and frees its storage.
1446 Not really needed, because the pool will free the map anyway,
1447 but can be used to free it earlier. */
1449 close_variable_to_value_map (struct sfm_reader *r,
1450 struct variable_to_value_map *map)
/* Gives the record buffer back to the reader's pool. */
1452 pool_free (r->pool, ss_data (map->buffer));
1455 /* Reads the next variable=value pair from MAP.
1456 Looks up the variable in DICT and stores it into *VAR.
1457 Stores a null-terminated value into *VALUE. */
1459 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1460 struct variable_to_value_map *map,
1461 struct variable **var, char **value,
/* Threshold used by the summary warning below.
   NOTE(review): the per-warning test further down compares against
   the literal 5 instead of this variable, so the two limits agree
   only while both are 5. */
1464 int max_warnings = 5;
1468 struct substring short_name_ss, value_ss;
/* Take the text up to '=' as the short name, then the text up to a
   tab or null byte as the value.  Both calls move map->pos. */
1470 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1471 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
/* When more warnings were counted than the threshold, report how
   many went beyond it. */
1474 if (*warning_cnt > max_warnings)
1475 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1476 *warning_cnt - max_warnings);
/* Move the cursor past any tab or null bytes that follow
   (presumably ss_span measures the leading run of such bytes). */
1480 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1481 ss_buffer ("\t\0", 2));
/* Null-terminate the name in place so it can be passed on as a C
   string. */
1483 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1484 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Every occurrence is counted, but only the first 5 produce a
   warning of their own. */
1487 if (++*warning_cnt <= 5)
1488 sys_warn (r, _("Variable map refers to unknown variable %s."),
1489 ss_data (short_name_ss));
/* Null-terminate the value in place; *VALUE points into the map's
   buffer rather than at a copy. */
1493 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1494 *value = ss_data (value_ss);
1502 /* Displays a corruption message. */
1504 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
/* Start the text with the file name and the stream's current
   offset.  NOTE(review): ftell's result is cast without a check, so
   a failure value of -1 would be printed as a very large offset. */
1509 ds_init_empty (&text);
1510 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1511 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
/* Append the caller's message, formatted with ARGS. */
1512 ds_put_vformat (&text, format, args);
/* Category and severity are derived from CLASS; the location fields
   are left empty. */
1514 m.category = msg_class_to_category (class);
1515 m.severity = msg_class_to_severity (class);
1516 m.where.file_name = NULL;
1517 m.where.line_number = 0;
1518 m.text = ds_cstr (&text);
1523 /* Displays a warning for the current file position. */
1525 sys_warn (struct sfm_reader *r, const char *format, ...)
/* Collect the variable arguments and pass them to sys_msg with
   message class MW. */
1529 va_start (args, format);
1530 sys_msg (r, MW, format, args);
1534 /* Displays an error for the current file position,
1535 marks it as in an error state,
1536 and aborts reading it using longjmp. */
1538 sys_error (struct sfm_reader *r, const char *format, ...)
/* Collect the variable arguments and pass them to sys_msg with
   message class ME. */
1542 va_start (args, format);
1543 sys_msg (r, ME, format, args);
/* Transfer control to the point saved in r->bail_out, delivering the
   value 1 there; this function does not return to its caller. */
1547 longjmp (r->bail_out, 1);
1550 /* Reads BYTE_CNT bytes into BUF.
1551 Returns true if exactly BYTE_CNT bytes are successfully read.
1552 Aborts if an I/O error or a partial read occurs.
1553 If EOF_IS_OK, then an immediate end-of-file causes false to be
1554 returned; otherwise, immediate end-of-file causes an abort too. */
1557 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1558 void *buf, size_t byte_cnt)
/* With an element size of 1, fread's result is a byte count. */
1560 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
/* Everything requested was read. */
1561 if (bytes_read == byte_cnt)
/* Short read with the stream's error indicator set: report the
   system error text for errno. */
1563 else if (ferror (r->file))
1564 sys_error (r, _("System error: %s."), strerror (errno));
/* Short read without a stream error: it is an error unless the
   caller accepts end of file and no bytes at all were read. */
1565 else if (!eof_is_ok || bytes_read != 0)
1566 sys_error (r, _("Unexpected end of file."));
1571 /* Reads BYTE_CNT into BUF.
1572 Aborts upon I/O error or if end-of-file is encountered. */
1574 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* Passes false for eof_is_ok, so end of file is reported as an
   error; the result of the call is not used. */
1576 read_bytes_internal (r, false, buf, byte_cnt);
1579 /* Reads BYTE_CNT bytes into BUF.
1580 Returns true if exactly BYTE_CNT bytes are successfully read.
1581 Returns false if an immediate end-of-file is encountered.
1582 Aborts if an I/O error or a partial read occurs. */
1584 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* Passes true for eof_is_ok and returns the result unchanged, so an
   end of file before any byte is read comes back as false. */
1586 return read_bytes_internal (r, true, buf, byte_cnt);
1589 /* Reads a 32-bit signed integer from R and returns its value in host format. */
1592 read_int32 (struct sfm_reader *r)
/* Read sizeof int32 raw bytes (the buffer's declaration is not
   visible in this excerpt), then convert them with
   int32_to_native. */
1595 read_bytes (r, int32, sizeof int32);
1596 return int32_to_native (r, int32);
1599 /* Reads a 64-bit floating-point number from R and returns its
1600 value in host format. */
1602 read_flt64 (struct sfm_reader *r)
/* Read sizeof flt64 raw bytes (the buffer's declaration is not
   visible in this excerpt), then convert them with
   flt64_to_double. */
1605 read_bytes (r, flt64, sizeof flt64);
1606 return flt64_to_double (r, flt64);
1609 /* Reads exactly SIZE - 1 bytes into BUFFER
1610 and stores a null byte into BUFFER[SIZE - 1]. */
1612 read_string (struct sfm_reader *r, char *buffer, size_t size)
/* NOTE(review): SIZE must be at least 1; with SIZE equal to 0 the
   unsigned expression SIZE - 1 wraps around.  No check is visible in
   this excerpt. */
1615 read_bytes (r, buffer, size - 1);
/* The final byte of BUFFER holds the terminator. */
1616 buffer[size - 1] = '\0';
1619 /* Skips BYTES bytes forward in R. */
1621 skip_bytes (struct sfm_reader *r, size_t bytes)
/* Read at most one buffer's worth into a scratch buffer whose
   contents are not used afterwards.  NOTE(review): the buffer's
   declaration, the enclosing loop and the update of BYTES are not
   visible in this excerpt. */
1626 size_t chunk = MIN (sizeof buffer, bytes);
1627 read_bytes (r, buffer, chunk);
1632 /* Returns the value of the 32-bit signed integer at INT32,
1633 converted from the format used by R to the host format. */
1635 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* The file already uses the host's integer format: copy the bytes
   unchanged. */
1638 if (r->integer_format == INTEGER_NATIVE)
1639 memcpy (&x, int32, sizeof x);
/* Otherwise decode the bytes according to the file's integer
   format. */
1641 x = integer_get (r->integer_format, int32, sizeof x);
1645 /* Returns the value of the 64-bit floating point number at
1646 FLT64, converted from the format used by R to the host format. */
1649 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
/* The file already uses the host's double format: copy the bytes
   unchanged. */
1652 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1653 memcpy (&x, flt64, sizeof x);
/* Otherwise convert from the file's float format into a native
   double. */
1655 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);