1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/alloc.h>
29 #include <libpspp/assertion.h>
30 #include <libpspp/message.h>
31 #include <libpspp/compiler.h>
32 #include <libpspp/magic.h>
33 #include <libpspp/misc.h>
34 #include <libpspp/pool.h>
35 #include <libpspp/str.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/array.h>
39 #include <data/case.h>
40 #include <data/casereader-provider.h>
41 #include <data/casereader.h>
42 #include <data/dictionary.h>
43 #include <data/file-handle-def.h>
44 #include <data/file-name.h>
45 #include <data/format.h>
46 #include <data/missing-values.h>
47 #include <data/value-labels.h>
48 #include <data/variable.h>
49 #include <data/value.h>
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
72 size_t value_cnt; /* Number of "union value"s in struct case. */
75 enum integer_format integer_format; /* On-disk integer format. */
76 enum float_format float_format; /* On-disk floating point format. */
77 int flt64_cnt; /* Number of 8-byte units per case. */
78 struct sfm_var *vars; /* Variables. */
79 size_t var_cnt; /* Number of variables. */
80 int32_t case_cnt; /* Number of cases */
81 bool has_long_var_names; /* File has a long variable name map */
82 bool has_vls; /* File has one or more very long strings? */
85 bool compressed; /* File is compressed? */
86 double bias; /* Compression bias, usually 100.0. */
87 uint8_t opcodes[8]; /* Current block of opcodes. */
88 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
/* What the case reader needs to know about one variable in a
   system file.
   NOTE(review): the `struct sfm_var' header and braces were not
   present in the reviewed text and are supplied here. */
struct sfm_var
  {
    int width;                  /* 0=numeric, otherwise string width. */
    int case_index;             /* Index into case. */
  };
98 static struct casereader_class sys_file_casereader_class;
100 static bool close_reader (struct sfm_reader *);
102 static struct variable **make_var_by_value_idx (struct sfm_reader *,
103 struct dictionary *);
104 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
108 static void sys_warn (struct sfm_reader *, const char *, ...)
109 PRINTF_FORMAT (2, 3);
111 static void sys_error (struct sfm_reader *, const char *, ...)
115 static void read_bytes (struct sfm_reader *, void *, size_t);
116 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
117 static int32_t read_int32 (struct sfm_reader *);
118 static double read_flt64 (struct sfm_reader *);
119 static void read_string (struct sfm_reader *, char *, size_t);
120 static void skip_bytes (struct sfm_reader *, size_t);
122 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
123 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
125 static struct variable_to_value_map *open_variable_to_value_map (
126 struct sfm_reader *, size_t size);
127 static void close_variable_to_value_map (struct sfm_reader *r,
128 struct variable_to_value_map *);
129 static bool read_variable_to_value_map (struct sfm_reader *,
131 struct variable_to_value_map *,
132 struct variable **var, char **value,
135 static bool close_reader (struct sfm_reader *r);
137 /* Dictionary reader. */
145 static void read_header (struct sfm_reader *, struct dictionary *,
146 int *weight_idx, int *claimed_flt64_cnt,
147 struct sfm_read_info *);
148 static void read_variable_record (struct sfm_reader *, struct dictionary *,
149 int *format_warning_cnt);
150 static void parse_format_spec (struct sfm_reader *, uint32_t,
151 enum which_format, struct variable *,
152 int *format_warning_cnt);
153 static void setup_weight (struct sfm_reader *, int weight_idx,
154 struct variable **var_by_value_idx,
155 struct dictionary *);
156 static void read_documents (struct sfm_reader *, struct dictionary *);
157 static void read_value_labels (struct sfm_reader *, struct dictionary *,
158 struct variable **var_by_value_idx);
160 static void read_extension_record (struct sfm_reader *, struct dictionary *);
161 static void read_machine_int32_info (struct sfm_reader *,
162 size_t size, size_t count);
163 static void read_machine_flt64_info (struct sfm_reader *,
164 size_t size, size_t count);
165 static void read_display_parameters (struct sfm_reader *,
166 size_t size, size_t count,
167 struct dictionary *);
168 static void read_long_var_name_map (struct sfm_reader *,
169 size_t size, size_t count,
170 struct dictionary *);
171 static void read_long_string_map (struct sfm_reader *,
172 size_t size, size_t count,
173 struct dictionary *);
176 /* Opens the system file designated by file handle FH for
177 reading. Reads the system file's dictionary into *DICT.
178 If INFO is non-null, then it receives additional info about the
181 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
182 struct sfm_read_info *info)
184 struct sfm_reader *volatile r = NULL;
185 struct variable **var_by_value_idx;
186 int format_warning_cnt = 0;
188 int claimed_flt64_cnt;
192 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
195 *dict = dict_create ();
197 /* Create and initialize reader. */
198 r = pool_create_container (struct sfm_reader, pool);
200 r->file = fn_open (fh_get_file_name (fh), "rb");
204 r->has_long_var_names = false;
205 r->opcode_idx = sizeof r->opcodes;
207 if (setjmp (r->bail_out))
210 dict_destroy (*dict);
217 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
218 fh_get_file_name (r->fh), strerror (errno));
219 longjmp (r->bail_out, 1);
223 read_header (r, *dict, &weight_idx, &claimed_flt64_cnt, info);
225 /* Read all the variable definition records. */
226 rec_type = read_int32 (r);
227 while (rec_type == 2)
229 read_variable_record (r, *dict, &format_warning_cnt);
230 rec_type = read_int32 (r);
233 /* Figure out the case format. */
234 var_by_value_idx = make_var_by_value_idx (r, *dict);
235 setup_weight (r, weight_idx, var_by_value_idx, *dict);
237 /* Read all the rest of the dictionary records. */
238 while (rec_type != 999)
243 read_value_labels (r, *dict, var_by_value_idx);
247 sys_error (r, _("Misplaced type 4 record."));
250 read_documents (r, *dict);
254 read_extension_record (r, *dict);
258 sys_error (r, _("Unrecognized record type %d."), rec_type);
260 rec_type = read_int32 (r);
264 if ( ! r->has_long_var_names )
267 for (i = 0; i < dict_get_var_cnt (*dict); i++)
269 struct variable *var = dict_get_var (*dict, i);
270 char short_name [SHORT_NAME_LEN + 1];
271 char long_name [SHORT_NAME_LEN + 1];
273 strcpy (short_name, var_get_name (var));
275 strcpy (long_name, short_name);
276 str_lowercase (long_name);
278 /* Set long name. Renaming a variable may clear the short
279 name, but we want to retain it, so re-set it
281 dict_rename_var (*dict, var, long_name);
282 var_set_short_name (var, 0, short_name);
285 r->has_long_var_names = true;
288 /* Read record 999 data, which is just filler. */
291 if (claimed_flt64_cnt != -1 && claimed_flt64_cnt != r->flt64_cnt)
292 sys_warn (r, _("File header claims %d variable positions but "
293 "%d were read from file."),
294 claimed_flt64_cnt, r->flt64_cnt);
296 /* Create an index of dictionary variable widths for
297 sfm_read_case to use. We cannot use the `struct variable's
298 from the dictionary we created, because the caller owns the
299 dictionary and may destroy or modify its variables. */
300 r->var_cnt = dict_get_var_cnt (*dict);
301 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
302 for (i = 0; i < r->var_cnt; i++)
304 struct variable *v = dict_get_var (*dict, i);
305 struct sfm_var *sv = &r->vars[i];
306 sv->width = var_get_width (v);
307 sv->case_index = var_get_case_index (v);
310 pool_free (r->pool, var_by_value_idx);
311 r->value_cnt = dict_get_next_value_idx (*dict);
312 return casereader_create_sequential
314 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
315 &sys_file_casereader_class, r);
318 /* Closes a system file after we're done with it.
319 Returns true if an I/O error has occurred on READER, false
322 close_reader (struct sfm_reader *r)
331 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
333 msg (ME, _("Error closing system file \"%s\": %s."),
334 fh_get_file_name (r->fh), strerror (errno));
341 fh_close (r->fh, "system file", "rs");
344 pool_destroy (r->pool);
349 /* Destroys READER. */
351 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
353 struct sfm_reader *r = r_;
/* Returns true if FILE is an SPSS system file, that is, if its
   next four bytes are "$FL2"; returns false otherwise, including
   when fewer than four bytes can be read.  Consumes the bytes it
   examines. */
bool
sfm_detect (FILE *file)
{
  static const char signature[4] = {'$', 'F', 'L', '2'};
  char magic[sizeof signature];

  return (fread (magic, sizeof magic, 1, file) == 1
          && memcmp (magic, signature, sizeof signature) == 0);
}
371 /* Reads the global header of the system file.
372 Sets DICT's file label to the system file's label.
373 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
374 or to the value index of the weight variable otherwise.
375 Sets *CLAIMED_FLT64_CNT to the number of values that the file
376 claims to have (although it is not always correct).
377 If INFO is non-null, initializes *INFO with header
380 read_header (struct sfm_reader *r, struct dictionary *dict,
381 int *weight_idx, int *claimed_flt64_cnt,
382 struct sfm_read_info *info)
385 char eye_catcher[61];
386 uint8_t raw_layout_code[4];
388 char creation_date[10];
389 char creation_time[9];
391 struct substring file_label_ss;
393 read_string (r, rec_type, sizeof rec_type);
394 read_string (r, eye_catcher, sizeof eye_catcher);
396 if (strcmp ("$FL2", rec_type) != 0)
397 sys_error (r, _("This is not an SPSS system file."));
399 /* Identify integer format. */
400 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
401 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
403 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
405 || (r->integer_format != INTEGER_MSB_FIRST
406 && r->integer_format != INTEGER_LSB_FIRST))
407 sys_error (r, _("This is not an SPSS system file."));
409 *claimed_flt64_cnt = read_int32 (r);
410 if (*claimed_flt64_cnt < 0 || *claimed_flt64_cnt > INT_MAX / 16)
411 *claimed_flt64_cnt = -1;
413 r->compressed = read_int32 (r) != 0;
415 *weight_idx = read_int32 (r);
417 r->case_cnt = read_int32 (r);
418 if ( r->case_cnt > INT_MAX / 2)
422 /* Identify floating-point format and obtain compression bias. */
423 read_bytes (r, raw_bias, sizeof raw_bias);
424 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
426 sys_warn (r, _("Compression bias (%g) is not the usual "
427 "value of 100, or system file uses unrecognized "
428 "floating-point format."),
430 if (r->integer_format == INTEGER_MSB_FIRST)
431 r->float_format = FLOAT_IEEE_DOUBLE_BE;
433 r->float_format = FLOAT_IEEE_DOUBLE_LE;
435 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
437 read_string (r, creation_date, sizeof creation_date);
438 read_string (r, creation_time, sizeof creation_time);
439 read_string (r, file_label, sizeof file_label);
442 file_label_ss = ss_cstr (file_label);
443 ss_trim (&file_label_ss, ss_cstr (" "));
444 if (!ss_is_empty (file_label_ss))
446 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
447 dict_set_label (dict, ss_data (file_label_ss));
452 struct substring product;
454 strcpy (info->creation_date, creation_date);
455 strcpy (info->creation_time, creation_time);
456 info->integer_format = r->integer_format;
457 info->float_format = r->float_format;
458 info->compressed = r->compressed;
459 info->case_cnt = r->case_cnt;
461 product = ss_cstr (eye_catcher);
462 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
463 ss_trim (&product, ss_cstr (" "));
464 str_copy_buf_trunc (info->product, sizeof info->product,
465 ss_data (product), ss_length (product));
469 /* Reads a variable (type 2) record from R and adds the
470 corresponding variable to DICT.
471 Also skips past additional variable records for long string
474 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
475 int *format_warning_cnt)
478 int has_variable_label;
479 int missing_value_code;
484 struct variable *var;
487 width = read_int32 (r);
488 has_variable_label = read_int32 (r);
489 missing_value_code = read_int32 (r);
490 print_format = read_int32 (r);
491 write_format = read_int32 (r);
492 read_string (r, name, sizeof name);
493 name[strcspn (name, " ")] = '\0';
495 /* Check variable name. */
496 if (name[0] == '$' || name[0] == '#')
497 sys_error (r, "Variable name begins with invalid character `%c'.",
499 if (!var_is_plausible_name (name, false))
500 sys_error (r, _("Invalid variable name `%s'."), name);
502 /* Create variable. */
503 if (width < 0 || width > 255)
504 sys_error (r, _("Bad variable width %d."), width);
505 var = dict_create_var (dict, name, width);
508 _("Duplicate variable name `%s' within system file."),
511 /* Set the short name the same as the long name. */
512 var_set_short_name (var, 0, var_get_name (var));
514 /* Get variable label, if any. */
515 if (has_variable_label != 0 && has_variable_label != 1)
516 sys_error (r, _("Variable label indicator field is not 0 or 1."));
517 if (has_variable_label == 1)
522 len = read_int32 (r);
523 if (len >= sizeof label)
524 sys_error (r, _("Variable %s has label of invalid length %u."),
525 name, (unsigned int) len);
526 read_string (r, label, len + 1);
527 var_set_label (var, label);
529 skip_bytes (r, ROUND_UP (len, 4) - len);
532 /* Set missing values. */
533 if (missing_value_code < -3 || missing_value_code > 3
534 || missing_value_code == -1)
535 sys_error (r, _("Missing value indicator field is not "
536 "-3, -2, 0, 1, 2, or 3."));
537 if (missing_value_code != 0)
539 struct missing_values mv;
540 mv_init (&mv, var_get_width (var));
541 if (var_is_numeric (var))
543 if (missing_value_code > 0)
546 for (i = 0; i < missing_value_code; i++)
547 mv_add_num (&mv, read_flt64 (r));
551 double low = read_flt64 (r);
552 double high = read_flt64 (r);
553 mv_add_num_range (&mv, low, high);
554 if (missing_value_code == -3)
555 mv_add_num (&mv, read_flt64 (r));
558 else if (var_get_width (var) <= MAX_SHORT_STRING)
560 if (missing_value_code > 0)
563 for (i = 0; i < missing_value_code; i++)
566 read_string (r, string, sizeof string);
567 mv_add_str (&mv, string);
571 sys_error (r, _("String variable %s may not have missing "
572 "values specified as a range."),
575 else /* var->width > MAX_SHORT_STRING */
576 sys_error (r, _("Long string variable %s may not have missing "
579 var_set_missing_values (var, &mv);
583 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
584 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
586 /* Account for values.
587 Skip long string continuation records, if any. */
588 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
594 for (i = 1; i < nv; i++)
596 /* Check for record type 2 and width -1. */
597 if (read_int32 (r) != 2 || read_int32 (r) != -1)
598 sys_error (r, _("Missing string continuation record."));
600 /* Skip and ignore remaining continuation data. */
601 has_variable_label = read_int32 (r);
602 missing_value_code = read_int32 (r);
603 print_format = read_int32 (r);
604 write_format = read_int32 (r);
605 read_string (r, name, sizeof name);
607 /* Variable label fields on continuation records have
608 been spotted in system files created by "SPSS Power
609 Macintosh Release 6.1". */
610 if (has_variable_label)
611 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
616 /* Translates the format spec from sysfile format to internal
619 parse_format_spec (struct sfm_reader *r, uint32_t s,
620 enum which_format which, struct variable *v,
621 int *format_warning_cnt)
623 const int max_format_warnings = 8;
625 uint8_t raw_type = s >> 16;
631 if (!fmt_from_io (raw_type, &f.type))
632 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
637 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
642 if (which == PRINT_FORMAT)
643 var_set_print_format (v, &f);
645 var_set_write_format (v, &f);
647 else if (*++format_warning_cnt <= max_format_warnings)
649 char fmt_string[FMT_STRING_LEN_MAX + 1];
650 sys_warn (r, _("%s variable %s has invalid %s format %s."),
651 var_is_numeric (v) ? _("Numeric") : _("String"),
653 which == PRINT_FORMAT ? _("print") : _("write"),
654 fmt_to_string (&f, fmt_string));
656 if (*format_warning_cnt == max_format_warnings)
657 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable that corresponds to WEIGHT_IDX the weighting
   variable of DICT.  The index is resolved through
   VAR_BY_VALUE_IDX with lookup_var_by_value_idx(); a string
   variable is rejected with a fatal error.

   NOTE(review): the guard that skips all of this for a WEIGHT_IDX
   of 0 was not present in the reviewed text.  It is restored here
   because read_header() documents 0 as "unweighted"; confirm. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *weight_var;

  if (weight_idx == 0)
    return;

  weight_var = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (!var_is_numeric (weight_var))
    sys_error (r, _("Weighting variable must be numeric."));
  dict_set_weight (dict, weight_var);
}
679 /* Reads a document record, type 6, from system file R, and sets up
680 the documents and n_documents fields in the associated
683 read_documents (struct sfm_reader *r, struct dictionary *dict)
688 if (dict_get_documents (dict) != NULL)
689 sys_error (r, _("Multiple type 6 (document) records."));
691 line_cnt = read_int32 (r);
693 sys_error (r, _("Number of document lines (%d) "
694 "must be greater than 0."), line_cnt);
696 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
697 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
698 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
699 dict_set_documents (dict, documents);
701 sys_error (r, _("Document line contains null byte."));
702 pool_free (r->pool, documents);
/* Reads a type 7 extension record: its subtype, element size and
   element count, and then either hands the payload to the routine
   for that subtype or skips SIZE * COUNT bytes.

   The "too large" message is now marked for translation like the
   other messages in this file.

   NOTE(review): the `switch' statement and all of its `case'
   labels were not present in the reviewed text.  Subtypes 3, 4,
   11, 13 and 14 follow the header comments of the routines they
   call; the numbers attached to the silently skipped records (5,
   6, 7, 16, 17) are restored from the system file format
   description and should be confirmed. */
static void
read_extension_record (struct sfm_reader *r, struct dictionary *dict)
{
  int subtype = read_int32 (r);
  size_t size = read_int32 (r);
  size_t count = read_int32 (r);
  size_t bytes = size * count;

  /* Check that SIZE * COUNT + 1 doesn't overflow.  Adding 1
     allows an extra byte for a null terminator, used by some
     extension processing routines. */
  if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
    sys_error (r, _("Record type 7 subtype %d too large."), subtype);

  switch (subtype)
    {
    case 3:
      read_machine_int32_info (r, size, count);
      return;

    case 4:
      read_machine_flt64_info (r, size, count);
      return;

    case 5:
      /* Variable sets information.  We don't use these yet.
         They only apply to GUIs; see VARSETS on the APPLY
         DICTIONARY command in SPSS documentation. */
      break;

    case 6:
      /* DATE variable information.  We don't use it yet. */
      break;

    case 7:
      /* Unknown purpose. */
      break;

    case 11:
      read_display_parameters (r, size, count, dict);
      return;

    case 13:
      read_long_var_name_map (r, size, count, dict);
      return;

    case 14:
      read_long_string_map (r, size, count, dict);
      return;

    case 16:
      /* New in SPSS v14?  Unknown purpose. */
      break;

    case 17:
      /* Text field that defines variable attributes. */
      break;

    default:
      sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
      break;
    }

  /* Reached only for records whose payload was not consumed. */
  skip_bytes (r, bytes);
}
774 /* Read record type 7, subtype 3. */
776 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
778 int version_major UNUSED = read_int32 (r);
779 int version_minor UNUSED = read_int32 (r);
780 int version_revision UNUSED = read_int32 (r);
781 int machine_code UNUSED = read_int32 (r);
782 int float_representation = read_int32 (r);
783 int compression_code UNUSED = read_int32 (r);
784 int integer_representation = read_int32 (r);
785 int character_code UNUSED = read_int32 (r);
787 int expected_float_format;
788 int expected_integer_format;
790 if (size != 4 || count != 8)
791 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
793 (unsigned int) size, (unsigned int) count);
795 /* Check floating point format. */
796 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
797 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
798 expected_float_format = 1;
799 else if (r->float_format == FLOAT_Z_LONG)
800 expected_float_format = 2;
801 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
802 expected_float_format = 3;
805 if (float_representation != expected_float_format)
806 sys_error (r, _("Floating-point representation indicated by "
807 "system file (%d) differs from expected (%d)."),
808 r->float_format, expected_float_format);
810 /* Check integer format. */
811 if (r->integer_format == INTEGER_MSB_FIRST)
812 expected_integer_format = 1;
813 else if (r->integer_format == INTEGER_LSB_FIRST)
814 expected_integer_format = 2;
817 if (integer_representation != expected_integer_format)
819 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
820 sys_warn (r, _("Integer format indicated by system file (%s) "
821 "differs from expected (%s)."),
822 gettext (endian[integer_representation == 1]),
823 gettext (endian[expected_integer_format == 1]));
827 /* Read record type 7, subtype 4. */
829 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
831 double sysmis = read_flt64 (r);
832 double highest = read_flt64 (r);
833 double lowest = read_flt64 (r);
835 if (size != 8 || count != 3)
836 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
837 (unsigned int) size, (unsigned int) count);
839 if (sysmis != SYSMIS)
840 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
841 if (highest != HIGHEST)
842 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
843 if (lowest != LOWEST)
844 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
847 /* Read record type 7, subtype 11, which specifies how variables
848 should be displayed in GUI environments. */
850 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
851 struct dictionary *dict)
853 const size_t n_vars = count / 3 ;
857 if (count % 3 || n_vars != dict_get_var_cnt (dict))
858 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
859 (unsigned int) size, (unsigned int) count);
861 for (i = 0; i < n_vars; ++i)
863 int measure = read_int32 (r);
864 int width = read_int32 (r);
865 int align = read_int32 (r);
866 struct variable *v = dict_get_var (dict, i);
868 /* spss v14 sometimes seems to set string variables' measure to zero */
869 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
872 if (measure < 1 || measure > 3 || align < 0 || align > 2)
875 sys_warn (r, _("Invalid variable display parameters. "
876 "Default parameters substituted."));
881 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
882 : measure == 2 ? MEASURE_ORDINAL
884 var_set_display_width (v, width);
885 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
886 : align == 1 ? ALIGN_RIGHT
891 /* Reads record type 7, subtype 13, which gives the long name
892 that corresponds to each short name. Modifies variable names
893 in DICT accordingly. */
895 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
896 struct dictionary *dict)
898 struct variable_to_value_map *map;
899 struct variable *var;
903 map = open_variable_to_value_map (r, size * count);
904 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
908 size_t short_name_cnt;
911 /* Validate long name. */
912 if (!var_is_valid_name (long_name, false))
914 sys_warn (r, _("Long variable mapping from %s to invalid "
915 "variable name `%s'."),
916 var_get_name (var), long_name);
920 /* Identify any duplicates. */
921 if (strcasecmp (var_get_short_name (var, 0), long_name)
922 && dict_lookup_var (dict, long_name) != NULL)
924 sys_warn (r, _("Duplicate long variable name `%s' "
925 "within system file."), long_name);
929 /* Renaming a variable may clear its short names, but we
930 want to retain them, so we save them and re-set them
932 short_name_cnt = var_get_short_name_cnt (var);
933 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
934 for (i = 0; i < short_name_cnt; i++)
936 const char *s = var_get_short_name (var, i);
937 short_names[i] = s != NULL ? xstrdup (s) : NULL;
941 dict_rename_var (dict, var, long_name);
943 /* Restore short names. */
944 for (i = 0; i < short_name_cnt; i++)
946 var_set_short_name (var, i, short_names[i]);
947 free (short_names[i]);
950 close_variable_to_value_map (r, map);
951 r->has_long_var_names = true;
954 /* Reads record type 7, subtype 14, which gives the real length
955 of each very long string. Rearranges DICT accordingly. */
957 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
958 struct dictionary *dict)
960 struct variable_to_value_map *map;
961 struct variable *var;
967 map = open_variable_to_value_map (r, size * count);
968 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
971 long length, remaining_length;
975 length = strtol (length_s, NULL, 10);
976 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
978 sys_warn (r, _("%s listed as string of length %s "
980 var_get_name (var), length_s);
984 /* Group multiple variables into single variable
985 and delete all but the first. */
986 remaining_length = length;
987 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
988 if (idx < dict_get_var_cnt (dict))
989 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
990 EFFECTIVE_LONG_STRING_LENGTH);
992 sys_error (r, _("Very long string %s overflows dictionary."),
994 dict_delete_consecutive_vars (dict,
995 var_get_dict_index (var) + 1,
996 idx - var_get_dict_index (var) - 1);
998 /* Assign all the length to the first variable. */
999 var_set_width (var, length);
1001 close_variable_to_value_map (r, map);
1002 dict_compact_values (dict);
1005 /* Reads value labels from sysfile H and inserts them into the
1006 associated dictionary. */
1008 read_value_labels (struct sfm_reader *r,
1009 struct dictionary *dict, struct variable **var_by_value_idx)
1011 struct pool *subpool;
1015 char raw_value[8]; /* Value as uninterpreted bytes. */
1016 union value value; /* Value. */
1017 char *label; /* Null-terminated label string. */
1020 struct label *labels = NULL;
1021 int label_cnt; /* Number of labels. */
1023 struct variable **var = NULL; /* Associated variables. */
1024 int var_cnt; /* Number of associated variables. */
1028 subpool = pool_create_subpool (r->pool);
1030 /* Read the type 3 record and record its contents. We can't do
1031 much with the data yet because we don't know whether it is
1032 of numeric or string type. */
1034 /* Read number of labels. */
1035 label_cnt = read_int32 (r);
1037 if (label_cnt >= INT32_MAX / sizeof *labels)
1039 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1044 /* Read each value/label tuple into labels[]. */
1045 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1046 for (i = 0; i < label_cnt; i++)
1048 struct label *label = labels + i;
1049 unsigned char label_len;
1053 read_bytes (r, label->raw_value, sizeof label->raw_value);
1055 /* Read label length. */
1056 read_bytes (r, &label_len, sizeof label_len);
1057 padded_len = ROUND_UP (label_len + 1, 8);
1059 /* Read label, padding. */
1060 label->label = pool_alloc (subpool, padded_len + 1);
1061 read_bytes (r, label->label, padded_len - 1);
1062 label->label[label_len] = 0;
1065 /* Now, read the type 4 record that has the list of variables
1066 to which the value labels are to be applied. */
1068 /* Read record type of type 4 record. */
1069 if (read_int32 (r) != 4)
1070 sys_error (r, _("Variable index record (type 4) does not immediately "
1071 "follow value label record (type 3) as it should."));
1073 /* Read number of variables associated with value label from type 4
1075 var_cnt = read_int32 (r);
1076 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1077 sys_error (r, _("Number of variables associated with a value label (%d) "
1078 "is not between 1 and the number of variables (%u)."),
1079 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1081 /* Read the list of variables. */
1082 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1083 for (i = 0; i < var_cnt; i++)
1085 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1086 if (var_is_long_string (var[i]))
1087 sys_error (r, _("Value labels are not allowed on long string "
1088 "variables (%s)."), var_get_name (var[i]));
1091 /* Type check the variables. */
1092 for (i = 1; i < var_cnt; i++)
1093 if (var_get_type (var[i]) != var_get_type (var[0]))
1094 sys_error (r, _("Variables associated with value label are not all of "
1095 "identical type. Variable %s is %s, but variable "
1097 var_get_name (var[0]),
1098 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1099 var_get_name (var[i]),
1100 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1102 /* Fill in labels[].value, now that we know the desired type. */
1103 for (i = 0; i < label_cnt; i++)
1105 struct label *label = labels + i;
1107 if (var_is_alpha (var[0]))
1108 buf_copy_rpad (label->value.s, sizeof label->value.s,
1109 label->raw_value, sizeof label->raw_value);
1111 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1114 /* Assign the `value_label's to each variable. */
1115 for (i = 0; i < var_cnt; i++)
1117 struct variable *v = var[i];
1120 /* Add each label to the variable. */
1121 for (j = 0; j < label_cnt; j++)
1123 struct label *label = &labels[j];
1124 if (!var_add_value_label (v, &label->value, label->label))
1126 if (var_is_numeric (var[0]))
1127 sys_warn (r, _("Duplicate value label for %g on %s."),
1128 label->value.f, var_get_name (v));
1130 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1131 var_get_width (v), label->value.s,
1137 pool_destroy (subpool);
1142 static void partial_record (struct sfm_reader *r)
1145 static void read_error (struct casereader *, const struct sfm_reader *);
1148 static bool read_case_number (struct sfm_reader *, double *);
1149 static bool read_case_string (struct sfm_reader *, char *, size_t);
1150 static int read_opcode (struct sfm_reader *);
1151 static bool read_compressed_number (struct sfm_reader *, double *);
1152 static bool read_compressed_string (struct sfm_reader *, char *);
1153 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1155 /* Reads one case from READER's file into C. Returns true only
/* Read callback listed in sys_file_casereader_class: reads the next
   case from the system file reader R_ into C, which is first created
   with r->value_cnt values.  Any sys_error raised while reading
   longjmps back to the setjmp below and puts READER into its error
   state. */
1158 sys_file_casereader_read (struct casereader *reader, void *r_,
1161 struct sfm_reader *r = r_;
1165 case_create (c, r->value_cnt);
/* sys_error returns control here through r->bail_out. */
1166 if (setjmp (r->bail_out))
1168 casereader_force_error (reader);
/* The whole case can be read in one call only when the data is not
   compressed, the host double is 8 bytes wide and r->has_vls is
   clear (presumably "has very long strings" -- confirm). */
1173 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1175 /* Fast path. Read the whole case directly. */
1176 if (!try_read_bytes (r, case_data_all_rw (c),
1177 sizeof (union value) * r->flt64_cnt))
/* NOTE(review): case_cnt == -1 presumably means the number of cases
   is unknown, so running out of data is only reported when a count
   is known -- confirm against the header-reading code. */
1180 if ( r->case_cnt != -1 )
1181 read_error (reader, r);
1185 /* Convert floating point numbers to native format if needed. */
1186 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1190 for (i = 0; i < r->var_cnt; i++)
/* Only entries with width 0 are converted in place as doubles
   (presumably width 0 marks a numeric variable -- confirm). */
1191 if (r->vars[i].width == 0)
1193 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1194 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1201 /* Slow path. Convert from external to internal format. */
1204 for (i = 0; i < r->var_cnt; i++)
1206 struct sfm_var *sv = &r->vars[i];
1207 union value *v = case_data_rw_idx (c, sv->case_index);
1211 if (!read_case_number (r, &v->f))
1216 /* Read the string data in segments up to 255 bytes
1217 at a time, packed into 8-byte units. */
1218 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1219 int ofs, chunk_size;
1220 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1222 chunk_size = MIN (max_chunk, sv->width - ofs);
1223 if (!read_case_string (r, v->s + ofs, chunk_size))
1231 /* Very long strings have trailing wasted space
1232 that we must skip. */
1233 if (sv->width >= MIN_VERY_LONG_STRING)
/* Bytes consumed so far: 256 for every full max_chunk-byte segment
   (assuming max_chunk is 255, which read_case_string rounds up to a
   multiple of 8) plus the final partial segment rounded up to 8. */
1235 int bytes_read = (sv->width / max_chunk * 256
1236 + ROUND_UP (sv->width % max_chunk, 8));
1237 int total_bytes = sfm_width_to_bytes (sv->width);
1238 int excess_bytes = total_bytes - bytes_read;
/* Discard the padding through a scratch buffer, one chunk at a time. */
1240 while (excess_bytes > 0)
1243 size_t chunk = MIN (sizeof buffer, excess_bytes);
1244 if (!read_whole_strings (r, buffer, chunk))
1246 excess_bytes -= chunk;
/* Same check as on the fast path: a read error is reported only when
   case_cnt is not -1. */
1257 if ( r->case_cnt != -1 )
1258 read_error (reader, r);
1263 /* Issues an error that R's file ends in a partial case.
   The error is raised with sys_error, which longjmps to
   r->bail_out instead of returning. */
1265 partial_record (struct sfm_reader *r)
1267 sys_error (r, _("File ends in partial case."));
/* Reports an error reading a case from SFM's file, naming its file
   handle, and forces casereader R into an error state. */
1271 read_error (struct casereader *r, const struct sfm_reader *sfm)
1273 msg (ME, _("Error reading case from file %s"), fh_get_name (sfm->fh));
1274 casereader_force_error (r);
1277 /* Reads a number from R and stores its value in *D.
1278 If R is compressed, reads a compressed number;
1279 otherwise, reads sizeof flt64 raw bytes and converts them to a
   host double with flt64_to_double.
1280 Returns true if successful, false if end of file is
1281 reached immediately. */
1283 read_case_number (struct sfm_reader *r, double *d)
/* Uncompressed data: try_read_bytes is used so that an immediate end
   of file yields false instead of an abort. */
1288 if (!try_read_bytes (r, flt64, sizeof flt64))
1290 *d = flt64_to_double (r, flt64);
/* Compressed data: the value is decoded from the opcode stream. */
1294 return read_compressed_number (r, d);
1297 /* Reads LENGTH string bytes from R into S.
1298 Always reads a multiple of 8 bytes; if LENGTH is not a
1299 multiple of 8, then extra bytes are read and discarded without
   being stored into S.
1301 Reads compressed strings if R is compressed.
1302 Returns true if successful, false if end of file is
1303 reached immediately. */
1305 read_case_string (struct sfm_reader *r, char *s, size_t length)
/* WHOLE is the part of LENGTH made of complete 8-byte units;
   PARTIAL is what remains. */
1307 size_t whole = ROUND_DOWN (length, 8);
1308 size_t partial = length % 8;
1312 if (!read_whole_strings (r, s, whole))
/* The final unit goes through a bounce buffer so that only PARTIAL
   bytes of it reach S. */
1319 if (!read_whole_strings (r, bounce, sizeof bounce))
1325 memcpy (s + whole, bounce, partial);
1331 /* Reads and returns the next compression opcode from R.
   Opcodes are taken from the r->opcodes buffer, which is refilled
   from the file once r->opcode_idx has reached its end. */
1333 read_opcode (struct sfm_reader *r)
/* Opcodes only exist in compressed files. */
1335 assert (r->compressed);
1339 if (r->opcode_idx >= sizeof r->opcodes)
/* try_read_bytes reports an immediate end of file by returning false
   rather than aborting. */
1341 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1345 opcode = r->opcodes[r->opcode_idx++];
1352 /* Reads a compressed number from R and stores its value in *D.
1353 Returns true if successful, false if end of file is
1354 reached immediately.
   Depending on the opcode, the value is either read in full from
   the data stream or derived from the opcode and r->bias. */
1356 read_compressed_number (struct sfm_reader *r, double *d)
1358 int opcode = read_opcode (r);
/* The value is stored in full as a 64-bit float in the data stream. */
1366 *d = read_flt64 (r);
/* sys_error longjmps to r->bail_out and does not return. */
1370 sys_error (r, _("Compressed data is corrupt."));
/* The opcode itself encodes the value, offset by the file's bias. */
1377 *d = opcode - r->bias;
1384 /* Reads a compressed 8-byte string segment from R and stores it
   in DST.
1386 Returns true if successful, false if end of file is
1387 reached immediately. */
1389 read_compressed_string (struct sfm_reader *r, char *dst)
1391 switch (read_opcode (r))
/* The segment is stored literally in the data stream. */
1398 read_bytes (r, dst, 8);
/* The opcode stands for a segment of eight spaces. */
1402 memset (dst, ' ', 8);
/* sys_error longjmps to r->bail_out and does not return. */
1406 sys_error (r, _("Compressed data is corrupt."));
1412 /* Reads LENGTH string bytes from R into S.
1413 LENGTH must be a multiple of 8.
1414 Reads compressed strings if R is compressed.
1415 Returns true if successful, false if end of file is
1416 reached immediately. */
1418 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1420 assert (length % 8 == 0);
/* Direct read of all LENGTH bytes in one call. */
1422 return try_read_bytes (r, s, length);
/* Otherwise decode the data one 8-byte segment at a time. */
1426 for (ofs = 0; ofs < length; ofs += 8)
1427 if (!read_compressed_string (r, s + ofs))
1437 /* Creates and returns a table that can be used for translating a value
1438 index into a case to a "struct variable *" for DICT. Multiple
1439 system file fields reference variables this way.
1441 This table must be created before processing the very long
1442 string extension record, because that record causes some
1443 values to be deleted from the case and the dictionary to be modified. */
/* Builds, in R's pool, an array of r->flt64_cnt entries, one per
   8-byte value in a case.  The entry for the first value of each
   variable in DICT points to that variable; the entries for the
   remaining values of a string variable are null. */
1445 static struct variable **
1446 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1448 struct variable **var_by_value_idx;
1452 var_by_value_idx = pool_nmalloc (r->pool,
1453 r->flt64_cnt, sizeof *var_by_value_idx);
1454 for (i = 0; i < dict_get_var_cnt (dict); i++)
1456 struct variable *v = dict_get_var (dict, i);
/* A numeric variable takes one value; a string takes one value per
   8 bytes of width, rounded up. */
1457 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1460 var_by_value_idx[value_idx++] = v;
/* Mark the continuation values of a string as not addressable. */
1461 for (j = 1; j < nv; j++)
1462 var_by_value_idx[value_idx++] = NULL;
/* The variables must account for every value in the case. */
1464 assert (value_idx == r->flt64_cnt);
1466 return var_by_value_idx;
1469 /* Returns the "struct variable" corresponding to the given
1470 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index is in range. */
/* Translates the 1-based VALUE_IDX into a variable by way of
   VAR_BY_VALUE_IDX.  An index outside 1...r->flt64_cnt is reported
   with sys_error, which longjmps to r->bail_out and does not
   return. */
1472 static struct variable *
1473 lookup_var_by_value_idx (struct sfm_reader *r,
1474 struct variable **var_by_value_idx, int value_idx)
1476 struct variable *var;
1478 if (value_idx < 1 || value_idx > r->flt64_cnt)
1479 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1480 value_idx, r->flt64_cnt);
/* VALUE_IDX counts from 1; the table is indexed from 0. */
1482 var = var_by_value_idx[value_idx - 1];
1484 sys_error (r, _("Variable index %d refers to long string "
1491 /* Returns the variable in D with the given SHORT_NAME,
1492 or a null pointer if there is none.
   Short names are compared without regard to case. */
1493 static struct variable *
1494 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1496 struct variable *var;
1500 /* First try looking up by full name. This often succeeds. */
1501 var = dict_lookup_var (d, short_name);
/* A hit by full name only counts if its short name matches too. */
1502 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
1505 /* Iterate through the whole dictionary as a fallback. */
1506 var_cnt = dict_get_var_cnt (d);
1507 for (i = 0; i < var_cnt; i++)
1509 var = dict_get_var (d, i);
1510 if (!strcasecmp (var_get_short_name (var, 0), short_name))
1517 /* Helpers for reading records that contain "variable=value" pairs. */
/* Parsing state for one record of "variable=value" pairs. */
1521 struct variable_to_value_map
1523 struct substring buffer; /* Record contents. */
1524 size_t pos; /* Current position in buffer. */
1527 /* Reads SIZE bytes into a "variable=value" map for R,
1528 and returns the map.
   Both the map and its buffer are allocated from R's pool. */
1529 static struct variable_to_value_map *
1530 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1532 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated: read_variable_to_value_map
   stores null terminators in place, and the last one may fall just
   past the record's contents. */
1533 char *buffer = pool_malloc (r->pool, size + 1);
1534 read_bytes (r, buffer, size);
1535 map->buffer = ss_buffer (buffer, size);
1540 /* Closes MAP and frees its storage.
1541 Not really needed, because the pool will free the map anyway,
1542 but can be used to free it earlier. */
1544 close_variable_to_value_map (struct sfm_reader *r,
1545 struct variable_to_value_map *map)
/* Hands the record buffer back to R's pool. */
1547 pool_free (r->pool, ss_data (map->buffer));
1550 /* Reads the next variable=value pair from MAP.
1551 Looks up the variable in DICT and stores it into *VAR.
1552 Stores a null-terminated value into *VALUE.
   A pair whose variable is not found in DICT is reported with a
   warning, but only the first max_warnings times as counted in
   *WARNING_CNT; once MAP is exhausted the number of suppressed
   warnings is reported in a single message. */
1554 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1555 struct variable_to_value_map *map,
1556 struct variable **var, char **value,
/* Limit on individually reported unknown-variable warnings. */
1559 int max_warnings = 5;
1563 struct substring short_name_ss, value_ss;
/* A pair is a short name up to "=" followed by a value up to a tab
   or null byte. */
1565 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1566 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1569 if (*warning_cnt > max_warnings)
1570 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1571 *warning_cnt - max_warnings);
/* Step over any further tab or null separators. */
1575 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1576 ss_buffer ("\t\0", 2));
/* Terminate the name in place inside the record buffer. */
1578 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1579 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Use the same limit as the summary message above rather than a
   separate literal. */
1582 if (++*warning_cnt <= max_warnings)
1583 sys_warn (r, _("Variable map refers to unknown variable %s."),
1584 ss_data (short_name_ss));
/* Terminate the value in place inside the record buffer. */
1588 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1589 *value = ss_data (value_ss);
1597 /* Displays a corruption message.
   The text is FORMAT expanded with ARGS, prefixed with the name of
   R's file and the current offset in it.  CLASS determines the
   message's category and severity. */
1599 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1604 ds_init_empty (&text);
1605 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1606 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1607 ds_put_vformat (&text, format, args);
1609 m.category = msg_class_to_category (class);
1610 m.severity = msg_class_to_severity (class);
/* No file name or line number is attached: the location is already
   given in the text as a file offset. */
1611 m.where.file_name = NULL;
1612 m.where.line_number = 0;
1613 m.text = ds_cstr (&text);
1618 /* Displays a warning for the current file position.
   FORMAT and the arguments after it are passed to sys_msg with
   class MW. */
1620 sys_warn (struct sfm_reader *r, const char *format, ...)
1624 va_start (args, format);
1625 sys_msg (r, MW, format, args);
1629 /* Displays an error for the current file position,
1630 marks it as in an error state,
1631 and aborts reading it using longjmp.
   Control resumes at the most recent setjmp on r->bail_out, so this
   function does not return to its caller. */
1633 sys_error (struct sfm_reader *r, const char *format, ...)
1637 va_start (args, format);
1638 sys_msg (r, ME, format, args);
1642 longjmp (r->bail_out, 1);
1645 /* Reads BYTE_CNT bytes into BUF.
1646 Returns true if exactly BYTE_CNT bytes are successfully read.
1647 Aborts if an I/O error or a partial read occurs.
1648 If EOF_IS_OK, then an immediate end-of-file causes false to be
1649 returned; otherwise, immediate end-of-file causes an abort too. */
/* Reads BYTE_CNT bytes from R's file into BUF with fread.  Stream
   errors and unexpected end of file are raised with sys_error, which
   does not return.  Reading nothing at all is tolerated only when
   EOF_IS_OK is set. */
1652 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1653 void *buf, size_t byte_cnt)
1655 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1656 if (bytes_read == byte_cnt)
1658 else if (ferror (r->file))
1659 sys_error (r, _("System error: %s."), strerror (errno));
/* A short read without a stream error means end of file.  It is
   fatal unless the caller accepts EOF and not a single byte was
   read. */
1660 else if (!eof_is_ok || bytes_read != 0)
1661 sys_error (r, _("Unexpected end of file."));
1666 /* Reads BYTE_CNT bytes into BUF.
1667 Aborts upon I/O error or if end-of-file is encountered.
   Calls read_bytes_internal with end of file not allowed. */
1669 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1671 read_bytes_internal (r, false, buf, byte_cnt);
1674 /* Reads BYTE_CNT bytes into BUF.
1675 Returns true if exactly BYTE_CNT bytes are successfully read.
1676 Returns false if an immediate end-of-file is encountered.
1677 Aborts if an I/O error or a partial read occurs.
   Calls read_bytes_internal with end of file allowed. */
1679 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
1681 return read_bytes_internal (r, true, buf, byte_cnt);
1684 /* Reads a 32-bit signed integer from R and returns its value in host format. */
/* Reads sizeof int32 raw bytes from R with read_bytes, which aborts
   on end of file, and returns them converted by int32_to_native. */
1687 read_int32 (struct sfm_reader *r)
1690 read_bytes (r, int32, sizeof int32);
1691 return int32_to_native (r, int32);
1694 /* Reads a 64-bit floating-point number from R and returns its
1695 value in host format.
   The raw bytes are read with read_bytes, which aborts on end of
   file, and converted by flt64_to_double. */
1697 read_flt64 (struct sfm_reader *r)
1700 read_bytes (r, flt64, sizeof flt64);
1701 return flt64_to_double (r, flt64);
1704 /* Reads exactly SIZE - 1 bytes into BUFFER
1705 and stores a null byte into BUFFER[SIZE - 1].
   SIZE is therefore the full size of BUFFER, terminator included. */
1707 read_string (struct sfm_reader *r, char *buffer, size_t size)
1710 read_bytes (r, buffer, size - 1);
1711 buffer[size - 1] = '\0';
1714 /* Skips BYTES bytes forward in R.
   The bytes are read into a scratch buffer and discarded, at most
   sizeof buffer of them per read. */
1716 skip_bytes (struct sfm_reader *r, size_t bytes)
1721 size_t chunk = MIN (sizeof buffer, bytes);
1722 read_bytes (r, buffer, chunk);
1727 /* Returns the value of the 32-bit signed integer at INT32,
1728 converted from the format used by R to the host format. */
1730 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* When the file already uses the host's integer format the bytes are
   copied as they are; memcpy avoids reading INT32 through a pointer
   of another type. */
1733 if (r->integer_format == INTEGER_NATIVE)
1734 memcpy (&x, int32, sizeof x);
/* Otherwise decode the bytes according to the file's integer format. */
1736 x = integer_get (r->integer_format, int32, sizeof x);
1740 /* Returns the value of the 64-bit floating point number at
1741 FLT64, converted from the format used by R to the host format. */
/* Converts the 8 bytes at FLT64 from R's floating-point format to a
   host double. */
1744 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
/* When the file already uses the host's double format the bytes are
   copied as they are. */
1747 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1748 memcpy (&x, flt64, sizeof x);
/* Otherwise convert from the file's format into X. */
1750 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);
1754 static struct casereader_class sys_file_casereader_class =
1756 sys_file_casereader_read,
1757 sys_file_casereader_destroy,