1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <data/sys-file-reader.h>
20 #include <data/sys-file-private.h>
28 #include <libpspp/alloc.h>
29 #include <libpspp/assertion.h>
30 #include <libpspp/message.h>
31 #include <libpspp/compiler.h>
32 #include <libpspp/magic.h>
33 #include <libpspp/misc.h>
34 #include <libpspp/pool.h>
35 #include <libpspp/str.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/array.h>
39 #include <data/case.h>
40 #include <data/casereader-provider.h>
41 #include <data/casereader.h>
42 #include <data/dictionary.h>
43 #include <data/file-handle-def.h>
44 #include <data/file-name.h>
45 #include <data/format.h>
46 #include <data/missing-values.h>
47 #include <data/value-labels.h>
48 #include <data/variable.h>
49 #include <data/value.h>
54 #include "unlocked-io.h"
58 #define _(msgid) gettext (msgid)
59 #define N_(msgid) (msgid)
61 /* System file reader. */
64 /* Resource tracking. */
65 struct pool *pool; /* All system file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 FILE *file; /* File stream. */
71 bool error; /* I/O or corruption error? */
72 size_t value_cnt; /* Number of "union value"s in struct case. */
75 enum integer_format integer_format; /* On-disk integer format. */
76 enum float_format float_format; /* On-disk floating point format. */
77 int flt64_cnt; /* Number of 8-byte units per case. */
78 struct sfm_var *vars; /* Variables. */
79 size_t var_cnt; /* Number of variables. */
80 int32_t case_cnt; /* Number of cases */
81 bool has_long_var_names; /* File has a long variable name map */
82 bool has_vls; /* File has one or more very long strings? */
85 bool compressed; /* File is compressed? */
86 double bias; /* Compression bias, usually 100.0. */
87 uint8_t opcodes[8]; /* Current block of opcodes. */
88 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 /* A variable in a system file. */
94 int width; /* 0=numeric, otherwise string width. */
95 int case_index; /* Index into case. */
98 static struct casereader_class sys_file_casereader_class;
100 static bool close_reader (struct sfm_reader *);
102 static struct variable **make_var_by_value_idx (struct sfm_reader *,
103 struct dictionary *);
104 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
108 static void sys_warn (struct sfm_reader *, const char *, ...)
109 PRINTF_FORMAT (2, 3);
111 static void sys_error (struct sfm_reader *, const char *, ...)
115 static void read_bytes (struct sfm_reader *, void *, size_t);
116 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
117 static int32_t read_int32 (struct sfm_reader *);
118 static double read_flt64 (struct sfm_reader *);
119 static void read_string (struct sfm_reader *, char *, size_t);
120 static void skip_bytes (struct sfm_reader *, size_t);
122 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
123 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
125 static struct variable_to_value_map *open_variable_to_value_map (
126 struct sfm_reader *, size_t size);
127 static void close_variable_to_value_map (struct sfm_reader *r,
128 struct variable_to_value_map *);
129 static bool read_variable_to_value_map (struct sfm_reader *,
131 struct variable_to_value_map *,
132 struct variable **var, char **value,
135 static bool close_reader (struct sfm_reader *r);
137 /* Dictionary reader. */
145 static void read_header (struct sfm_reader *, struct dictionary *,
146 int *weight_idx, int *claimed_flt64_cnt,
147 struct sfm_read_info *);
148 static void read_variable_record (struct sfm_reader *, struct dictionary *,
149 int *format_warning_cnt);
150 static void parse_format_spec (struct sfm_reader *, uint32_t,
151 enum which_format, struct variable *,
152 int *format_warning_cnt);
153 static void setup_weight (struct sfm_reader *, int weight_idx,
154 struct variable **var_by_value_idx,
155 struct dictionary *);
156 static void read_documents (struct sfm_reader *, struct dictionary *);
157 static void read_value_labels (struct sfm_reader *, struct dictionary *,
158 struct variable **var_by_value_idx);
160 static void read_extension_record (struct sfm_reader *, struct dictionary *);
161 static void read_machine_int32_info (struct sfm_reader *,
162 size_t size, size_t count);
163 static void read_machine_flt64_info (struct sfm_reader *,
164 size_t size, size_t count);
165 static void read_display_parameters (struct sfm_reader *,
166 size_t size, size_t count,
167 struct dictionary *);
168 static void read_long_var_name_map (struct sfm_reader *,
169 size_t size, size_t count,
170 struct dictionary *);
171 static void read_long_string_map (struct sfm_reader *,
172 size_t size, size_t count,
173 struct dictionary *);
176 /* Opens the system file designated by file handle FH for
177 reading. Reads the system file's dictionary into *DICT.
178 If INFO is non-null, then it receives additional info about the
181 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
182 struct sfm_read_info *info)
184 struct sfm_reader *volatile r = NULL;
185 struct variable **var_by_value_idx;
186 int format_warning_cnt = 0;
188 int claimed_flt64_cnt;
192 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
195 *dict = dict_create ();
197 /* Create and initialize reader. */
198 r = pool_create_container (struct sfm_reader, pool);
200 r->file = fn_open (fh_get_file_name (fh), "rb");
204 r->has_long_var_names = false;
205 r->opcode_idx = sizeof r->opcodes;
207 if (setjmp (r->bail_out))
210 dict_destroy (*dict);
217 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
218 fh_get_file_name (r->fh), strerror (errno));
219 longjmp (r->bail_out, 1);
223 read_header (r, *dict, &weight_idx, &claimed_flt64_cnt, info);
225 /* Read all the variable definition records. */
226 rec_type = read_int32 (r);
227 while (rec_type == 2)
229 read_variable_record (r, *dict, &format_warning_cnt);
230 rec_type = read_int32 (r);
233 /* Figure out the case format. */
234 var_by_value_idx = make_var_by_value_idx (r, *dict);
235 setup_weight (r, weight_idx, var_by_value_idx, *dict);
237 /* Read all the rest of the dictionary records. */
238 while (rec_type != 999)
243 read_value_labels (r, *dict, var_by_value_idx);
247 sys_error (r, _("Misplaced type 4 record."));
250 read_documents (r, *dict);
254 read_extension_record (r, *dict);
258 sys_error (r, _("Unrecognized record type %d."), rec_type);
260 rec_type = read_int32 (r);
264 if ( ! r->has_long_var_names )
267 for (i = 0; i < dict_get_var_cnt (*dict); i++)
269 struct variable *var = dict_get_var (*dict, i);
270 char short_name [SHORT_NAME_LEN + 1];
271 char long_name [SHORT_NAME_LEN + 1];
273 strcpy (short_name, var_get_name (var));
275 strcpy (long_name, short_name);
276 str_lowercase (long_name);
278 /* Set long name. Renaming a variable may clear the short
279 name, but we want to retain it, so re-set it
281 dict_rename_var (*dict, var, long_name);
282 var_set_short_name (var, short_name);
285 r->has_long_var_names = true;
288 /* Read record 999 data, which is just filler. */
291 if (claimed_flt64_cnt != -1 && claimed_flt64_cnt != r->flt64_cnt)
292 sys_warn (r, _("File header claims %d variable positions but "
293 "%d were read from file."),
294 claimed_flt64_cnt, r->flt64_cnt);
296 /* Create an index of dictionary variable widths for
297 sfm_read_case to use. We cannot use the `struct variable's
298 from the dictionary we created, because the caller owns the
299 dictionary and may destroy or modify its variables. */
300 r->var_cnt = dict_get_var_cnt (*dict);
301 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
302 for (i = 0; i < r->var_cnt; i++)
304 struct variable *v = dict_get_var (*dict, i);
305 struct sfm_var *sv = &r->vars[i];
306 sv->width = var_get_width (v);
307 sv->case_index = var_get_case_index (v);
310 pool_free (r->pool, var_by_value_idx);
311 r->value_cnt = dict_get_next_value_idx (*dict);
312 return casereader_create_sequential
314 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
315 &sys_file_casereader_class, r);
318 /* Closes a system file after we're done with it.
319 Returns true if an I/O error has occurred on READER, false
322 close_reader (struct sfm_reader *r)
331 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
333 msg (ME, _("Error closing system file \"%s\": %s."),
334 fh_get_file_name (r->fh), strerror (errno));
341 fh_close (r->fh, "system file", "rs");
344 pool_destroy (r->pool);
349 /* Destroys READER. */
351 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
353 struct sfm_reader *r = r_;
/* Tests whether FILE starts with the 4-byte system file
   signature "$FL2".  Reads (and therefore consumes) those 4 bytes
   from FILE.
   Returns true if the signature matches, false if fewer than 4
   bytes could be read or the bytes differ. */
bool
sfm_detect (FILE *file)
{
  static const char signature[] = "$FL2";
  char magic[sizeof signature];

  if (fread (magic, sizeof magic - 1, 1, file) != 1)
    return false;

  /* Terminate so that the bytes can be compared as a string. */
  magic[sizeof magic - 1] = '\0';

  return strcmp (signature, magic) == 0;
}
371 /* Reads the global header of the system file.
372 Sets DICT's file label to the system file's label.
373 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
374 or to the value index of the weight variable otherwise.
375 Sets *CLAIMED_FLT64_CNT to the number of values that the file
376 claims to have (although it is not always correct).
377 If INFO is non-null, initializes *INFO with header
380 read_header (struct sfm_reader *r, struct dictionary *dict,
381 int *weight_idx, int *claimed_flt64_cnt,
382 struct sfm_read_info *info)
385 char eye_catcher[61];
386 uint8_t raw_layout_code[4];
388 char creation_date[10];
389 char creation_time[9];
391 struct substring file_label_ss;
393 read_string (r, rec_type, sizeof rec_type);
394 read_string (r, eye_catcher, sizeof eye_catcher);
396 if (strcmp ("$FL2", rec_type) != 0)
397 sys_error (r, _("This is not an SPSS system file."));
399 /* Identify integer format. */
400 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
401 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
403 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
405 || (r->integer_format != INTEGER_MSB_FIRST
406 && r->integer_format != INTEGER_LSB_FIRST))
407 sys_error (r, _("This is not an SPSS system file."));
409 *claimed_flt64_cnt = read_int32 (r);
410 if (*claimed_flt64_cnt < 0 || *claimed_flt64_cnt > INT_MAX / 16)
411 *claimed_flt64_cnt = -1;
413 r->compressed = read_int32 (r) != 0;
415 *weight_idx = read_int32 (r);
417 r->case_cnt = read_int32 (r);
418 if ( r->case_cnt > INT_MAX / 2)
422 /* Identify floating-point format and obtain compression bias. */
423 read_bytes (r, raw_bias, sizeof raw_bias);
424 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
426 sys_warn (r, _("Compression bias (%g) is not the usual "
427 "value of 100, or system file uses unrecognized "
428 "floating-point format."),
430 if (r->integer_format == INTEGER_MSB_FIRST)
431 r->float_format = FLOAT_IEEE_DOUBLE_BE;
433 r->float_format = FLOAT_IEEE_DOUBLE_LE;
435 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
437 read_string (r, creation_date, sizeof creation_date);
438 read_string (r, creation_time, sizeof creation_time);
439 read_string (r, file_label, sizeof file_label);
442 file_label_ss = ss_cstr (file_label);
443 ss_trim (&file_label_ss, ss_cstr (" "));
444 if (!ss_is_empty (file_label_ss))
446 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
447 dict_set_label (dict, ss_data (file_label_ss));
452 struct substring product;
454 strcpy (info->creation_date, creation_date);
455 strcpy (info->creation_time, creation_time);
456 info->integer_format = r->integer_format;
457 info->float_format = r->float_format;
458 info->compressed = r->compressed;
459 info->case_cnt = r->case_cnt;
461 product = ss_cstr (eye_catcher);
462 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
463 ss_trim (&product, ss_cstr (" "));
464 str_copy_buf_trunc (info->product, sizeof info->product,
465 ss_data (product), ss_length (product));
469 /* Reads a variable (type 2) record from R and adds the
470 corresponding variable to DICT.
471 Also skips past additional variable records for long string
474 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
475 int *format_warning_cnt)
478 int has_variable_label;
479 int missing_value_code;
484 struct variable *var;
487 width = read_int32 (r);
488 has_variable_label = read_int32 (r);
489 missing_value_code = read_int32 (r);
490 print_format = read_int32 (r);
491 write_format = read_int32 (r);
492 read_string (r, name, sizeof name);
493 name[strcspn (name, " ")] = '\0';
495 /* Check variable name. */
496 if (name[0] == '$' || name[0] == '#')
497 sys_error (r, "Variable name begins with invalid character `%c'.",
499 if (!var_is_plausible_name (name, false))
500 sys_error (r, _("Invalid variable name `%s'."), name);
502 /* Create variable. */
503 if (width < 0 || width > 255)
504 sys_error (r, _("Bad variable width %d."), width);
505 var = dict_create_var (dict, name, width);
508 _("Duplicate variable name `%s' within system file."),
511 /* Set the short name the same as the long name */
512 var_set_short_name (var, var_get_name (var));
514 /* Get variable label, if any. */
515 if (has_variable_label != 0 && has_variable_label != 1)
516 sys_error (r, _("Variable label indicator field is not 0 or 1."));
517 if (has_variable_label == 1)
522 len = read_int32 (r);
523 if (len >= sizeof label)
524 sys_error (r, _("Variable %s has label of invalid length %u."),
525 name, (unsigned int) len);
526 read_string (r, label, len + 1);
527 var_set_label (var, label);
529 skip_bytes (r, ROUND_UP (len, 4) - len);
532 /* Set missing values. */
533 if (missing_value_code < -3 || missing_value_code > 3
534 || missing_value_code == -1)
535 sys_error (r, _("Missing value indicator field is not "
536 "-3, -2, 0, 1, 2, or 3."));
537 if (missing_value_code != 0)
539 struct missing_values mv;
540 mv_init (&mv, var_get_width (var));
541 if (var_is_numeric (var))
543 if (missing_value_code > 0)
546 for (i = 0; i < missing_value_code; i++)
547 mv_add_num (&mv, read_flt64 (r));
551 double low = read_flt64 (r);
552 double high = read_flt64 (r);
553 mv_add_num_range (&mv, low, high);
554 if (missing_value_code == -3)
555 mv_add_num (&mv, read_flt64 (r));
558 else if (var_get_width (var) <= MAX_SHORT_STRING)
560 if (missing_value_code > 0)
563 for (i = 0; i < missing_value_code; i++)
566 read_string (r, string, sizeof string);
567 mv_add_str (&mv, string);
571 sys_error (r, _("String variable %s may not have missing "
572 "values specified as a range."),
575 else /* var->width > MAX_SHORT_STRING */
576 sys_error (r, _("Long string variable %s may not have missing "
579 var_set_missing_values (var, &mv);
583 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
584 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
586 /* Account for values.
587 Skip long string continuation records, if any. */
588 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
594 for (i = 1; i < nv; i++)
596 /* Check for record type 2 and width -1. */
597 if (read_int32 (r) != 2 || read_int32 (r) != -1)
598 sys_error (r, _("Missing string continuation record."));
600 /* Skip and ignore remaining continuation data. */
601 has_variable_label = read_int32 (r);
602 missing_value_code = read_int32 (r);
603 print_format = read_int32 (r);
604 write_format = read_int32 (r);
605 read_string (r, name, sizeof name);
607 /* Variable label fields on continuation records have
608 been spotted in system files created by "SPSS Power
609 Macintosh Release 6.1". */
610 if (has_variable_label)
611 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
616 /* Translates the format spec from sysfile format to internal
619 parse_format_spec (struct sfm_reader *r, uint32_t s,
620 enum which_format which, struct variable *v,
621 int *format_warning_cnt)
623 const int max_format_warnings = 8;
625 uint8_t raw_type = s >> 16;
631 if (!fmt_from_io (raw_type, &f.type))
632 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
637 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
642 if (which == PRINT_FORMAT)
643 var_set_print_format (v, &f);
645 var_set_write_format (v, &f);
647 else if (*++format_warning_cnt <= max_format_warnings)
649 char fmt_string[FMT_STRING_LEN_MAX + 1];
650 sys_warn (r, _("%s variable %s has invalid %s format %s."),
651 var_is_numeric (v) ? _("Numeric") : _("String"),
653 which == PRINT_FORMAT ? _("print") : _("write"),
654 fmt_to_string (&f, fmt_string));
656 if (*format_warning_cnt == max_format_warnings)
657 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable identified by the 1-based value index
   WEIGHT_IDX the weighting variable of DICT.  Does nothing if
   WEIGHT_IDX is 0.  The variable is looked up through
   VAR_BY_VALUE_IDX; a non-numeric weighting variable is
   reported through sys_error. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *candidate;

  /* Nothing to set up when no weight index is given. */
  if (weight_idx == 0)
    return;

  candidate = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (!var_is_numeric (candidate))
    sys_error (r, _("Weighting variable must be numeric."));
  else
    dict_set_weight (dict, candidate);
}
679 /* Reads a document record, type 6, from system file R, and sets up
680 the documents and n_documents fields in the associated
683 read_documents (struct sfm_reader *r, struct dictionary *dict)
688 if (dict_get_documents (dict) != NULL)
689 sys_error (r, _("Multiple type 6 (document) records."));
691 line_cnt = read_int32 (r);
693 sys_error (r, _("Number of document lines (%d) "
694 "must be greater than 0."), line_cnt);
696 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
697 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
698 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
699 dict_set_documents (dict, documents);
701 sys_error (r, _("Document line contains null byte."));
702 pool_free (r->pool, documents);
705 /* Read a type 7 extension record. */
707 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
709 int subtype = read_int32 (r);
710 size_t size = read_int32 (r);
711 size_t count = read_int32 (r);
712 size_t bytes = size * count;
714 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
715 allows an extra byte for a null terminator, used by some
716 extension processing routines. */
717 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
718 sys_error (r, "Record type 7 subtype %d too large.", subtype);
723 read_machine_int32_info (r, size, count);
727 read_machine_flt64_info (r, size, count);
731 /* Variable sets information. We don't use these yet.
732 They only apply to GUIs; see VARSETS on the APPLY
733 DICTIONARY command in SPSS documentation. */
737 /* DATE variable information. We don't use it yet, but we
742 /* Unknown purpose. */
746 read_display_parameters (r, size, count, dict);
750 read_long_var_name_map (r, size, count, dict);
754 read_long_string_map (r, size, count, dict);
758 /* New in SPSS v14? Unknown purpose. */
762 /* Text field that defines variable attributes. New in
767 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
771 skip_bytes (r, bytes);
774 /* Read record type 7, subtype 3. */
776 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
778 int version_major UNUSED = read_int32 (r);
779 int version_minor UNUSED = read_int32 (r);
780 int version_revision UNUSED = read_int32 (r);
781 int machine_code UNUSED = read_int32 (r);
782 int float_representation = read_int32 (r);
783 int compression_code UNUSED = read_int32 (r);
784 int integer_representation = read_int32 (r);
785 int character_code UNUSED = read_int32 (r);
787 int expected_float_format;
788 int expected_integer_format;
790 if (size != 4 || count != 8)
791 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
793 (unsigned int) size, (unsigned int) count);
795 /* Check floating point format. */
796 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
797 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
798 expected_float_format = 1;
799 else if (r->float_format == FLOAT_Z_LONG)
800 expected_float_format = 2;
801 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
802 expected_float_format = 3;
805 if (float_representation != expected_float_format)
806 sys_error (r, _("Floating-point representation indicated by "
807 "system file (%d) differs from expected (%d)."),
808 r->float_format, expected_float_format);
810 /* Check integer format. */
811 if (r->integer_format == INTEGER_MSB_FIRST)
812 expected_integer_format = 1;
813 else if (r->integer_format == INTEGER_LSB_FIRST)
814 expected_integer_format = 2;
817 if (integer_representation != expected_integer_format)
819 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
820 sys_warn (r, _("Integer format indicated by system file (%s) "
821 "differs from expected (%s)."),
822 gettext (endian[integer_representation == 1]),
823 gettext (endian[expected_integer_format == 1]));
827 /* Read record type 7, subtype 4. */
829 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
831 double sysmis = read_flt64 (r);
832 double highest = read_flt64 (r);
833 double lowest = read_flt64 (r);
835 if (size != 8 || count != 3)
836 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
837 (unsigned int) size, (unsigned int) count);
839 if (sysmis != SYSMIS)
840 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
841 if (highest != HIGHEST)
842 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
843 if (lowest != LOWEST)
844 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
847 /* Read record type 7, subtype 11, which specifies how variables
848 should be displayed in GUI environments. */
850 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
851 struct dictionary *dict)
853 const size_t n_vars = count / 3 ;
857 if (count % 3 || n_vars != dict_get_var_cnt (dict))
858 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
859 (unsigned int) size, (unsigned int) count);
861 for (i = 0; i < n_vars; ++i)
863 int measure = read_int32 (r);
864 int width = read_int32 (r);
865 int align = read_int32 (r);
866 struct variable *v = dict_get_var (dict, i);
868 /* spss v14 sometimes seems to set string variables' measure to zero */
869 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
872 if (measure < 1 || measure > 3 || align < 0 || align > 2)
875 sys_warn (r, _("Invalid variable display parameters. "
876 "Default parameters substituted."));
881 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
882 : measure == 2 ? MEASURE_ORDINAL
884 var_set_display_width (v, width);
885 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
886 : align == 1 ? ALIGN_RIGHT
891 /* Reads record type 7, subtype 13, which gives the long name
892 that corresponds to each short name. Modifies variable names
893 in DICT accordingly. */
895 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
896 struct dictionary *dict)
898 struct variable_to_value_map *map;
899 struct variable *var;
903 map = open_variable_to_value_map (r, size * count);
904 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
907 char short_name[SHORT_NAME_LEN + 1];
908 strcpy (short_name, var_get_short_name (var));
910 /* Validate long name. */
911 if (!var_is_valid_name (long_name, false))
913 sys_warn (r, _("Long variable mapping from %s to invalid "
914 "variable name `%s'."),
915 var_get_name (var), long_name);
919 /* Identify any duplicates. */
920 if (strcasecmp (short_name, long_name)
921 && dict_lookup_var (dict, long_name) != NULL)
923 sys_warn (r, _("Duplicate long variable name `%s' "
924 "within system file."), long_name);
928 /* Set long name. Renaming a variable may clear the short
929 name, but we want to retain it, so re-set it
931 dict_rename_var (dict, var, long_name);
932 var_set_short_name (var, short_name);
934 close_variable_to_value_map (r, map);
935 r->has_long_var_names = true;
938 /* Reads record type 7, subtype 14, which gives the real length
939 of each very long string. Rearranges DICT accordingly. */
941 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
942 struct dictionary *dict)
944 struct variable_to_value_map *map;
945 struct variable *var;
951 map = open_variable_to_value_map (r, size * count);
952 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
955 long length, remaining_length;
959 length = strtol (length_s, NULL, 10);
960 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
962 sys_warn (r, _("%s listed as string of length %s "
964 var_get_name (var), length_s);
968 /* Group multiple variables into single variable
969 and delete all but the first. */
970 remaining_length = length;
971 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
972 if (idx < dict_get_var_cnt (dict))
973 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
974 EFFECTIVE_LONG_STRING_LENGTH);
976 sys_error (r, _("Very long string %s overflows dictionary."),
978 dict_delete_consecutive_vars (dict,
979 var_get_dict_index (var) + 1,
980 idx - var_get_dict_index (var) - 1);
982 /* Assign all the length to the first variable. */
983 var_set_width (var, length);
985 close_variable_to_value_map (r, map);
986 dict_compact_values (dict);
/* Reads value labels from system file R and inserts them into
   DICT. */
992 read_value_labels (struct sfm_reader *r,
993 struct dictionary *dict, struct variable **var_by_value_idx)
995 struct pool *subpool;
999 char raw_value[8]; /* Value as uninterpreted bytes. */
1000 union value value; /* Value. */
1001 char *label; /* Null-terminated label string. */
1004 struct label *labels = NULL;
1005 int label_cnt; /* Number of labels. */
1007 struct variable **var = NULL; /* Associated variables. */
1008 int var_cnt; /* Number of associated variables. */
1012 subpool = pool_create_subpool (r->pool);
1014 /* Read the type 3 record and record its contents. We can't do
1015 much with the data yet because we don't know whether it is
1016 of numeric or string type. */
1018 /* Read number of labels. */
1019 label_cnt = read_int32 (r);
1021 if (label_cnt >= INT32_MAX / sizeof *labels)
1023 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1028 /* Read each value/label tuple into labels[]. */
1029 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1030 for (i = 0; i < label_cnt; i++)
1032 struct label *label = labels + i;
1033 unsigned char label_len;
1037 read_bytes (r, label->raw_value, sizeof label->raw_value);
1039 /* Read label length. */
1040 read_bytes (r, &label_len, sizeof label_len);
1041 padded_len = ROUND_UP (label_len + 1, 8);
1043 /* Read label, padding. */
1044 label->label = pool_alloc (subpool, padded_len + 1);
1045 read_bytes (r, label->label, padded_len - 1);
1046 label->label[label_len] = 0;
1049 /* Now, read the type 4 record that has the list of variables
1050 to which the value labels are to be applied. */
1052 /* Read record type of type 4 record. */
1053 if (read_int32 (r) != 4)
1054 sys_error (r, _("Variable index record (type 4) does not immediately "
1055 "follow value label record (type 3) as it should."));
1057 /* Read number of variables associated with value label from type 4
1059 var_cnt = read_int32 (r);
1060 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1061 sys_error (r, _("Number of variables associated with a value label (%d) "
1062 "is not between 1 and the number of variables (%u)."),
1063 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1065 /* Read the list of variables. */
1066 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1067 for (i = 0; i < var_cnt; i++)
1069 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1070 if (var_is_long_string (var[i]))
1071 sys_error (r, _("Value labels are not allowed on long string "
1072 "variables (%s)."), var_get_name (var[i]));
1075 /* Type check the variables. */
1076 for (i = 1; i < var_cnt; i++)
1077 if (var_get_type (var[i]) != var_get_type (var[0]))
1078 sys_error (r, _("Variables associated with value label are not all of "
1079 "identical type. Variable %s is %s, but variable "
1081 var_get_name (var[0]),
1082 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1083 var_get_name (var[i]),
1084 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1086 /* Fill in labels[].value, now that we know the desired type. */
1087 for (i = 0; i < label_cnt; i++)
1089 struct label *label = labels + i;
1091 if (var_is_alpha (var[0]))
1092 buf_copy_rpad (label->value.s, sizeof label->value.s,
1093 label->raw_value, sizeof label->raw_value);
1095 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1098 /* Assign the `value_label's to each variable. */
1099 for (i = 0; i < var_cnt; i++)
1101 struct variable *v = var[i];
1104 /* Add each label to the variable. */
1105 for (j = 0; j < label_cnt; j++)
1107 struct label *label = &labels[j];
1108 if (!var_add_value_label (v, &label->value, label->label))
1110 if (var_is_numeric (var[0]))
1111 sys_warn (r, _("Duplicate value label for %g on %s."),
1112 label->value.f, var_get_name (v));
1114 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1115 var_get_width (v), label->value.s,
1121 pool_destroy (subpool);
1126 static void partial_record (struct sfm_reader *r)
1129 static void read_error (struct casereader *, const struct sfm_reader *);
1132 static bool read_case_number (struct sfm_reader *, double *);
1133 static bool read_case_string (struct sfm_reader *, char *, size_t);
1134 static int read_opcode (struct sfm_reader *);
1135 static bool read_compressed_number (struct sfm_reader *, double *);
1136 static bool read_compressed_string (struct sfm_reader *, char *);
1137 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1139 /* Reads one case from READER's file into C. Returns true only
1142 sys_file_casereader_read (struct casereader *reader, void *r_,
1145 struct sfm_reader *r = r_; /* R_ carries the reader state as an untyped pointer. */
/* Set C up to hold R->value_cnt values before filling it in. */
1149 case_create (c, r->value_cnt);
/* sys_error() longjmp()s to R->bail_out, so a nonzero result here
   means that one of the reads below failed; READER is then put
   into an error state. */
1150 if (setjmp (r->bail_out))
1152 casereader_force_error (reader);
/* The fast path is taken only for uncompressed data, when the host
   double is 8 bytes wide and R->has_vls is unset (presumably "has
   very long strings" -- confirm). */
1157 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1159 /* Fast path. Read the whole case directly. */
/* R->flt64_cnt values of sizeof (union value) bytes each are read
   straight into C's storage. */
1160 if (!try_read_bytes (r, case_data_all_rw (c),
1161 sizeof (union value) * r->flt64_cnt))
/* Report the failed read unless R->case_cnt is -1 (presumably
   meaning that the number of cases is unknown, so that running out
   of data is expected -- confirm). */
1164 if ( r->case_cnt != -1 )
1165 read_error (reader, r);
1169 /* Convert floating point numbers to native format if needed. */
1170 if (r->float_format != FLOAT_NATIVE_DOUBLE)
/* Only entries whose width is 0 hold doubles; each one is converted
   in place (source and destination are the same pointer). */
1174 for (i = 0; i < r->var_cnt; i++)
1175 if (r->vars[i].width == 0)
1177 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1178 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1185 /* Slow path. Convert from external to internal format. */
1188 for (i = 0; i < r->var_cnt; i++)
1190 struct sfm_var *sv = &r->vars[i];
1191 union value *v = case_data_rw_idx (c, sv->case_index);
/* A number is decoded directly into V's double. */
1195 if (!read_case_number (r, &v->f))
1200 /* Read the string data in segments up to 255 bytes
1201 at a time, packed into 8-byte units. */
1202 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1203 int ofs, chunk_size;
1204 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
/* The final segment may be shorter than max_chunk. */
1206 chunk_size = MIN (max_chunk, sv->width - ofs);
1207 if (!read_case_string (r, v->s + ofs, chunk_size))
1215 /* Very long strings have trailing wasted space
1216 that we must skip. */
1217 if (sv->width >= MIN_VERY_LONG_STRING)
/* Bytes consumed by the loop above: every full max_chunk-byte
   segment accounts for 256 bytes of file data, and a final partial
   segment is rounded up to a multiple of 8.  Whatever
   sfm_width_to_bytes() reports beyond that is padding. */
1219 int bytes_read = (sv->width / max_chunk * 256
1220 + ROUND_UP (sv->width % max_chunk, 8));
1221 int total_bytes = sfm_width_to_bytes (sv->width);
1222 int excess_bytes = total_bytes - bytes_read;
/* Discard the padding in pieces no larger than BUFFER. */
1224 while (excess_bytes > 0)
1227 size_t chunk = MIN (sizeof buffer, excess_bytes);
1228 if (!read_whole_strings (r, buffer, chunk))
1230 excess_bytes -= chunk;
/* As on the fast path: report the failure unless R->case_cnt
   is -1. */
1241 if ( r->case_cnt != -1 )
1242 read_error (reader, r);
1247 /* Issues an error that R ends in a partial record.
   The error is raised through sys_error(), which leaves by
   longjmp() to R->bail_out instead of returning. */
1249 partial_record (struct sfm_reader *r)
1251 sys_error (r, _("File ends in partial case."));
/* Reports an error reading a case, naming the file that SFM reads
   from, and puts casereader R into an error state. */
1255 read_error (struct casereader *r, const struct sfm_reader *sfm)
1257 msg (ME, _("Error reading case from file %s"), fh_get_name (sfm->fh));
1258 casereader_force_error (r);
1261 /* Reads a number from R and stores its value in *D.
1262 If R is compressed, reads a compressed number;
1263 otherwise, reads a number in the regular way.
1264 Returns true if successful, false if end of file is
1265 reached immediately. */
1267 read_case_number (struct sfm_reader *r, double *d)
/* Regular way: fetch the raw on-disk bytes, then convert them to a
   host double. */
1272 if (!try_read_bytes (r, flt64, sizeof flt64))
1274 *d = flt64_to_double (r, flt64);
/* Compressed data is decoded by read_compressed_number(). */
1278 return read_compressed_number (r, d);
1281 /* Reads LENGTH string bytes from R into S.
1282 Always reads a multiple of 8 bytes; if LENGTH is not a
1283 multiple of 8, then extra bytes are read and discarded without
being stored in S.
1285 Reads compressed strings if R is compressed.
1286 Returns true if successful, false if end of file is
1287 reached immediately. */
1289 read_case_string (struct sfm_reader *r, char *s, size_t length)
/* Split LENGTH into a part that is a multiple of 8 (WHOLE) and the
   0 to 7 bytes left over (PARTIAL). */
1291 size_t whole = ROUND_DOWN (length, 8);
1292 size_t partial = length % 8;
/* The WHOLE part can be read straight into S. */
1296 if (!read_whole_strings (r, s, whole))
/* The leftover bytes come from one more full read into BOUNCE, of
   which only the first PARTIAL bytes are copied to S. */
1303 if (!read_whole_strings (r, bounce, sizeof bounce))
1309 memcpy (s + whole, bounce, partial);
1315 /* Reads and returns the next compression opcode from R.
   Opcodes are fetched from the file a whole R->opcodes buffer at a
   time and handed out one per call.  R must be compressed. */
1317 read_opcode (struct sfm_reader *r)
1319 assert (r->compressed);
/* Every buffered opcode has been used up: read the next group. */
1323 if (r->opcode_idx >= sizeof r->opcodes)
1325 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
/* Hand out the next buffered opcode and advance past it. */
1329 opcode = r->opcodes[r->opcode_idx++];
1336 /* Reads a compressed number from R and stores its value in *D.
1337 Returns true if successful, false if end of file is
1338 reached immediately. */
/* NOTE(review): the labels that select among the three actions
   below are on lines not visible in this listing. */
1340 read_compressed_number (struct sfm_reader *r, double *d)
1342 int opcode = read_opcode (r);
/* The value follows in the file as a 64-bit floating-point number. */
1350 *d = read_flt64 (r);
/* The opcode is not acceptable here; sys_error() does not return. */
1354 sys_error (r, _("Compressed data is corrupt."));
/* The opcode itself encodes the value, offset by R->bias. */
1361 *d = opcode - r->bias;
1368 /* Reads a compressed 8-byte string segment from R and stores it
in DST.
1370 Returns true if successful, false if end of file is
1371 reached immediately. */
/* NOTE(review): the case labels of the switch are on lines not
   visible in this listing. */
1373 read_compressed_string (struct sfm_reader *r, char *dst)
1375 switch (read_opcode (r))
/* The segment follows in the file as 8 literal bytes. */
1382 read_bytes (r, dst, 8);
/* The segment consists of 8 spaces. */
1386 memset (dst, ' ', 8);
/* The opcode is not acceptable here; sys_error() does not return. */
1390 sys_error (r, _("Compressed data is corrupt."));
1396 /* Reads LENGTH string bytes from R into S.
1397 LENGTH must be a multiple of 8.
1398 Reads compressed strings if R is compressed.
1399 Returns true if successful, false if end of file is
1400 reached immediately. */
1402 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1404 assert (length % 8 == 0);
/* Plain data: a single read fills S. */
1406 return try_read_bytes (r, s, length);
/* Otherwise S is filled one 8-byte segment at a time by
   read_compressed_string(). */
1410 for (ofs = 0; ofs < length; ofs += 8)
1411 if (!read_compressed_string (r, s + ofs))
1421 /* Creates and returns a table that can be used for translating a value
1422 index into a case to a "struct variable *" for DICT. Multiple
1423 system file fields reference variables this way.
1425 This table must be created before processing the very long
1426 string extension record, because that record causes some
1427 values to be deleted from the case and the dictionary to be
compacted. */
1429 static struct variable **
1430 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1432 struct variable **var_by_value_idx;
/* The table has R->flt64_cnt entries and is allocated from R's
   pool. */
1436 var_by_value_idx = pool_nmalloc (r->pool,
1437 r->flt64_cnt, sizeof *var_by_value_idx);
1438 for (i = 0; i < dict_get_var_cnt (dict); i++)
1440 struct variable *v = dict_get_var (dict, i);
/* A numeric variable takes one entry; a string variable takes one
   entry per 8 bytes of width, rounded up. */
1441 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
/* The first entry refers to the variable; any further entries that
   it occupies are null. */
1444 var_by_value_idx[value_idx++] = v;
1445 for (j = 1; j < nv; j++)
1446 var_by_value_idx[value_idx++] = NULL;
/* The variables in DICT must account for every entry exactly. */
1448 assert (value_idx == r->flt64_cnt);
1450 return var_by_value_idx;
1453 /* Returns the "struct variable" corresponding to the given
1454 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
is valid. */
1456 static struct variable *
1457 lookup_var_by_value_idx (struct sfm_reader *r,
1458 struct variable **var_by_value_idx, int value_idx)
1460 struct variable *var;
/* VALUE_IDX counts from 1; anything outside 1...R->flt64_cnt is
   reported through sys_error(), which does not return. */
1462 if (value_idx < 1 || value_idx > r->flt64_cnt)
1463 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1464 value_idx, r->flt64_cnt);
/* The table itself is indexed from 0. */
1466 var = var_by_value_idx[value_idx - 1];
/* Per the message text, this error is for an index that lands on
   the continuation of a long string rather than on the start of a
   variable. */
1468 sys_error (r, _("Variable index %d refers to long string "
1475 /* Returns the variable in D with the given SHORT_NAME,
1476 or a null pointer if there is none.
   Short names are compared without regard to case. */
1477 static struct variable *
1478 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
1480 struct variable *var;
1484 /* First try looking up by full name. This often succeeds. */
1485 var = dict_lookup_var (d, short_name);
/* A variable found under that name counts only if its short name
   matches as well. */
1486 if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
1489 /* Iterate through the whole dictionary as a fallback. */
1490 var_cnt = dict_get_var_cnt (d);
1491 for (i = 0; i < var_cnt; i++)
1493 var = dict_get_var (d, i);
1494 if (!strcasecmp (var_get_short_name (var), short_name))
1501 /* Helpers for reading records that contain "variable=value" pairs. */
/* Parsing state for one record: the record's bytes plus the position
   that has been reached in them. */
1505 struct variable_to_value_map
1507 struct substring buffer; /* Record contents. */
1508 size_t pos; /* Current position in buffer. */
1511 /* Reads SIZE bytes into a "variable=value" map for R,
1512 and returns the map.
   Both the map and its buffer are allocated from R's pool. */
1513 static struct variable_to_value_map *
1514 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1516 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
/* One byte more than SIZE is allocated, presumably so that a null
   terminator can be stored after a value that ends the record --
   confirm against read_variable_to_value_map(). */
1517 char *buffer = pool_malloc (r->pool, size + 1);
/* read_bytes() aborts through sys_error() if SIZE bytes cannot be
   read. */
1518 read_bytes (r, buffer, size);
/* The map's substring covers exactly the SIZE bytes that were read. */
1519 map->buffer = ss_buffer (buffer, size);
1524 /* Closes MAP and frees its storage.
1525 Not really needed, because the pool will free the map anyway,
1526 but can be used to free it earlier. */
1528 close_variable_to_value_map (struct sfm_reader *r,
1529 struct variable_to_value_map *map)
/* Give the record buffer back to R's pool. */
1531 pool_free (r->pool, ss_data (map->buffer));
1534 /* Reads the next variable=value pair from MAP.
1535 Looks up the variable in DICT and stores it into *VAR.
1536 Stores a null-terminated value into *VALUE.
   A pair that names a variable not found in DICT increments
   *WARNING_CNT.  Only the first max_warnings such pairs are reported
   individually; the rest are summarized in a single warning once no
   further pair can be tokenized. */
1538 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1539 struct variable_to_value_map *map,
1540 struct variable **var, char **value,
1543 int max_warnings = 5; /* Limit on individually reported warnings. */
1547 struct substring short_name_ss, value_ss;
/* A pair is a short name ended by "=", followed by a value ended by
   a tab or a null byte.  When no complete pair is left, any warnings
   beyond max_warnings are summarized. */
1549 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1550 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1553 if (*warning_cnt > max_warnings)
1554 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1555 *warning_cnt - max_warnings);
/* Advance past any run of tab or null bytes that follows the value. */
1559 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1560 ss_buffer ("\t\0", 2));
/* Null-terminate the short name in place within MAP's buffer so that
   it can be used as a C string. */
1562 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1563 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
/* Unknown variable: count it, and warn only for the first
   max_warnings occurrences.  This uses the same limit as the
   "Suppressed" summary above so that the two cannot drift apart. */
1566 if (++*warning_cnt <= max_warnings)
1567 sys_warn (r, _("Variable map refers to unknown variable %s."),
1568 ss_data (short_name_ss));
/* Null-terminate the value in place and hand it to the caller. */
1572 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1573 *value = ss_data (value_ss);
1581 /* Displays a corruption message.
   CLASS determines the message's category and severity.  The text
   is R's file name and current file offset followed by FORMAT
   expanded with ARGS. */
1583 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
/* Build the text: a location prefix, then the caller's message. */
1588 ds_init_empty (&text);
1589 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1590 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1591 ds_put_vformat (&text, format, args);
1593 m.category = msg_class_to_category (class);
1594 m.severity = msg_class_to_severity (class);
/* No file name or line number is attached in M.where; the location
   is already part of the text. */
1595 m.where.file_name = NULL;
1596 m.where.line_number = 0;
1597 m.text = ds_cstr (&text);
1602 /* Displays a warning for the current file position.
   FORMAT and the arguments after it are as for printf(). */
1604 sys_warn (struct sfm_reader *r, const char *format, ...)
1608 va_start (args, format);
/* MW is the message class passed to sys_msg() for warnings. */
1609 sys_msg (r, MW, format, args);
1613 /* Displays an error for the current file position,
1614 marks it as in an error state,
1615 and aborts reading it using longjmp.
   FORMAT and the arguments after it are as for printf(). */
1617 sys_error (struct sfm_reader *r, const char *format, ...)
1621 va_start (args, format);
/* ME is the message class passed to sys_msg() for errors. */
1622 sys_msg (r, ME, format, args);
/* Control resumes at the setjmp() call that filled in R->bail_out,
   which then sees a return value of 1. */
1626 longjmp (r->bail_out, 1);
1629 /* Reads BYTE_CNT bytes into BUF.
1630 Returns true if exactly BYTE_CNT bytes are successfully read.
1631 Aborts if an I/O error or a partial read occurs.
1632 If EOF_IS_OK, then an immediate end-of-file causes false to be
1633 returned; otherwise, immediate end-of-file causes an abort
too. */
1636 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1637 void *buf, size_t byte_cnt)
/* An item size of 1 makes fread() report the number of bytes read. */
1639 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1640 if (bytes_read == byte_cnt)
/* Short read caused by an I/O error. */
1642 else if (ferror (r->file))
1643 sys_error (r, _("System error: %s."), strerror (errno));
/* Short read without an I/O error, that is, end of file.  This is
   fatal unless EOF_IS_OK and not a single byte was read. */
1644 else if (!eof_is_ok || bytes_read != 0)
1645 sys_error (r, _("Unexpected end of file."));
1650 /* Reads BYTE_CNT bytes into BUF.
1651 Aborts upon I/O error or if end-of-file is encountered. */
1653 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* Passing false for EOF_IS_OK makes any end of file fatal. */
1655 read_bytes_internal (r, false, buf, byte_cnt);
1658 /* Reads BYTE_CNT bytes into BUF.
1659 Returns true if exactly BYTE_CNT bytes are successfully read.
1660 Returns false if an immediate end-of-file is encountered.
1661 Aborts if an I/O error or a partial read occurs. */
1663 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
/* Passing true for EOF_IS_OK tolerates end of file before the first
   byte. */
1665 return read_bytes_internal (r, true, buf, byte_cnt);
1668 /* Reads a 32-bit signed integer from R and returns its value in host format. */
1671 read_int32 (struct sfm_reader *r)
/* Fetch the raw on-disk bytes (read_bytes() aborts on failure), then
   convert them from R's integer format. */
1674 read_bytes (r, int32, sizeof int32);
1675 return int32_to_native (r, int32);
1678 /* Reads a 64-bit floating-point number from R and returns its
1679 value in host format. */
1681 read_flt64 (struct sfm_reader *r)
/* Fetch the raw on-disk bytes (read_bytes() aborts on failure), then
   convert them from R's floating-point format. */
1684 read_bytes (r, flt64, sizeof flt64);
1685 return flt64_to_double (r, flt64);
1688 /* Reads exactly SIZE - 1 bytes into BUFFER
1689 and stores a null byte into BUFFER[SIZE - 1].
   SIZE therefore counts the terminator and must be nonzero, because
   SIZE - 1 is computed in size_t and would otherwise wrap around. */
1691 read_string (struct sfm_reader *r, char *buffer, size_t size)
1694 read_bytes (r, buffer, size - 1);
1695 buffer[size - 1] = '\0';
1698 /* Skips BYTES bytes forward in R.
   The data is read into a scratch buffer, at most sizeof buffer
   bytes per read, rather than being skipped by seeking. */
1700 skip_bytes (struct sfm_reader *r, size_t bytes)
1705 size_t chunk = MIN (sizeof buffer, bytes);
1706 read_bytes (r, buffer, chunk);
1711 /* Returns the value of the 32-bit signed integer at INT32,
1712 converted from the format used by R to the host format. */
1714 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
/* Same format as the host: the bytes can be copied as they are. */
1717 if (r->integer_format == INTEGER_NATIVE)
1718 memcpy (&x, int32, sizeof x);
/* Otherwise decode the bytes according to R's integer format. */
1720 x = integer_get (r->integer_format, int32, sizeof x);
1724 /* Returns the value of the 64-bit floating point number at
1725 FLT64, converted from the format used by R to the host
format. */
1728 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
/* Same format as the host: the bytes can be copied as they are. */
1731 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1732 memcpy (&x, flt64, sizeof x);
/* Otherwise convert from R's floating-point format to the host's. */
1734 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);
1738 static struct casereader_class sys_file_casereader_class =
1740 sys_file_casereader_read,
1741 sys_file_casereader_destroy,