1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 #include <data/sys-file-reader.h>
22 #include <data/sys-file-private.h>
30 #include <libpspp/alloc.h>
31 #include <libpspp/assertion.h>
32 #include <libpspp/message.h>
33 #include <libpspp/compiler.h>
34 #include <libpspp/magic.h>
35 #include <libpspp/misc.h>
36 #include <libpspp/pool.h>
37 #include <libpspp/str.h>
38 #include <libpspp/hash.h>
39 #include <libpspp/array.h>
41 #include <data/case.h>
42 #include <data/casereader-provider.h>
43 #include <data/casereader.h>
44 #include <data/dictionary.h>
45 #include <data/file-handle-def.h>
46 #include <data/file-name.h>
47 #include <data/format.h>
48 #include <data/missing-values.h>
49 #include <data/value-labels.h>
50 #include <data/variable.h>
51 #include <data/value.h>
56 #include "unlocked-io.h"
60 #define _(msgid) gettext (msgid)
61 #define N_(msgid) (msgid)
63 /* System file reader.
   State shared by the dictionary-reading code (sfm_open_reader and
   its helpers) and the case-reading code (sys_file_casereader_read
   and its helpers). */
66 /* Resource tracking. */
67 struct pool *pool; /* All system file state. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
/* Underlying file and its status. */
71 struct file_handle *fh; /* File handle. */
72 FILE *file; /* File stream. */
73 bool error; /* I/O or corruption error? */
74 size_t value_cnt; /* Number of "union value"s in struct case. */
/* On-disk layout.  The two formats are identified by read_header;
   vars and var_cnt are filled in at the end of sfm_open_reader. */
77 enum integer_format integer_format; /* On-disk integer format. */
78 enum float_format float_format; /* On-disk floating point format. */
79 int flt64_cnt; /* Number of 8-byte units per case. */
80 struct sfm_var *vars; /* Variables (width and case index of each). */
81 size_t var_cnt; /* Number of variables (elements in vars). */
82 bool has_long_var_names; /* File has a long variable name map */
83 bool has_vls; /* File has one or more very long strings? */
/* Decompression state, used by read_opcode. */
86 bool compressed; /* File is compressed? */
87 double bias; /* Compression bias, usually 100.0. */
88 uint8_t opcodes[8]; /* Current block of opcodes. */
89 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
92 /* A variable in a system file.
   sfm_open_reader copies these two properties out of the dictionary
   so that reading cases does not depend on the dictionary, which the
   caller owns and may modify or destroy. */
95 int width; /* 0=numeric, otherwise string width. */
96 int case_index; /* Index into case. */
/* Class passed to casereader_create_sequential by sfm_open_reader. */
99 static struct casereader_class sys_file_casereader_class;
101 static bool close_reader (struct sfm_reader *);
/* Mapping between on-disk value indexes and dictionary variables. */
103 static struct variable **make_var_by_value_idx (struct sfm_reader *,
104 struct dictionary *);
105 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
/* Diagnostics.  Both take a printf-style format string. */
109 static void sys_warn (struct sfm_reader *, const char *, ...)
110 PRINTF_FORMAT (2, 3);
112 static void sys_error (struct sfm_reader *, const char *, ...)
/* Low-level input from the file stream. */
116 static void read_bytes (struct sfm_reader *, void *, size_t);
117 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
118 static int32_t read_int32 (struct sfm_reader *);
119 static double read_flt64 (struct sfm_reader *);
120 static void read_string (struct sfm_reader *, char *, size_t);
121 static void skip_bytes (struct sfm_reader *, size_t);
/* Conversion of raw on-disk numbers to native representation. */
123 static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
124 static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
/* Parsing of the variable-to-value text maps used by
   read_long_var_name_map and read_long_string_map. */
126 static struct variable_to_value_map *open_variable_to_value_map (
127 struct sfm_reader *, size_t size);
128 static void close_variable_to_value_map (struct sfm_reader *r,
129 struct variable_to_value_map *);
130 static bool read_variable_to_value_map (struct sfm_reader *,
132 struct variable_to_value_map *,
133 struct variable **var, char **value,
/* NOTE(review): close_reader is already declared above; this second
   declaration is redundant. */
136 static bool close_reader (struct sfm_reader *r);
138 /* Dictionary reader.
   One helper per kind of dictionary record; all are driven by
   sfm_open_reader. */
146 static void read_header (struct sfm_reader *, struct dictionary *,
147 int *weight_idx, int *claimed_flt64_cnt,
148 struct sfm_read_info *);
149 static void read_variable_record (struct sfm_reader *, struct dictionary *,
150 int *format_warning_cnt);
151 static void parse_format_spec (struct sfm_reader *, uint32_t,
152 enum which_format, struct variable *,
153 int *format_warning_cnt);
154 static void setup_weight (struct sfm_reader *, int weight_idx,
155 struct variable **var_by_value_idx,
156 struct dictionary *);
157 static void read_documents (struct sfm_reader *, struct dictionary *);
158 static void read_value_labels (struct sfm_reader *, struct dictionary *,
159 struct variable **var_by_value_idx);
/* Type 7 (extension) record and its subtypes. */
161 static void read_extension_record (struct sfm_reader *, struct dictionary *);
162 static void read_machine_int32_info (struct sfm_reader *,
163 size_t size, size_t count);
164 static void read_machine_flt64_info (struct sfm_reader *,
165 size_t size, size_t count);
166 static void read_display_parameters (struct sfm_reader *,
167 size_t size, size_t count,
168 struct dictionary *);
169 static void read_long_var_name_map (struct sfm_reader *,
170 size_t size, size_t count,
171 struct dictionary *);
172 static void read_long_string_map (struct sfm_reader *,
173 size_t size, size_t count,
174 struct dictionary *);
177 /* Opens the system file designated by file handle FH for
178 reading. Reads the system file's dictionary into *DICT.
179 If INFO is non-null, then it receives additional info about the
system file (it is handed to read_header, which fills it in).

On success, returns the casereader that
casereader_create_sequential builds around the new reader.
Errors detected while parsing transfer control to the setjmp
below, which destroys *DICT.
NOTE(review): confirm what the failure path does after destroying
*DICT (closing the reader, value returned). */
182 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
183 struct sfm_read_info *info)
185 struct sfm_reader *volatile r = NULL;
186 struct variable **var_by_value_idx;
187 int format_warning_cnt = 0;
189 int claimed_flt64_cnt;
193 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
196 *dict = dict_create ();
198 /* Create and initialize reader. */
199 r = pool_create_container (struct sfm_reader, pool);
201 r->file = fn_open (fh_get_file_name (fh), "rb");
205 r->has_long_var_names = false;
/* An index equal to the buffer size means no opcodes are buffered,
   so the first read_opcode call refills the buffer. */
206 r->opcode_idx = sizeof r->opcodes;
/* Target of every longjmp (r->bail_out, ...) during dictionary
   reading. */
208 if (setjmp (r->bail_out))
211 dict_destroy (*dict);
218 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
219 fh_get_file_name (r->fh), strerror (errno));
220 longjmp (r->bail_out, 1);
224 read_header (r, *dict, &weight_idx, &claimed_flt64_cnt, info);
226 /* Read all the variable definition records. */
227 rec_type = read_int32 (r);
228 while (rec_type == 2)
230 read_variable_record (r, *dict, &format_warning_cnt);
231 rec_type = read_int32 (r);
234 /* Figure out the case format. */
235 var_by_value_idx = make_var_by_value_idx (r, *dict);
236 setup_weight (r, weight_idx, var_by_value_idx, *dict);
238 /* Read all the rest of the dictionary records.
   Record type 999 terminates the dictionary. */
239 while (rec_type != 999)
244 read_value_labels (r, *dict, var_by_value_idx);
248 sys_error (r, _("Misplaced type 4 record."));
251 read_documents (r, *dict);
255 read_extension_record (r, *dict);
259 sys_error (r, _("Unrecognized record type %d."), rec_type);
261 rec_type = read_int32 (r);
/* No long variable name map was read, so derive each long name by
   lowercasing the short name. */
265 if ( ! r->has_long_var_names )
268 for (i = 0; i < dict_get_var_cnt (*dict); i++)
270 struct variable *var = dict_get_var (*dict, i);
271 char short_name [SHORT_NAME_LEN + 1];
272 char long_name [SHORT_NAME_LEN + 1];
274 strcpy (short_name, var_get_name (var));
276 strcpy (long_name, short_name);
277 str_lowercase (long_name);
279 /* Set long name. Renaming a variable may clear the short
280 name, but we want to retain it, so re-set it
explicitly. */
282 dict_rename_var (*dict, var, long_name);
283 var_set_short_name (var, short_name);
286 r->has_long_var_names = true;
289 /* Read record 999 data, which is just filler. */
/* A claimed count of -1 means the header value was implausible and
   was discarded by read_header, so there is nothing to compare. */
292 if (claimed_flt64_cnt != -1 && claimed_flt64_cnt != r->flt64_cnt)
293 sys_warn (r, _("File header claims %d variable positions but "
294 "%d were read from file."),
295 claimed_flt64_cnt, r->flt64_cnt);
297 /* Create an index of dictionary variable widths for
298 sfm_read_case to use. We cannot use the `struct variable's
299 from the dictionary we created, because the caller owns the
300 dictionary and may destroy or modify its variables. */
301 r->var_cnt = dict_get_var_cnt (*dict);
302 r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
303 for (i = 0; i < r->var_cnt; i++)
305 struct variable *v = dict_get_var (*dict, i);
306 struct sfm_var *sv = &r->vars[i];
307 sv->width = var_get_width (v);
308 sv->case_index = var_get_case_index (v);
/* The value-index lookup table is needed only while reading the
   dictionary. */
311 pool_free (r->pool, var_by_value_idx);
312 r->value_cnt = dict_get_next_value_idx (*dict);
313 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
314 &sys_file_casereader_class, r);
317 /* Closes a system file after we're done with it.
318 Returns true if an I/O error has occurred on READER, false
otherwise.
NOTE(review): confirm the polarity of the return value against the
function's return statement.

Closes the file stream (reporting a failure to close), releases the
file handle, and destroys R's pool. */
321 close_reader (struct sfm_reader *r)
330 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
332 msg (ME, _("Error closing system file \"%s\": %s."),
333 fh_get_file_name (r->fh), strerror (errno));
/* The mode string matches the one given to fh_open in
   sfm_open_reader. */
340 fh_close (r->fh, "system file", "rs");
/* R is created by pool_create_container in sfm_open_reader;
   presumably R itself lives in this pool and must not be used after
   this point -- confirm. */
343 pool_destroy (r->pool);
348 /* Destroys READER.
   R_ is the struct sfm_reader that sfm_open_reader handed to
   casereader_create_sequential as auxiliary data; READER itself is
   not used. */
350 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
352 struct sfm_reader *r = r_;
/* Reports whether FILE looks like an SPSS system file.

   Reads four bytes from FILE at its current position and returns
   true only if they are exactly "$FL2", the record type with
   which a system file begins.  Returns false if fewer than four
   bytes could be read or if they spell anything else.  The bytes
   read are consumed; the stream is not repositioned. */
bool
sfm_detect (FILE *file)
{
  char signature[4];

  if (fread (signature, sizeof signature, 1, file) != 1)
    return false;

  return memcmp (signature, "$FL2", sizeof signature) == 0;
}
370 /* Reads the global header of the system file.
371 Sets DICT's file label to the system file's label.
372 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
373 or to the value index of the weight variable otherwise.
374 Sets *CLAIMED_FLT64_CNT to the number of values that the file
375 claims to have (although it is not always correct).
376 If INFO is non-null, initializes *INFO with header
information.

Also records the file's integer format, floating-point format,
compression flag and compression bias in R.  *CLAIMED_FLT64_CNT
is set to -1 when the value in the file is negative or larger
than INT_MAX / 16. */
379 read_header (struct sfm_reader *r, struct dictionary *dict,
380 int *weight_idx, int *claimed_flt64_cnt,
381 struct sfm_read_info *info)
384 char eye_catcher[61];
385 uint8_t raw_layout_code[4];
388 char creation_date[10];
389 char creation_time[9];
391 struct substring file_label_ss;
393 read_string (r, rec_type, sizeof rec_type);
394 read_string (r, eye_catcher, sizeof eye_catcher);
/* The file must begin with the record type "$FL2". */
396 if (strcmp ("$FL2", rec_type) != 0)
397 sys_error (r, _("This is not an SPSS system file."));
399 /* Identify integer format.
   The layout code must decode as 2 or 3 in some byte order, and
   that byte order must be plain big- or little-endian. */
400 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
401 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
403 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
405 || (r->integer_format != INTEGER_MSB_FIRST
406 && r->integer_format != INTEGER_LSB_FIRST))
407 sys_error (r, _("This is not an SPSS system file."));
409 *claimed_flt64_cnt = read_int32 (r);
/* Treat an implausible count as unknown. */
410 if (*claimed_flt64_cnt < 0 || *claimed_flt64_cnt > INT_MAX / 16)
411 *claimed_flt64_cnt = -1;
413 r->compressed = read_int32 (r) != 0;
415 *weight_idx = read_int32 (r);
417 case_cnt = read_int32 (r);
418 if (case_cnt < -1 || case_cnt > INT_MAX / 2)
421 /* Identify floating-point format and obtain compression bias.
   The format is found by asking which format makes the raw bias
   bytes decode to 100.0. */
422 read_bytes (r, raw_bias, sizeof raw_bias);
423 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
425 sys_warn (r, _("Compression bias (%g) is not the usual "
426 "value of 100, or system file uses unrecognized "
427 "floating-point format."),
/* Fall back to IEEE double in the byte order already found for
   integers. */
429 if (r->integer_format == INTEGER_MSB_FIRST)
430 r->float_format = FLOAT_IEEE_DOUBLE_BE;
432 r->float_format = FLOAT_IEEE_DOUBLE_LE;
434 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
436 read_string (r, creation_date, sizeof creation_date);
437 read_string (r, creation_time, sizeof creation_time);
438 read_string (r, file_label, sizeof file_label);
/* Strip spaces from the label; set it only if something remains. */
441 file_label_ss = ss_cstr (file_label);
442 ss_trim (&file_label_ss, ss_cstr (" "));
443 if (!ss_is_empty (file_label_ss))
/* Terminate the trimmed label in place so that it can be passed
   as a C string. */
445 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
446 dict_set_label (dict, ss_data (file_label_ss));
451 struct substring product;
453 strcpy (info->creation_date, creation_date);
454 strcpy (info->creation_time, creation_time);
455 info->integer_format = r->integer_format;
456 info->float_format = r->float_format;
457 info->compressed = r->compressed;
458 info->case_cnt = case_cnt;
/* The product name is the eye-catcher with the fixed
   "@(#) SPSS DATA FILE" prefix and surrounding spaces removed. */
460 product = ss_cstr (eye_catcher);
461 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
462 ss_trim (&product, ss_cstr (" "));
463 str_copy_buf_trunc (info->product, sizeof info->product,
464 ss_data (product), ss_length (product));
468 /* Reads a variable (type 2) record from R and adds the
469 corresponding variable to DICT.
470 Also skips past additional variable records for long string
variables.

*FORMAT_WARNING_CNT is passed through to parse_format_spec, which
uses it to limit the number of invalid-format warnings. */
473 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
474 int *format_warning_cnt)
477 int has_variable_label;
478 int missing_value_code;
483 struct variable *var;
486 width = read_int32 (r);
487 has_variable_label = read_int32 (r);
488 missing_value_code = read_int32 (r);
489 print_format = read_int32 (r);
490 write_format = read_int32 (r);
491 read_string (r, name, sizeof name);
/* Cut the name at the first space. */
492 name[strcspn (name, " ")] = '\0';
494 /* Check variable name. */
/* NOTE(review): unlike the other messages here, the next one is not
   wrapped in _() for translation. */
495 if (name[0] == '$' || name[0] == '#')
496 sys_error (r, "Variable name begins with invalid character `%c'.",
498 if (!var_is_plausible_name (name, false))
499 sys_error (r, _("Invalid variable name `%s'."), name);
501 /* Create variable. */
502 if (width < 0 || width > 255)
503 sys_error (r, _("Bad variable width %d."), width);
504 var = dict_create_var (dict, name, width);
507 _("Duplicate variable name `%s' within system file."),
510 /* Set the short name the same as the long name */
511 var_set_short_name (var, var_get_name (var));
513 /* Get variable label, if any. */
514 if (has_variable_label != 0 && has_variable_label != 1)
515 sys_error (r, _("Variable label indicator field is not 0 or 1."));
516 if (has_variable_label == 1)
521 len = read_int32 (r);
522 if (len >= sizeof label)
523 sys_error (r, _("Variable %s has label of invalid length %u."),
524 name, (unsigned int) len);
525 read_string (r, label, len + 1);
526 var_set_label (var, label);
/* The label is padded on disk to a multiple of 4 bytes. */
528 skip_bytes (r, ROUND_UP (len, 4) - len);
531 /* Set missing values.
   A positive code is the number of discrete missing values.  A
   negative code means a range: -2 is a range alone, -3 is a range
   plus one discrete value. */
532 if (missing_value_code < -3 || missing_value_code > 3
533 || missing_value_code == -1)
534 sys_error (r, _("Missing value indicator field is not "
535 "-3, -2, 0, 1, 2, or 3."));
536 if (missing_value_code != 0)
538 struct missing_values mv;
539 mv_init (&mv, var_get_width (var));
540 if (var_is_numeric (var))
542 if (missing_value_code > 0)
545 for (i = 0; i < missing_value_code; i++)
546 mv_add_num (&mv, read_flt64 (r));
550 double low = read_flt64 (r);
551 double high = read_flt64 (r);
552 mv_add_num_range (&mv, low, high);
553 if (missing_value_code == -3)
554 mv_add_num (&mv, read_flt64 (r));
557 else if (var_get_width (var) <= MAX_SHORT_STRING)
559 if (missing_value_code > 0)
562 for (i = 0; i < missing_value_code; i++)
565 read_string (r, string, sizeof string);
566 mv_add_str (&mv, string);
570 sys_error (r, _("String variable %s may not have missing "
571 "values specified as a range."),
574 else /* var->width > MAX_SHORT_STRING */
575 sys_error (r, _("Long string variable %s may not have missing "
578 var_set_missing_values (var, &mv);
582 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
583 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
585 /* Account for values.
586 Skip long string continuation records, if any. */
/* A numeric variable occupies one 8-byte value; a string occupies
   one value per 8 bytes of width, rounded up. */
587 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
/* Each value after the first must have its own continuation
   record. */
593 for (i = 1; i < nv; i++)
595 /* Check for record type 2 and width -1. */
596 if (read_int32 (r) != 2 || read_int32 (r) != -1)
597 sys_error (r, _("Missing string continuation record."));
599 /* Skip and ignore remaining continuation data. */
600 has_variable_label = read_int32 (r);
601 missing_value_code = read_int32 (r);
602 print_format = read_int32 (r);
603 write_format = read_int32 (r);
604 read_string (r, name, sizeof name);
606 /* Variable label fields on continuation records have
607 been spotted in system files created by "SPSS Power
608 Macintosh Release 6.1". */
609 if (has_variable_label)
610 skip_bytes (r, ROUND_UP (read_int32 (r), 4));
615 /* Translates the format spec from sysfile format to internal
618 parse_format_spec (struct sfm_reader *r, uint32_t s,
619 enum which_format which, struct variable *v,
620 int *format_warning_cnt)
622 const int max_format_warnings = 8;
624 uint8_t raw_type = s >> 16;
630 if (!fmt_from_io (raw_type, &f.type))
631 sys_error (r, _("Unknown variable format %d."), (int) raw_type);
636 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
641 if (which == PRINT_FORMAT)
642 var_set_print_format (v, &f);
644 var_set_write_format (v, &f);
646 else if (*++format_warning_cnt <= max_format_warnings)
648 char fmt_string[FMT_STRING_LEN_MAX + 1];
649 sys_warn (r, _("%s variable %s has invalid %s format %s."),
650 var_is_numeric (v) ? _("Numeric") : _("String"),
652 which == PRINT_FORMAT ? _("print") : _("write"),
653 fmt_to_string (&f, fmt_string));
655 if (*format_warning_cnt == max_format_warnings)
656 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Makes the variable at the 1-based value index WEIGHT_IDX the
   weighting variable of DICT.  A WEIGHT_IDX of zero means that
   the file is unweighted, in which case DICT is left alone.

   VAR_BY_VALUE_IDX is the table used to resolve WEIGHT_IDX to a
   variable.  A weighting variable that is not numeric is
   reported with sys_error. */
static void
setup_weight (struct sfm_reader *r, int weight_idx,
              struct variable **var_by_value_idx, struct dictionary *dict)
{
  struct variable *candidate;

  if (weight_idx == 0)
    return;

  candidate = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
  if (!var_is_numeric (candidate))
    sys_error (r, _("Weighting variable must be numeric."));
  else
    dict_set_weight (dict, candidate);
}
678 /* Reads a document record, type 6, from system file R, and sets up
679 the documents and n_documents fields in the associated
682 read_documents (struct sfm_reader *r, struct dictionary *dict)
687 if (dict_get_documents (dict) != NULL)
688 sys_error (r, _("Multiple type 6 (document) records."));
690 line_cnt = read_int32 (r);
692 sys_error (r, _("Number of document lines (%d) "
693 "must be greater than 0."), line_cnt);
695 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
696 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
697 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
698 dict_set_documents (dict, documents);
700 sys_error (r, _("Document line contains null byte."));
701 pool_free (r->pool, documents);
704 /* Read a type 7 extension record.
   The record starts with three int32 fields: the subtype, the size
   of each data element, and the number of elements.  Recognised
   subtypes are passed to their own readers (3: machine integer
   info, 4: machine floating-point info, 11: display parameters,
   13: long variable name map, 14: very long string map).  Payloads
   that are not interpreted are skipped, SIZE * COUNT bytes in all. */
706 read_extension_record (struct sfm_reader *r, struct dictionary *dict)
708 int subtype = read_int32 (r);
/* The int32 values read here are converted to size_t. */
709 size_t size = read_int32 (r);
710 size_t count = read_int32 (r);
711 size_t bytes = size * count;
713 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
714 allows an extra byte for a null terminator, used by some
715 extension processing routines. */
716 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
717 sys_error (r, "Record type 7 subtype %d too large.", subtype);
722 read_machine_int32_info (r, size, count);
726 read_machine_flt64_info (r, size, count);
730 /* Variable sets information. We don't use these yet.
731 They only apply to GUIs; see VARSETS on the APPLY
732 DICTIONARY command in SPSS documentation. */
736 /* DATE variable information. We don't use it yet, but we
741 /* Unknown purpose. */
745 read_display_parameters (r, size, count, dict);
749 read_long_var_name_map (r, size, count, dict);
753 read_long_string_map (r, size, count, dict);
757 /* New in SPSS v14? Unknown purpose. */
761 /* Text field that defines variable attributes. New in
766 sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
770 skip_bytes (r, bytes);
773 /* Read record type 7, subtype 3. */
775 read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
777 int version_major UNUSED = read_int32 (r);
778 int version_minor UNUSED = read_int32 (r);
779 int version_revision UNUSED = read_int32 (r);
780 int machine_code UNUSED = read_int32 (r);
781 int float_representation = read_int32 (r);
782 int compression_code UNUSED = read_int32 (r);
783 int integer_representation = read_int32 (r);
784 int character_code UNUSED = read_int32 (r);
786 int expected_float_format;
787 int expected_integer_format;
789 if (size != 4 || count != 8)
790 sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
792 (unsigned int) size, (unsigned int) count);
794 /* Check floating point format. */
795 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
796 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
797 expected_float_format = 1;
798 else if (r->float_format == FLOAT_Z_LONG)
799 expected_float_format = 2;
800 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
801 expected_float_format = 3;
804 if (float_representation != expected_float_format)
805 sys_error (r, _("Floating-point representation indicated by "
806 "system file (%d) differs from expected (%d)."),
807 r->float_format, expected_float_format);
809 /* Check integer format. */
810 if (r->integer_format == INTEGER_MSB_FIRST)
811 expected_integer_format = 1;
812 else if (r->integer_format == INTEGER_LSB_FIRST)
813 expected_integer_format = 2;
816 if (integer_representation != expected_integer_format)
818 static const char *endian[] = {N_("little-endian"), N_("big-endian")};
819 sys_warn (r, _("Integer format indicated by system file (%s) "
820 "differs from expected (%s)."),
821 gettext (endian[integer_representation == 1]),
822 gettext (endian[expected_integer_format == 1]));
826 /* Read record type 7, subtype 4. */
828 read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
830 double sysmis = read_flt64 (r);
831 double highest = read_flt64 (r);
832 double lowest = read_flt64 (r);
834 if (size != 8 || count != 3)
835 sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
836 (unsigned int) size, (unsigned int) count);
838 if (sysmis != SYSMIS)
839 sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
840 if (highest != HIGHEST)
841 sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
842 if (lowest != LOWEST)
843 sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
846 /* Read record type 7, subtype 11, which specifies how variables
847 should be displayed in GUI environments.
   The record holds three int32 values per variable (measurement
   level, display width, alignment), for the variables of DICT in
   dictionary order.  COUNT must therefore be exactly three times
   the number of variables in DICT. */
849 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
850 struct dictionary *dict)
852 const size_t n_vars = count / 3 ;
/* Only COUNT is checked here; SIZE appears in the message only. */
856 if (count % 3 || n_vars != dict_get_var_cnt (dict))
857 sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
858 (unsigned int) size, (unsigned int) count);
860 for (i = 0; i < n_vars; ++i)
862 int measure = read_int32 (r);
863 int width = read_int32 (r);
864 int align = read_int32 (r);
865 struct variable *v = dict_get_var (dict, i);
867 /* spss v14 sometimes seems to set string variables' measure to zero */
868 if ( 0 == measure && var_is_alpha (v) ) measure = 1;
/* Valid codes are 1 to 3 for measure and 0 to 2 for alignment. */
871 if (measure < 1 || measure > 3 || align < 0 || align > 2)
874 sys_warn (r, _("Invalid variable display parameters. "
875 "Default parameters substituted."));
/* Translate the on-disk codes to the internal enumerations. */
880 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
881 : measure == 2 ? MEASURE_ORDINAL
883 var_set_display_width (v, width);
884 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
885 : align == 1 ? ALIGN_RIGHT
890 /* Reads record type 7, subtype 13, which gives the long name
891 that corresponds to each short name. Modifies variable names
892 in DICT accordingly.
   SIZE * COUNT is the length of the map in bytes.  Entries whose
   long name is invalid, or duplicates another variable's name,
   draw a warning.  On return R is marked as having long variable
   names, so sfm_open_reader does not synthesize them. */
894 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
895 struct dictionary *dict)
897 struct variable_to_value_map *map;
898 struct variable *var;
902 map = open_variable_to_value_map (r, size * count);
903 while (read_variable_to_value_map (r, dict, map, &var, &long_name,
906 char short_name[SHORT_NAME_LEN + 1];
/* Save the short name now; the rename below may clear it. */
907 strcpy (short_name, var_get_short_name (var));
909 /* Validate long name. */
910 if (!var_is_valid_name (long_name, false))
912 sys_warn (r, _("Long variable mapping from %s to invalid "
913 "variable name `%s'."),
914 var_get_name (var), long_name);
918 /* Identify any duplicates.
   A long name that differs from the short name only in case is
   not treated as a duplicate. */
919 if (strcasecmp (short_name, long_name)
920 && dict_lookup_var (dict, long_name) != NULL)
922 sys_warn (r, _("Duplicate long variable name `%s' "
923 "within system file."), long_name);
927 /* Set long name. Renaming a variable may clear the short
928 name, but we want to retain it, so re-set it
explicitly. */
930 dict_rename_var (dict, var, long_name);
931 var_set_short_name (var, short_name);
933 close_variable_to_value_map (r, map);
934 r->has_long_var_names = true;
937 /* Reads record type 7, subtype 14, which gives the real length
938 of each very long string. Rearranges DICT accordingly.
   SIZE * COUNT is the length of the map in bytes.  Each entry
   pairs a variable with its true length as decimal text.  The
   variables that follow it in DICT and hold the rest of the string
   are deleted, and the first one is widened to the full length. */
940 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
941 struct dictionary *dict)
943 struct variable_to_value_map *map;
944 struct variable *var;
950 map = open_variable_to_value_map (r, size * count);
951 while (read_variable_to_value_map (r, dict, map, &var, &length_s,
954 long length, remaining_length;
958 length = strtol (length_s, NULL, 10);
/* Reject lengths below the very-long-string minimum, and LONG_MAX,
   which is what strtol returns on overflow. */
959 if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
961 sys_warn (r, _("%s listed as string of length %s "
963 var_get_name (var), length_s);
967 /* Group multiple variables into single variable
968 and delete all but the first. */
969 remaining_length = length;
/* Walk forward from VAR, charging each variable at most
   EFFECTIVE_LONG_STRING_LENGTH bytes, until the whole length is
   accounted for.  IDX ends one past the last variable consumed. */
970 for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
971 if (idx < dict_get_var_cnt (dict))
972 remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
973 EFFECTIVE_LONG_STRING_LENGTH);
975 sys_error (r, _("Very long string %s overflows dictionary."),
977 dict_delete_consecutive_vars (dict,
978 var_get_dict_index (var) + 1,
979 idx - var_get_dict_index (var) - 1);
981 /* Assign all the length to the first variable. */
982 var_set_width (var, length);
984 close_variable_to_value_map (r, map);
/* Variables were deleted and widened above, so the dictionary's
   value layout is compacted afterwards. */
985 dict_compact_values (dict);
988 /* Reads value labels from sysfile R and inserts them into the
989 associated dictionary DICT.
   Reads a type 3 record (the value/label pairs) and the type 4
   record that must follow it (the variables that the labels apply
   to).  VAR_BY_VALUE_IDX resolves the variable indexes in the type
   4 record.  Temporary storage comes from a subpool of R's pool
   that is destroyed before returning. */
991 read_value_labels (struct sfm_reader *r,
992 struct dictionary *dict, struct variable **var_by_value_idx)
994 struct pool *subpool;
998 char raw_value[8]; /* Value as uninterpreted bytes. */
999 union value value; /* Value. */
1000 char *label; /* Null-terminated label string. */
1003 struct label *labels = NULL;
1004 int label_cnt; /* Number of labels. */
1006 struct variable **var = NULL; /* Associated variables. */
1007 int var_cnt; /* Number of associated variables. */
1011 subpool = pool_create_subpool (r->pool);
1013 /* Read the type 3 record and record its contents. We can't do
1014 much with the data yet because we don't know whether it is
1015 of numeric or string type. */
1017 /* Read number of labels. */
1018 label_cnt = read_int32 (r);
/* The comparison is made in an unsigned type, so a negative count
   is rejected here too. */
1020 if (label_cnt >= INT32_MAX / sizeof *labels)
1022 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1027 /* Read each value/label tuple into labels[]. */
1028 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1029 for (i = 0; i < label_cnt; i++)
1031 struct label *label = labels + i;
1032 unsigned char label_len;
1036 read_bytes (r, label->raw_value, sizeof label->raw_value);
1038 /* Read label length.
   The length byte and the label text together are padded to a
   multiple of 8 bytes. */
1039 read_bytes (r, &label_len, sizeof label_len);
1040 padded_len = ROUND_UP (label_len + 1, 8);
1042 /* Read label, padding.
   The length byte has already been read, hence padded_len - 1. */
1043 label->label = pool_alloc (subpool, padded_len + 1);
1044 read_bytes (r, label->label, padded_len - 1);
1045 label->label[label_len] = 0;
1048 /* Now, read the type 4 record that has the list of variables
1049 to which the value labels are to be applied. */
1051 /* Read record type of type 4 record. */
1052 if (read_int32 (r) != 4)
1053 sys_error (r, _("Variable index record (type 4) does not immediately "
1054 "follow value label record (type 3) as it should."));
1056 /* Read number of variables associated with value label from type 4
record. */
1058 var_cnt = read_int32 (r);
1059 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1060 sys_error (r, _("Number of variables associated with a value label (%d) "
1061 "is not between 1 and the number of variables (%u)."),
1062 var_cnt, (unsigned int) dict_get_var_cnt (dict));
1064 /* Read the list of variables. */
1065 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1066 for (i = 0; i < var_cnt; i++)
1068 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
1069 if (var_is_long_string (var[i]))
1070 sys_error (r, _("Value labels are not allowed on long string "
1071 "variables (%s)."), var_get_name (var[i]));
1074 /* Type check the variables.
   All of them must have the same type as the first. */
1075 for (i = 1; i < var_cnt; i++)
1076 if (var_get_type (var[i]) != var_get_type (var[0]))
1077 sys_error (r, _("Variables associated with value label are not all of "
1078 "identical type. Variable %s is %s, but variable "
1080 var_get_name (var[0]),
1081 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1082 var_get_name (var[i]),
1083 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1085 /* Fill in labels[].value, now that we know the desired type. */
1086 for (i = 0; i < label_cnt; i++)
1088 struct label *label = labels + i;
1090 if (var_is_alpha (var[0]))
1091 buf_copy_rpad (label->value.s, sizeof label->value.s,
1092 label->raw_value, sizeof label->raw_value);
1094 label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
1097 /* Assign the `value_label's to each variable. */
1098 for (i = 0; i < var_cnt; i++)
1100 struct variable *v = var[i];
1103 /* Add each label to the variable. */
1104 for (j = 0; j < label_cnt; j++)
1106 struct label *label = &labels[j];
/* A failed add is reported as a duplicate label, with a warning. */
1107 if (!var_add_value_label (v, &label->value, label->label))
1109 if (var_is_numeric (var[0]))
1110 sys_warn (r, _("Duplicate value label for %g on %s."),
1111 label->value.f, var_get_name (v));
1113 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1114 var_get_width (v), label->value.s,
1120 pool_destroy (subpool);
/* Helpers for reading cases, used by sys_file_casereader_read. */
1125 static void partial_record (struct sfm_reader *r)
/* Readers for one number or one string, compressed or not. */
1127 static bool read_case_number (struct sfm_reader *, double *);
1128 static bool read_case_string (struct sfm_reader *, char *, size_t);
/* Decompression primitives. */
1129 static int read_opcode (struct sfm_reader *);
1130 static bool read_compressed_number (struct sfm_reader *, double *);
1131 static bool read_compressed_string (struct sfm_reader *, char *);
1132 static bool read_whole_strings (struct sfm_reader *, char *, size_t);
1134 /* Reads one case from READER's file into C. Returns true only
if successful.
NOTE(review): confirm the exact return values against the
function's return statements.

R_ is the struct sfm_reader for the file.  If an error longjmps
to the setjmp below, READER is put into the error state.  A fast
path reads uncompressed cases in one piece; otherwise each
variable is read and converted individually. */
1137 sys_file_casereader_read (struct casereader *reader, void *r_,
1140 struct sfm_reader *r = r_;
1144 case_create (c, r->value_cnt);
1145 if (setjmp (r->bail_out))
1147 casereader_force_error (reader);
/* The fast path needs data that is uncompressed, a native double
   of 8 bytes, and no very long strings. */
1152 if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
1154 /* Fast path. Read the whole case directly. */
1155 if (!try_read_bytes (r, case_data_all_rw (c),
1156 sizeof (union value) * r->flt64_cnt))
1162 /* Convert floating point numbers to native format if needed. */
1163 if (r->float_format != FLOAT_NATIVE_DOUBLE)
1167 for (i = 0; i < r->var_cnt; i++)
/* Width 0 marks a numeric variable; strings need no conversion. */
1168 if (r->vars[i].width == 0)
1170 double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
1171 float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
1178 /* Slow path. Convert from external to internal format. */
1181 for (i = 0; i < r->var_cnt; i++)
1183 struct sfm_var *sv = &r->vars[i];
1184 union value *v = case_data_rw_idx (c, sv->case_index);
1188 if (!read_case_number (r, &v->f))
1193 /* Read the string data in segments up to 255 bytes
1194 at a time, packed into 8-byte units. */
1195 const int max_chunk = MIN_VERY_LONG_STRING - 1;
1196 int ofs, chunk_size;
1197 for (ofs = 0; ofs < sv->width; ofs += chunk_size)
1199 chunk_size = MIN (max_chunk, sv->width - ofs);
1200 if (!read_case_string (r, v->s + ofs, chunk_size))
1208 /* Very long strings have trailing wasted space
1209 that we must skip. */
1210 if (sv->width >= MIN_VERY_LONG_STRING)
/* bytes_read is what the loop above consumed; the difference from
   sfm_width_to_bytes is padding that still has to be read. */
1212 int bytes_read = (sv->width / max_chunk * 256
1213 + ROUND_UP (sv->width % max_chunk, 8));
1214 int total_bytes = sfm_width_to_bytes (sv->width);
1215 int excess_bytes = total_bytes - bytes_read;
1217 while (excess_bytes > 0)
1220 size_t chunk = MIN (sizeof buffer, excess_bytes);
1221 if (!read_whole_strings (r, buffer, chunk))
1223 excess_bytes -= chunk;
/* Reports that the data in R stops in the middle of a case.
   The report goes through sys_error, which longjmp()s away, so
   control does not come back to the caller. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1245 /* Reads a number from R and stores its value in *D.
1246 If R is compressed, reads a compressed number;
1247 otherwise, reads a number in the regular way.
1248 Returns true if successful, false if end of file is
1249 reached immediately. */
1251 read_case_number (struct sfm_reader *r, double *d)
1256 if (!try_read_bytes (r, flt64, sizeof flt64))
1258 *d = flt64_to_double (r, flt64);
1262 return read_compressed_number (r, d);
/* Reads LENGTH bytes of string data from R into S.
   The file stores strings in 8-byte units, so a multiple of 8
   bytes is always consumed; when LENGTH is not a multiple of 8
   the surplus bytes of the last unit are read and thrown away.
   Reads compressed strings if R is compressed.
   Returns true if successful, false if end of file is reached
   immediately.  End of file after some whole units were already
   read is reported as a partial record. */
static bool
read_case_string (struct sfm_reader *r, char *s, size_t length)
{
  size_t tail_len = length % 8;
  size_t body_len = length - tail_len;

  if (body_len != 0 && !read_whole_strings (r, s, body_len))
    return false;

  if (tail_len != 0)
    {
      /* The last unit goes through a scratch buffer so that S is
         never written past LENGTH bytes. */
      char segment[8];

      if (!read_whole_strings (r, segment, sizeof segment))
        {
          if (body_len != 0)
            partial_record (r);
          return false;
        }
      memcpy (s + body_len, segment, tail_len);
    }

  return true;
}
1299 /* Reads and returns the next compression opcode from R. */
1301 read_opcode (struct sfm_reader *r)
1303 assert (r->compressed);
1307 if (r->opcode_idx >= sizeof r->opcodes)
1309 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1313 opcode = r->opcodes[r->opcode_idx++];
1320 /* Reads a compressed number from R and stores its value in D.
1321 Returns true if successful, false if end of file is
1322 reached immediately. */
1324 read_compressed_number (struct sfm_reader *r, double *d)
1326 int opcode = read_opcode (r);
1334 *d = read_flt64 (r);
1338 sys_error (r, _("Compressed data is corrupt."));
1345 *d = opcode - r->bias;
/* Decodes one compressed 8-byte string segment from R and
   stores it in DST.
   Returns true if successful, false if end of file is reached
   immediately (either physically or through the end-of-data
   opcode). */
static bool
read_compressed_string (struct sfm_reader *r, char *dst)
{
  int opcode = read_opcode (r);

  if (opcode == -1 || opcode == 252)
    return false;

  if (opcode == 253)
    read_bytes (r, dst, 8);     /* Literal segment follows. */
  else if (opcode == 254)
    memset (dst, ' ', 8);       /* Segment of eight spaces. */
  else
    sys_error (r, _("Compressed data is corrupt."));

  return true;
}
1380 /* Reads LENGTH string bytes from R into S.
1381 LENGTH must be a multiple of 8.
1382 Reads compressed strings if S is compressed.
1383 Returns true if successful, false if end of file is
1384 reached immediately. */
1386 read_whole_strings (struct sfm_reader *r, char *s, size_t length)
1388 assert (length % 8 == 0);
1390 return try_read_bytes (r, s, length);
1394 for (ofs = 0; ofs < length; ofs += 8)
1395 if (!read_compressed_string (r, s + ofs))
1405 /* Creates and returns a table that can be used for translating a value
1406 index into a case to a "struct variable *" for DICT. Multiple
1407 system file fields reference variables this way.
1409 This table must be created before processing the very long
1410 string extension record, because that record causes some
1411 values to be deleted from the case and the dictionary to be
1413 static struct variable **
1414 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1416 struct variable **var_by_value_idx;
1420 var_by_value_idx = pool_nmalloc (r->pool,
1421 r->flt64_cnt, sizeof *var_by_value_idx);
1422 for (i = 0; i < dict_get_var_cnt (dict); i++)
1424 struct variable *v = dict_get_var (dict, i);
1425 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
1428 var_by_value_idx[value_idx++] = v;
1429 for (j = 1; j < nv; j++)
1430 var_by_value_idx[value_idx++] = NULL;
1432 assert (value_idx == r->flt64_cnt);
1434 return var_by_value_idx;
1437 /* Returns the "struct variable" corresponding to the given
1438 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
1440 static struct variable *
1441 lookup_var_by_value_idx (struct sfm_reader *r,
1442 struct variable **var_by_value_idx, int value_idx)
1444 struct variable *var;
1446 if (value_idx < 1 || value_idx > r->flt64_cnt)
1447 sys_error (r, _("Variable index %d not in valid range 1...%d."),
1448 value_idx, r->flt64_cnt);
1450 var = var_by_value_idx[value_idx - 1];
1452 sys_error (r, _("Variable index %d refers to long string "
/* Finds the variable in D whose short name equals SHORT_NAME,
   ignoring case.  Returns the variable, or a null pointer if no
   variable in D has that short name. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  /* Cheap attempt first: a variable whose full name is
     SHORT_NAME frequently has it as its short name too. */
  struct variable *candidate = dict_lookup_var (d, short_name);
  size_t n_vars, idx;

  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate), short_name) == 0)
    return candidate;

  /* Otherwise check every variable in the dictionary. */
  n_vars = dict_get_var_cnt (d);
  for (idx = 0; idx < n_vars; idx++)
    {
      candidate = dict_get_var (d, idx);
      if (strcasecmp (var_get_short_name (candidate), short_name) == 0)
        return candidate;
    }

  return NULL;
}
1485 /* Helpers for reading records that contain "variable=value"
   pairs. */

/* Parsing state for one such record. */
1489 struct variable_to_value_map
1491 struct substring buffer; /* Record contents. */
1492 size_t pos; /* Current position in buffer: where the next token search starts. */
1495 /* Reads SIZE bytes into a "variable=value" map for R,
1496 and returns the map. */
1497 static struct variable_to_value_map *
1498 open_variable_to_value_map (struct sfm_reader *r, size_t size)
1500 struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
1501 char *buffer = pool_malloc (r->pool, size + 1);
1502 read_bytes (r, buffer, size);
1503 map->buffer = ss_buffer (buffer, size);
1508 /* Closes MAP and frees its storage.
1509 Not really needed, because the pool will free the map anyway,
1510 but can be used to free it earlier. */
1512 close_variable_to_value_map (struct sfm_reader *r,
1513 struct variable_to_value_map *map)
1515 pool_free (r->pool, ss_data (map->buffer));
1518 /* Reads the next variable=value pair from MAP.
1519 Looks up the variable in DICT and stores it into *VAR.
1520 Stores a null-terminated value into *VALUE. */
1522 read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
1523 struct variable_to_value_map *map,
1524 struct variable **var, char **value,
1527 int max_warnings = 5;
1531 struct substring short_name_ss, value_ss;
1533 if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
1534 || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
1537 if (*warning_cnt > max_warnings)
1538 sys_warn (r, _("Suppressed %d additional variable map warnings."),
1539 *warning_cnt - max_warnings);
1543 map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
1544 ss_buffer ("\t\0", 2));
1546 ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
1547 *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
1550 if (++*warning_cnt <= 5)
1551 sys_warn (r, _("Variable map refers to unknown variable %s."),
1552 ss_data (short_name_ss));
1556 ss_data (value_ss)[ss_length (value_ss)] = '\0';
1557 *value = ss_data (value_ss);
1565 /* Displays a corruption message. */
1567 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
1572 ds_init_empty (&text);
1573 ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
1574 fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
1575 ds_put_vformat (&text, format, args);
1577 m.category = msg_class_to_category (class);
1578 m.severity = msg_class_to_severity (class);
1579 m.where.file_name = NULL;
1580 m.where.line_number = 0;
1581 m.text = ds_cstr (&text);
1586 /* Displays a warning for the current file position. */
1588 sys_warn (struct sfm_reader *r, const char *format, ...)
1592 va_start (args, format);
1593 sys_msg (r, MW, format, args);
1597 /* Displays an error for the current file position,
1598 marks it as in an error state,
1599 and aborts reading it using longjmp. */
1601 sys_error (struct sfm_reader *r, const char *format, ...)
1605 va_start (args, format);
1606 sys_msg (r, ME, format, args);
1610 longjmp (r->bail_out, 1);
1613 /* Reads BYTE_CNT bytes into BUF.
1614 Returns true if exactly BYTE_CNT bytes are successfully read.
1615 Aborts if an I/O error or a partial read occurs.
1616 If EOF_IS_OK, then an immediate end-of-file causes false to be
1617 returned; otherwise, immediate end-of-file causes an abort
1620 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
1621 void *buf, size_t byte_cnt)
1623 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
1624 if (bytes_read == byte_cnt)
1626 else if (ferror (r->file))
1627 sys_error (r, _("System error: %s."), strerror (errno));
1628 else if (!eof_is_ok || bytes_read != 0)
1629 sys_error (r, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   Any failure, whether an I/O error, a short read or end of
   file, aborts through sys_error. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  /* With end of file disallowed the helper either succeeds or
     does not return, so its result carries no information. */
  read_bytes_internal (r, false, buf, byte_cnt);
}
/* Attempts to read BYTE_CNT bytes from R into BUF.
   Returns true if all BYTE_CNT bytes were read, false if R was
   already at end of file.
   An I/O error or a read that stops part-way aborts through
   sys_error. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  bool ok = read_bytes_internal (r, true, buf, byte_cnt);
  return ok;
}
/* Reads 4 bytes from R, interprets them as a signed integer in
   the file's integer format, and returns the value in host
   format.  Aborts through sys_error if the bytes cannot be
   read. */
static int32_t
read_int32 (struct sfm_reader *r)
{
  uint8_t raw[4];

  read_bytes (r, raw, sizeof raw);
  return int32_to_native (r, raw);
}
/* Reads 8 bytes from R, interprets them as a floating-point
   number in the file's float format, and returns the value as a
   host "double".  Aborts through sys_error if the bytes cannot
   be read. */
static double
read_flt64 (struct sfm_reader *r)
{
  uint8_t raw[8];

  read_bytes (r, raw, sizeof raw);
  return flt64_to_double (r, raw);
}
/* Fills BUFFER, which has room for SIZE bytes, with SIZE - 1
   bytes read from R followed by a terminating null byte.
   SIZE must be at least 1. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t text_len;

  assert (size > 0);
  text_len = size - 1;
  read_bytes (r, buffer, text_len);
  buffer[text_len] = '\0';
}
/* Advances R by BYTES bytes.  The bytes are read into a scratch
   buffer and discarded, so running out of data aborts through
   sys_error just as it does for read_bytes. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char discard[1024];
  size_t left;

  for (left = bytes; left > 0; )
    {
      size_t n = MIN (sizeof discard, left);
      read_bytes (r, discard, n);
      left -= n;
    }
}
1695 /* Returns the value of the 32-bit signed integer at INT32,
1696 converted from the format used by R to the host format. */
1698 int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
1701 if (r->integer_format == INTEGER_NATIVE)
1702 memcpy (&x, int32, sizeof x);
1704 x = integer_get (r->integer_format, int32, sizeof x);
1708 /* Returns the value of the 64-bit floating point number at
1709 FLT64, converted from the format used by R to the host
1712 flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
1715 if (r->float_format == FLOAT_NATIVE_DOUBLE)
1716 memcpy (&x, flt64, sizeof x);
1718 float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);
1722 static struct casereader_class sys_file_casereader_class =
1724 sys_file_casereader_read,
1725 sys_file_casereader_destroy,