1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/hash.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/minmax.h"
56 #include "gl/unlocked-io.h"
57 #include "gl/xalloc.h"
61 #define _(msgid) gettext (msgid)
62 #define N_(msgid) (msgid)
64 /* System file reader. */
/* Per-file reader state.  The members visible here fall into three groups:
   resource tracking, the open file together with the on-disk layout detected
   from it, and the decompression state used for compressed case data. */
67 /* Resource tracking. */
68 struct pool *pool; /* All system file state. */
69 jmp_buf bail_out; /* longjmp() target for error handling. */
/* Open file. */
72 struct file_handle *fh; /* File handle. */
73 struct fh_lock *lock; /* Mutual exclusion for file handle. */
74 FILE *file; /* File stream. */
75 bool error; /* I/O or corruption error? */
76 struct caseproto *proto; /* Format of output cases. */
/* On-disk layout, filled in while the dictionary records are read. */
79 enum integer_format integer_format; /* On-disk integer format. */
80 enum float_format float_format; /* On-disk floating point format. */
81 int oct_cnt; /* Number of 8-byte units per case. */
82 struct sfm_var *sfm_vars; /* Variables. */
83 size_t sfm_var_cnt; /* Number of variables. */
84 casenumber case_cnt; /* Number of cases */
85 bool has_long_var_names; /* File has a long variable name map */
/* Decompression.  opcode_idx starts at sizeof opcodes, i.e. "no opcodes
   buffered yet" (see sfm_open_reader). */
88 bool compressed; /* File is compressed? */
89 double bias; /* Compression bias, usually 100.0. */
90 uint8_t opcodes[8]; /* Current block of opcodes. */
91 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
92 bool corruption_warning; /* Warned about possible corruption? */
95 static const struct casereader_class sys_file_casereader_class;
97 static bool close_reader (struct sfm_reader *);
99 static struct variable **make_var_by_value_idx (struct sfm_reader *,
100 struct dictionary *);
101 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
104 static struct variable *lookup_var_by_short_name (struct dictionary *,
105 const char *short_name);
107 static void sys_msg (struct sfm_reader *r, int class,
108 const char *format, va_list args)
109 PRINTF_FORMAT (3, 0);
110 static void sys_warn (struct sfm_reader *, const char *, ...)
111 PRINTF_FORMAT (2, 3);
112 static void sys_error (struct sfm_reader *, const char *, ...)
116 static void read_bytes (struct sfm_reader *, void *, size_t);
117 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
118 static int read_int (struct sfm_reader *);
119 static double read_float (struct sfm_reader *);
120 static void read_string (struct sfm_reader *, char *, size_t);
121 static void skip_bytes (struct sfm_reader *, size_t);
123 static struct text_record *open_text_record (struct sfm_reader *, size_t size);
124 static void close_text_record (struct sfm_reader *r,
125 struct text_record *);
126 static bool read_variable_to_value_pair (struct sfm_reader *,
128 struct text_record *,
129 struct variable **var, char **value);
130 static void text_warn (struct sfm_reader *r, struct text_record *text,
131 const char *format, ...)
132 PRINTF_FORMAT (3, 4);
133 static char *text_get_token (struct text_record *,
134 struct substring delimiters, char *delimiter);
135 static bool text_match (struct text_record *, char c);
136 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
137 struct text_record *,
138 struct substring delimiters,
140 static const char *text_parse_counted_string (struct sfm_reader *,
141 struct text_record *);
142 static size_t text_pos (const struct text_record *);
144 static bool close_reader (struct sfm_reader *r);
146 /* Dictionary reader. */
154 static void read_header (struct sfm_reader *, struct dictionary *,
155 int *weight_idx, int *claimed_oct_cnt,
156 struct sfm_read_info *);
157 static void read_variable_record (struct sfm_reader *, struct dictionary *,
158 int *format_warning_cnt);
159 static void parse_format_spec (struct sfm_reader *, unsigned int,
160 enum which_format, struct variable *,
161 int *format_warning_cnt);
162 static void setup_weight (struct sfm_reader *, int weight_idx,
163 struct variable **var_by_value_idx,
164 struct dictionary *);
165 static void read_documents (struct sfm_reader *, struct dictionary *);
166 static void read_value_labels (struct sfm_reader *, struct dictionary *,
167 struct variable **var_by_value_idx);
169 static void read_extension_record (struct sfm_reader *, struct dictionary *,
170 struct sfm_read_info *);
171 static void read_machine_integer_info (struct sfm_reader *,
172 size_t size, size_t count,
173 struct sfm_read_info *,
176 static void read_machine_float_info (struct sfm_reader *,
177 size_t size, size_t count);
178 static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
179 struct dictionary *);
180 static void read_display_parameters (struct sfm_reader *,
181 size_t size, size_t count,
182 struct dictionary *);
183 static void read_long_var_name_map (struct sfm_reader *,
184 size_t size, size_t count,
185 struct dictionary *);
186 static void read_long_string_map (struct sfm_reader *,
187 size_t size, size_t count,
188 struct dictionary *);
189 static void read_data_file_attributes (struct sfm_reader *,
190 size_t size, size_t count,
191 struct dictionary *);
192 static void read_variable_attributes (struct sfm_reader *,
193 size_t size, size_t count,
194 struct dictionary *);
195 static void read_long_string_value_labels (struct sfm_reader *,
196 size_t size, size_t count,
197 struct dictionary *);
199 /* Convert all the strings in DICT from the dict encoding to UTF8 */
/* For every variable in DICT this recodes, in order, the variable name, the
   variable label (if any) and each value label (if any) from encoding ENC to
   UTF8 using recode_string().

   ENC is DICT's own encoding; get_default_encoding() is the fallback
   (the condition guarding the fallback is not visible in this listing).

   NOTE(review): no release of the strings returned by recode_string()
   (utf8_name, utf8_label, new_label) is visible here -- confirm that each
   one is freed after use. */
201 recode_strings (struct dictionary *dict)
205 const char *enc = dict_get_encoding (dict);
208 enc = get_default_encoding ();
210 for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
212 /* Convert the long variable name */
213 struct variable *var = dict_get_var (dict, i);
214 const char *native_name = var_get_name (var);
215 char *utf8_name = recode_string (UTF8, enc, native_name, -1);
/* Only rename when recoding actually changed the name, and only when the
   recoded name is not already taken by another variable in DICT. */
216 if ( 0 != strcmp (utf8_name, native_name))
218 if ( NULL == dict_lookup_var (dict, utf8_name))
219 dict_rename_var (dict, var, utf8_name);
222 _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
227 /* Convert the variable label */
228 if (var_has_label (var))
230 char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
231 var_set_label (var, utf8_label);
/* Convert every value label, replacing each in place under the same value. */
235 if (var_has_value_labels (var))
237 const struct val_lab *vl = NULL;
238 const struct val_labs *vlabs = var_get_value_labels (var);
240 for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
242 const union value *val = val_lab_get_value (vl);
243 const char *label = val_lab_get_label (vl);
244 char *new_label = NULL;
246 new_label = recode_string (UTF8, enc, label, -1);
248 var_replace_value_label (var, val, new_label);
/* Overview of the steps visible below:
     1. create an empty dictionary in *DICT and a pool-allocated reader;
     2. lock FH for reading and open the underlying file in binary mode;
     3. read the header, then every type 2 (variable) record;
     4. build the value-index-to-variable table and set up weighting;
     5. dispatch the remaining dictionary records until record type 999;
     6. if the file had no long variable name map, derive long names by
        lower-casing the short names;
     7. recode all dictionary strings to UTF-8;
     8. snapshot the variable layout into the reader and wrap the reader in a
        sequential casereader.
   The dict_destroy() call at the end is presumably the error path; its
   label and surrounding statements are not visible in this listing. */
255 /* Opens the system file designated by file handle FH for
256 reading. Reads the system file's dictionary into *DICT.
257 If INFO is non-null, then it receives additional info about the
260 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
261 struct sfm_read_info *volatile info)
263 struct sfm_reader *volatile r = NULL;
264 struct variable **var_by_value_idx;
265 struct sfm_read_info local_info;
266 int format_warning_cnt = 0;
271 *dict = dict_create ();
273 /* Create and initialize reader. */
274 r = pool_create_container (struct sfm_reader, pool);
280 r->has_long_var_names = false;
281 r->opcode_idx = sizeof r->opcodes;
282 r->corruption_warning = false;
284 /* TRANSLATORS: this fragment will be interpolated into
285 messages in fh_lock() that identify types of files. */
286 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
290 r->file = fn_open (fh_get_file_name (fh), "rb");
293 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
294 fh_get_file_name (r->fh), strerror (errno));
298 /* Initialize info. */
301 memset (info, 0, sizeof *info);
/* bail_out is documented in the reader as the longjmp() target for error
   handling; the body of this branch is not visible in this listing. */
303 if (setjmp (r->bail_out))
308 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
310 /* Read all the variable definition records. */
311 rec_type = read_int (r);
312 while (rec_type == 2)
314 read_variable_record (r, *dict, &format_warning_cnt);
315 rec_type = read_int (r);
318 /* Figure out the case format. */
319 var_by_value_idx = make_var_by_value_idx (r, *dict);
320 setup_weight (r, weight_idx, var_by_value_idx, *dict);
322 /* Read all the rest of the dictionary records. */
323 while (rec_type != 999)
328 read_value_labels (r, *dict, var_by_value_idx);
332 sys_error (r, _("Misplaced type 4 record."));
335 read_documents (r, *dict);
339 read_extension_record (r, *dict, info);
343 sys_error (r, _("Unrecognized record type %d."), rec_type);
345 rec_type = read_int (r);
/* No long variable name map was seen: use the lower-cased short name as each
   variable's long name, then put the original short name back. */
349 if ( ! r->has_long_var_names )
352 for (i = 0; i < dict_get_var_cnt (*dict); i++)
354 struct variable *var = dict_get_var (*dict, i);
355 char short_name[SHORT_NAME_LEN + 1];
356 char long_name[SHORT_NAME_LEN + 1];
358 strcpy (short_name, var_get_name (var));
360 strcpy (long_name, short_name);
361 str_lowercase (long_name);
363 /* Set long name. Renaming a variable may clear the short
364 name, but we want to retain it, so re-set it
366 dict_rename_var (*dict, var, long_name);
367 var_set_short_name (var, 0, short_name);
370 r->has_long_var_names = true;
373 recode_strings (*dict);
375 /* Read record 999 data, which is just filler. */
378 /* Warn if the actual amount of data per case differs from the
379 amount that the header claims. SPSS version 13 gets this
380 wrong when very long strings are involved, so don't warn in
382 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
383 && info->version_major != 13)
384 sys_warn (r, _("File header claims %d variable positions but "
385 "%d were read from file."),
386 claimed_oct_cnt, r->oct_cnt);
388 /* Create an index of dictionary variable widths for
389 sfm_read_case to use. We cannot use the `struct variable's
390 from the dictionary we created, because the caller owns the
391 dictionary and may destroy or modify its variables. */
392 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
/* sfm_vars is registered with the pool using free() as its destructor, so it
   is released together with the rest of the reader state. */
393 pool_register (r->pool, free, r->sfm_vars);
394 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
396 pool_free (r->pool, var_by_value_idx);
/* A case count of -1 means "unknown" and is reported as CASENUMBER_MAX. */
397 return casereader_create_sequential
399 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
400 &sys_file_casereader_class, r);
404 dict_destroy (*dict);
/* Teardown for a reader: closes R's stream with fn_close(), reporting a
   failure through msg() with the file name and strerror (errno), and
   destroys R's pool, which the reader documents as holding all system file
   state.  How the boolean result is computed is not visible in this
   listing. */
409 /* Closes a system file after we're done with it.
410 Returns true if an I/O error has occurred on READER, false
413 close_reader (struct sfm_reader *r)
422 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
424 msg (ME, _("Error closing system file \"%s\": %s."),
425 fh_get_file_name (r->fh), strerror (errno));
435 pool_destroy (r->pool);
440 /* Destroys READER. */
/* R_ is the auxiliary pointer given to casereader_create_sequential() in
   sfm_open_reader, i.e. the struct sfm_reader.  READER itself is unused.
   The statement that performs the teardown is not visible in this listing
   (presumably a call to close_reader -- confirm). */
442 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
444 struct sfm_reader *r = r_;
/* Detection reads exactly one 4-byte item from FILE and compares it with the
   magic string "$FL2".  A short read is handled by the branch after fread();
   its body is not visible in this listing.

   NOTE(review): strcmp() requires rec_type to be NUL-terminated after the
   4-byte read; the declaration of rec_type and any terminator assignment are
   not visible here -- confirm. */
448 /* Returns true if FILE is an SPSS system file,
451 sfm_detect (FILE *file)
455 if (fread (rec_type, 4, 1, file) != 1)
459 return !strcmp ("$FL2", rec_type);
/* Field order as read below: record type, eye catcher, layout code,
   claimed oct count, compression flag, weight index, case count, bias,
   creation date, creation time, file label.  Errors are reported through
   sys_error(); recoverable oddities through sys_warn(). */
462 /* Reads the global header of the system file.
463 Sets DICT's file label to the system file's label.
464 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
465 or to the value index of the weight variable otherwise.
466 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
467 per case that the file claims to have (although it is not
469 Initializes INFO with header information. */
471 read_header (struct sfm_reader *r, struct dictionary *dict,
472 int *weight_idx, int *claimed_oct_cnt,
473 struct sfm_read_info *info)
476 char eye_catcher[61];
477 uint8_t raw_layout_code[4];
479 char creation_date[10];
480 char creation_time[9];
482 struct substring file_label_ss;
483 struct substring product;
485 read_string (r, rec_type, sizeof rec_type);
486 read_string (r, eye_catcher, sizeof eye_catcher);
488 if (strcmp ("$FL2", rec_type) != 0)
489 sys_error (r, _("This is not an SPSS system file."));
491 /* Identify integer format. */
/* The layout code is accepted if it decodes as either 2 or 3, and only
   MSB-first or LSB-first byte orders are supported. */
492 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
493 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
495 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
497 || (r->integer_format != INTEGER_MSB_FIRST
498 && r->integer_format != INTEGER_LSB_FIRST))
499 sys_error (r, _("This is not an SPSS system file."));
/* An implausible claimed count is normalised to -1; sfm_open_reader skips
   its consistency warning when the claimed count is -1. */
501 *claimed_oct_cnt = read_int (r);
502 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
503 *claimed_oct_cnt = -1;
505 r->compressed = read_int (r) != 0;
507 *weight_idx = read_int (r);
/* An implausibly large case count is special-cased; the assignment made in
   that case is not visible in this listing. */
509 r->case_cnt = read_int (r);
510 if ( r->case_cnt > INT_MAX / 2)
514 /* Identify floating-point format and obtain compression bias. */
515 read_bytes (r, raw_bias, sizeof raw_bias);
516 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
518 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
520 if (memcmp (raw_bias, zero_bias, 8))
521 sys_warn (r, _("Compression bias is not the usual "
522 "value of 100, or system file uses unrecognized "
523 "floating-point format."));
526 /* Some software is known to write all-zeros to this
527 field. Such software also writes floating-point
528 numbers in the format that we expect by default
529 (it seems that all software most likely does, in
530 reality), so don't warn in this case. */
/* Fallback: assume IEEE doubles in the same byte order as the integers. */
533 if (r->integer_format == INTEGER_MSB_FIRST)
534 r->float_format = FLOAT_IEEE_DOUBLE_BE;
536 r->float_format = FLOAT_IEEE_DOUBLE_LE;
538 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
540 read_string (r, creation_date, sizeof creation_date);
541 read_string (r, creation_time, sizeof creation_time);
542 read_string (r, file_label, sizeof file_label);
/* Trim spaces from the label and, if anything is left, terminate it in place
   before handing it to the dictionary. */
545 file_label_ss = ss_cstr (file_label);
546 ss_trim (&file_label_ss, ss_cstr (" "));
547 if (!ss_is_empty (file_label_ss))
549 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
550 dict_set_label (dict, ss_data (file_label_ss));
/* NOTE(review): these strcpy() calls assume that read_string() terminates
   its buffer and that info->creation_date / info->creation_time hold at
   least 10 and 9 bytes respectively -- confirm against the header. */
553 strcpy (info->creation_date, creation_date);
554 strcpy (info->creation_time, creation_time);
555 info->integer_format = r->integer_format;
556 info->float_format = r->float_format;
557 info->compressed = r->compressed;
558 info->case_cnt = r->case_cnt;
/* The product name is the eye catcher with the standard prefix (when
   present) and surrounding spaces removed, truncated to fit INFO. */
560 product = ss_cstr (eye_catcher);
561 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
562 ss_trim (&product, ss_cstr (" "));
563 str_copy_buf_trunc (info->product, sizeof info->product,
564 ss_data (product), ss_length (product));
/* Record layout as read below: width, label flag, missing value code, print
   format, write format, name; then the optional label, the optional missing
   values, and finally any string continuation records.
   *FORMAT_WARNING_CNT is passed through to parse_format_spec(), which uses
   it to limit the number of invalid-format warnings. */
567 /* Reads a variable (type 2) record from R and adds the
568 corresponding variable to DICT.
569 Also skips past additional variable records for long string
572 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
573 int *format_warning_cnt)
576 int has_variable_label;
577 int missing_value_code;
582 struct variable *var;
585 width = read_int (r);
586 has_variable_label = read_int (r);
587 missing_value_code = read_int (r);
588 print_format = read_int (r);
589 write_format = read_int (r);
590 read_string (r, name, sizeof name);
591 name[strcspn (name, " ")] = '\0';
593 /* Check variable name. */
594 if (name[0] == '$' || name[0] == '#')
595 sys_error (r, _("Variable name begins with invalid character `%c'."),
597 if (!var_is_plausible_name (name, false))
598 sys_error (r, _("Invalid variable name `%s'."), name);
600 /* Create variable. */
601 if (width < 0 || width > 255)
602 sys_error (r, _("Bad width %d for variable %s."), width, name);
603 var = dict_create_var (dict, name, width);
606 _("Duplicate variable name `%s' within system file."),
609 /* Set the short name the same as the long name. */
610 var_set_short_name (var, 0, var_get_name (var));
612 /* Get variable label, if any. */
613 if (has_variable_label != 0 && has_variable_label != 1)
614 sys_error (r, _("Variable label indicator field is not 0 or 1."));
615 if (has_variable_label == 1)
617 size_t len, read_len;
622 /* Read up to 255 bytes of label. */
623 read_len = MIN (sizeof label - 1, len);
624 read_string (r, label, read_len + 1);
625 var_set_label (var, label);
627 /* Skip unread label bytes. */
628 skip_bytes (r, len - read_len);
630 /* Skip label padding up to multiple of 4 bytes. */
631 skip_bytes (r, ROUND_UP (len, 4) - len);
634 /* Set missing values. */
635 if (missing_value_code != 0)
637 struct missing_values mv;
640 mv_init_pool (r->pool, &mv, var_get_width (var));
641 if (var_is_numeric (var))
643 if (missing_value_code < -3 || missing_value_code > 3
644 || missing_value_code == -1)
645 sys_error (r, _("Numeric missing value indicator field is not "
646 "-3, -2, 0, 1, 2, or 3."));
/* A negative code means a range followed by (-code - 2) discrete values:
   -2 is a range alone, -3 is a range plus one discrete value. */
647 if (missing_value_code < 0)
649 double low = read_float (r);
650 double high = read_float (r);
651 mv_add_range (&mv, low, high);
652 missing_value_code = -missing_value_code - 2;
654 for (i = 0; i < missing_value_code; i++)
655 mv_add_num (&mv, read_float (r));
/* String missing values are read 8 bytes at a time, so the scratch value is
   made at least 8 bytes wide. */
659 int mv_width = MAX (width, 8);
662 if (missing_value_code < 1 || missing_value_code > 3)
663 sys_error (r, _("String missing value indicator field is not "
666 value_init (&value, mv_width);
667 value_set_missing (&value, mv_width);
668 for (i = 0; i < missing_value_code; i++)
670 uint8_t *s = value_str_rw (&value, mv_width);
671 read_bytes (r, s, 8);
674 value_destroy (&value, mv_width);
676 var_set_missing_values (var, &mv);
680 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
681 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
683 /* Account for values.
684 Skip long string continuation records, if any. */
/* A numeric variable (width 0) occupies one 8-byte value; a string occupies
   one value per started 8 bytes of width. */
685 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
691 for (i = 1; i < nv; i++)
693 /* Check for record type 2 and width -1. */
694 if (read_int (r) != 2 || read_int (r) != -1)
695 sys_error (r, _("Missing string continuation record."));
697 /* Skip and ignore remaining continuation data. */
698 has_variable_label = read_int (r);
699 missing_value_code = read_int (r);
700 print_format = read_int (r);
701 write_format = read_int (r);
702 read_string (r, name, sizeof name);
704 /* Variable label fields on continuation records have
705 been spotted in system files created by "SPSS Power
706 Macintosh Release 6.1". */
707 if (has_variable_label)
708 skip_bytes (r, ROUND_UP (read_int (r), 4));
/* S is the raw format word from the variable record; the format type code is
   taken from S >> 16, truncated to 8 bits.  WHICH selects whether V's print
   format or write format is set.  A format that fails fmt_check_output() or
   is incompatible with V's width is not applied; instead a warning is
   issued, and *FORMAT_WARNING_CNT counts such failures across calls so that
   no more than max_format_warnings warnings (plus one "suppressing" notice)
   are emitted per file. */
713 /* Translates the format spec from sysfile format to internal
716 parse_format_spec (struct sfm_reader *r, unsigned int s,
717 enum which_format which, struct variable *v,
718 int *format_warning_cnt)
720 const int max_format_warnings = 8;
722 uint8_t raw_type = s >> 16;
728 if (!fmt_from_io (raw_type, &f.type))
729 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
734 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
739 if (which == PRINT_FORMAT)
740 var_set_print_format (v, &f);
742 var_set_write_format (v, &f);
/* Increment the counter that FORMAT_WARNING_CNT points to.  The ++ must
   apply to the pointed-to int, not to the pointer: the caller passes the
   address of a single int, so advancing the pointer would read past that
   object and the count would never change. */
744 else if (++*format_warning_cnt <= max_format_warnings)
746 char fmt_string[FMT_STRING_LEN_MAX + 1];
747 sys_warn (r, _("%s variable %s has invalid %s format %s."),
748 var_is_numeric (v) ? _("Numeric") : _("String"),
750 which == PRINT_FORMAT ? _("print") : _("write"),
751 fmt_to_string (&f, fmt_string));
753 if (*format_warning_cnt == max_format_warnings)
754 sys_warn (r, _("Suppressing further invalid format warnings."));
/* Resolves WEIGHT_IDX to a variable through lookup_var_by_value_idx() and
   makes it DICT's weight variable when it is numeric; a non-numeric variable
   is reported through sys_error().  read_header documents a weight index of
   0 as "unweighted"; the test for that case is not visible in this
   listing. */
758 /* Sets the weighting variable in DICT to the variable
759 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
762 setup_weight (struct sfm_reader *r, int weight_idx,
763 struct variable **var_by_value_idx, struct dictionary *dict)
767 struct variable *weight_var
768 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
769 if (var_is_numeric (weight_var))
770 dict_set_weight (dict, weight_var);
772 sys_error (r, _("Weighting variable must be numeric."));
/* Only one document record is accepted per file.  The record holds a line
   count followed by that many lines of DOC_LINE_LENGTH bytes each.  The
   lines are read into a pool buffer as one string; if its strlen() is
   shorter than the full payload, a line contained a null byte and the record
   is rejected.  The buffer is returned to the pool at the end, so
   dict_set_documents() presumably keeps its own copy -- confirm.

   NOTE(review): line_cnt comes straight from the file and no upper bound is
   visible in this listing, while DOC_LINE_LENGTH * line_cnt + 1 is computed
   in int arithmetic -- confirm that an overflow guard exists. */
776 /* Reads a document record, type 6, from system file R, and sets up
777 the documents and n_documents fields in the associated
780 read_documents (struct sfm_reader *r, struct dictionary *dict)
785 if (dict_get_documents (dict) != NULL)
786 sys_error (r, _("Multiple type 6 (document) records."));
788 line_cnt = read_int (r);
790 sys_error (r, _("Number of document lines (%d) "
791 "must be greater than 0."), line_cnt);
793 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
794 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
795 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
796 dict_set_documents (dict, documents);
798 sys_error (r, _("Document line contains null byte."));
799 pool_free (r->pool, documents);
/* An extension record starts with a subtype, an element SIZE and an element
   COUNT; the payload is SIZE * COUNT bytes.  Subtypes that have a handler
   are dispatched to it; unrecognized subtypes produce a warning.  The final
   skip_bytes() discards the payload of subtypes that are not parsed. */
802 /* Read a type 7 extension record. */
804 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
805 struct sfm_read_info *info)
807 int subtype = read_int (r);
808 size_t size = read_int (r);
809 size_t count = read_int (r);
810 size_t bytes = size * count;
812 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
813 allows an extra byte for a null terminator, used by some
814 extension processing routines. */
815 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
816 sys_error (r, _("Record type 7 subtype %d too large."), subtype);
821 read_machine_integer_info (r, size, count, info, dict);
825 read_machine_float_info (r, size, count);
829 /* Variable sets information. We don't use these yet.
830 They only apply to GUIs; see VARSETS on the APPLY
831 DICTIONARY command in SPSS documentation. */
835 /* DATE variable information. We don't use it yet, but we
841 read_mrsets (r, size, count, dict);
845 /* Used by the SPSS Data Entry software. */
849 read_display_parameters (r, size, count, dict);
853 read_long_var_name_map (r, size, count, dict);
857 read_long_string_map (r, size, count, dict);
861 /* Extended number of cases. Not important. */
865 read_data_file_attributes (r, size, count, dict);
869 read_variable_attributes (r, size, count, dict);
873 /* New in SPSS 16. Contains a single string that describes
874 the character encoding, e.g. "windows-1252". */
/* NOTE(review): the buffer is sized from SIZE and COUNT + 1 but only
   COUNT + 1 bytes are read, which matches the payload only when SIZE is 1;
   no check of SIZE is visible in this listing -- confirm. */
876 char *encoding = pool_calloc (r->pool, size, count + 1);
877 read_string (r, encoding, count + 1);
878 dict_set_encoding (dict, encoding);
883 /* New in SPSS 16. Encodes value labels for long string
885 read_long_string_value_labels (r, size, count, dict);
889 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"),
890 subtype, PACKAGE_BUGREPORT);
894 skip_bytes (r, bytes);
897 /* Read record type 7, subtype 3. */
/* The record holds eight integers: version major, minor and revision,
   machine code, floating-point representation, compression code, integer
   representation and character code.  SIZE must be 4 and COUNT must be 8.
   The version numbers are stored in INFO.  The floating-point and integer
   representation codes are checked against the formats already detected
   from the file header: a floating-point mismatch is an error, an integer
   mismatch only a warning.  If DICT has no encoding yet, one is derived
   from the character code. */
899 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
900 struct sfm_read_info *info,
901 struct dictionary *dict)
903 int version_major = read_int (r);
904 int version_minor = read_int (r);
905 int version_revision = read_int (r);
906 int machine_code UNUSED = read_int (r);
907 int float_representation = read_int (r);
908 int compression_code UNUSED = read_int (r);
909 int integer_representation = read_int (r);
910 int character_code = read_int (r);
912 int expected_float_format;
913 int expected_integer_format;
915 if (size != 4 || count != 8)
916 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
920 /* Save version info. */
921 info->version_major = version_major;
922 info->version_minor = version_minor;
923 info->version_revision = version_revision;
925 /* Check floating point format. */
926 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
927 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
928 expected_float_format = 1;
929 else if (r->float_format == FLOAT_Z_LONG)
930 expected_float_format = 2;
931 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
932 expected_float_format = 3;
/* Report the code actually stored in the file next to the expected code.
   r->float_format is an internal enum value, not the on-disk code, so it
   would be meaningless in this message. */
935 if (float_representation != expected_float_format)
936 sys_error (r, _("Floating-point representation indicated by "
937 "system file (%d) differs from expected (%d)."),
938 float_representation, expected_float_format);
940 /* Check integer format. */
941 if (r->integer_format == INTEGER_MSB_FIRST)
942 expected_integer_format = 1;
943 else if (r->integer_format == INTEGER_LSB_FIRST)
944 expected_integer_format = 2;
947 if (integer_representation != expected_integer_format)
/* Code 1 is big endian (index 1 of the table); any other code indexes the
   little endian entry. */
949 static const char *const endian[] = {N_("Little Endian"), N_("Big Endian")};
950 sys_warn (r, _("Integer format indicated by system file (%s) "
951 "differs from expected (%s)."),
952 gettext (endian[integer_representation == 1]),
953 gettext (endian[expected_integer_format == 1]));
958 Record 7 (20) provides a much more reliable way of
959 setting the encoding.
960 The character_code is used as a fallback only.
962 if ( NULL == dict_get_encoding (dict))
964 switch (character_code)
967 dict_set_encoding (dict, "EBCDIC-US");
971 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
972 respectively. However, there are known to be many files
973 in the wild with character code 2, yet have data which are
975 Therefore we ignore these values.
979 dict_set_encoding (dict, "MS_KANJI");
982 dict_set_encoding (dict, "UTF-7");
985 dict_set_encoding (dict, "UTF-8");
990 snprintf (enc, 100, "CP%d", character_code);
991 dict_set_encoding (dict, enc);
998 /* Read record type 7, subtype 4. */
/* The record holds three floating-point values: the file's system-missing
   value, its highest value and its lowest value.  SIZE must be 8 and COUNT
   must be 3.  Each value is compared with the corresponding constant
   (SYSMIS, HIGHEST, LOWEST) and a warning is issued for any that differ;
   the values are not otherwise used in the code visible here. */
1000 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
1002 double sysmis = read_float (r);
1003 double highest = read_float (r);
1004 double lowest = read_float (r);
1006 if (size != 8 || count != 3)
1007 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
1010 if (sysmis != SYSMIS)
1011 sys_warn (r, _("File specifies unexpected value %g as %s."),
1014 if (highest != HIGHEST)
1015 sys_warn (r, _("File specifies unexpected value %g as %s."),
1016 highest, "HIGHEST");
1018 if (lowest != LOWEST)
1019 sys_warn (r, _("File specifies unexpected value %g as %s."),
1023 /* Read record type 7, subtype 7 or 19. */
/* The payload is parsed as text.  Each multiple response set is written as
   a name terminated by '=', a type letter ('C', 'D' or 'E'), for 'E' a
   label source token, for multiple dichotomy sets a counted string holding
   the counted value, a counted string holding the label, and finally a list
   of short variable names separated by spaces and ended by a new-line.
   Malformed input is reported through sys_warn().  Sets with fewer than two
   variables are destroyed; the others are added to DICT. */
1025 read_mrsets (struct sfm_reader *r, size_t size, size_t count,
1026 struct dictionary *dict)
1028 struct text_record *text;
1029 struct mrset *mrset;
1031 text = open_text_record (r, size * count);
1034 const char *name, *label, *counted;
1035 struct stringi_set var_names;
1036 size_t allocated_vars;
1040 mrset = xzalloc (sizeof *mrset);
1042 name = text_get_token (text, ss_cstr ("="), NULL);
1045 mrset->name = xstrdup (name);
/* Type letter: 'C' is a multiple category set; 'D' and 'E' are multiple
   dichotomy sets that differ in where the category labels come from. */
1047 if (text_match (text, 'C'))
1049 mrset->type = MRSET_MC;
1050 if (!text_match (text, ' '))
1052 sys_warn (r, _("Missing space following 'C' at offset %zu "
1053 "in MRSETS record"), text_pos (text));
1057 else if (text_match (text, 'D'))
1059 mrset->type = MRSET_MD;
1060 mrset->cat_source = MRSET_VARLABELS;
1062 else if (text_match (text, 'E'))
1066 mrset->type = MRSET_MD;
1067 mrset->cat_source = MRSET_COUNTEDVALUES;
1068 if (!text_match (text, ' '))
1070 sys_warn (r, _("Missing space following 'E' at offset %zu "
1071 "in MRSETS record"), text_pos (text));
/* After 'E' comes a label source token: "11" means the set label is taken
   from a variable label, "1" is accepted silently, anything else draws a
   warning. */
1075 number = text_get_token (text, ss_cstr (" "), NULL);
1076 if (!strcmp (number, "11"))
1077 mrset->label_from_var_label = true;
1078 else if (strcmp (number, "1"))
1079 sys_warn (r, _("Unexpected label source value \"%s\" "
1080 "following 'E' at offset %zu in MRSETS record"),
1081 number, text_pos (text));
1085 sys_warn (r, _("Missing 'C', 'D', or 'E' at offset %zu "
1086 "in MRSETS record."),
1091 if (mrset->type == MRSET_MD)
1093 counted = text_parse_counted_string (r, text);
1094 if (counted == NULL)
1098 label = text_parse_counted_string (r, text);
1101 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
/* var_names records the names already seen so that duplicates within one set
   can be detected. */
1103 stringi_set_init (&var_names);
1108 struct variable *var;
1109 const char *var_name;
1111 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1112 if (var_name == NULL)
1114 sys_warn (r, _("Missing new-line parsing variable names "
1115 "at offset %zu in MRSETS record."),
1120 var = lookup_var_by_short_name (dict, var_name);
1123 if (!stringi_set_insert (&var_names, var_name))
1125 sys_warn (r, _("Duplicate variable name %s "
1126 "at offset %zu in MRSETS record."),
1127 var_name, text_pos (text));
1131 if (mrset->label == NULL && mrset->label_from_var_label
1132 && var_has_label (var))
1133 mrset->label = xstrdup (var_get_label (var));
1136 && var_get_type (var) != var_get_type (mrset->vars[0]))
1138 sys_warn (r, _("MRSET %s contains both string and "
1139 "numeric variables."), name);
/* The set's width is the narrowest width among its variables. */
1142 width = MIN (width, var_get_width (var));
1144 if (mrset->n_vars >= allocated_vars)
1145 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1146 sizeof *mrset->vars);
1147 mrset->vars[mrset->n_vars++] = var;
1149 while (delimiter != '\n');
1151 if (mrset->n_vars < 2)
1153 sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
1155 mrset_destroy (mrset);
/* For multiple dichotomy sets the counted value is stored either as a number
   parsed with strtod() or as a space-padded string of the set's width; the
   condition choosing between the two is not visible in this listing. */
1159 if (mrset->type == MRSET_MD)
1161 mrset->width = width;
1162 value_init (&mrset->counted, width);
1164 mrset->counted.f = strtod (counted, NULL);
1166 value_copy_str_rpad (&mrset->counted, width,
1167 (const uint8_t *) counted, ' ');
1170 dict_add_mrset (dict, mrset);
1173 mrset_destroy (mrset);
1174 close_text_record (r, text);
/* For each variable in DICT, in dictionary order, the record supplies a
   measurement level, optionally a display width, and an alignment.  A bad
   SIZE or COUNT causes the whole record to be skipped with a warning rather
   than an error. */
1177 /* Read record type 7, subtype 11, which specifies how variables
1178 should be displayed in GUI environments. */
1180 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
1181 struct dictionary *dict)
1184 bool includes_width;
1185 bool warned = false;
1190 sys_warn (r, _("Bad size %zu on extension 11."), size);
1191 skip_bytes (r, size * count);
/* Three integers per variable means a width is included; two per variable
   means it is not.  Any other count is rejected. */
1195 n_vars = dict_get_var_cnt (dict);
1196 if (count == 3 * n_vars)
1197 includes_width = true;
1198 else if (count == 2 * n_vars)
1199 includes_width = false;
1202 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
1204 skip_bytes (r, size * count);
1208 for (i = 0; i < n_vars; ++i)
1210 struct variable *v = dict_get_var (dict, i);
1211 int measure = read_int (r);
1212 int width = includes_width ? read_int (r) : 0;
1213 int align = read_int (r);
1215 /* SPSS 14 sometimes seems to set string variables' measure
1217 if (0 == measure && var_is_alpha (v))
1220 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1223 sys_warn (r, _("Invalid variable display parameters "
1224 "for variable %zu (%s). "
1225 "Default parameters substituted."),
1226 i, var_get_name (v));
1231 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1232 : measure == 2 ? MEASURE_ORDINAL
1234 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1235 : align == 1 ? ALIGN_RIGHT
1238 /* Older versions (SPSS 9.0) sometimes set the display
1239 width to zero. This causes confusion in the GUI, so
1240 only set the width if it is nonzero. */
1242 var_set_display_width (v, width);
/* The payload is parsed as text into (variable, long name) pairs by
   read_variable_to_value_pair().  Each pair renames the variable unless the
   long name is invalid or already in use; both cases produce a warning.
   Finally the reader is flagged as having a long variable name map, which
   stops sfm_open_reader from deriving long names from short ones. */
1246 /* Reads record type 7, subtype 13, which gives the long name
1247 that corresponds to each short name. Modifies variable names
1248 in DICT accordingly. */
1250 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
1251 struct dictionary *dict)
1253 struct text_record *text;
1254 struct variable *var;
1257 text = open_text_record (r, size * count);
1258 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1261 size_t short_name_cnt;
1264 /* Validate long name. */
1265 if (!var_is_valid_name (long_name, false))
1267 sys_warn (r, _("Long variable mapping from %s to invalid "
1268 "variable name `%s'."),
1269 var_get_name (var), long_name);
1273 /* Identify any duplicates. */
/* A long name that matches the variable's own short name, ignoring case, is
   not treated as a duplicate even though a lookup would find it. */
1274 if (strcasecmp (var_get_short_name (var, 0), long_name)
1275 && dict_lookup_var (dict, long_name) != NULL)
1277 sys_warn (r, _("Duplicate long variable name `%s' "
1278 "within system file."), long_name);
1282 /* Renaming a variable may clear its short names, but we
1283 want to retain them, so we save them and re-set them
1285 short_name_cnt = var_get_short_name_cnt (var);
1286 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
1287 for (i = 0; i < short_name_cnt; i++)
1289 const char *s = var_get_short_name (var, i);
1290 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1293 /* Set long name. */
1294 dict_rename_var (dict, var, long_name);
1296 /* Restore short names. */
/* Each saved copy is released as soon as it has been set again.
   NOTE(review): no release of the short_names array itself is visible in
   this listing -- confirm. */
1297 for (i = 0; i < short_name_cnt; i++)
1299 var_set_short_name (var, i, short_names[i]);
1300 free (short_names[i]);
1304 close_text_record (r, text);
1305 r->has_long_var_names = true;
1308 /* Reads record type 7, subtype 14, which gives the real length
1309 of each very long string. Rearranges DICT accordingly. */
1311 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1312 struct dictionary *dict)
1314 struct text_record *text;
1315 struct variable *var;
1318 text = open_text_record (r, size * count);
1319 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1321 size_t idx = var_get_dict_index (var);
1327 length = strtol (length_s, NULL, 10);
1328 if (length < 1 || length > MAX_STRING)
1330 sys_warn (r, _("%s listed as string of invalid length %s "
1331 "in very length string record."),
1332 var_get_name (var), length_s);
1336 /* Check segments. */
1337 segment_cnt = sfm_width_to_segments (length);
1338 if (segment_cnt == 1)
1340 sys_warn (r, _("%s listed in very long string record with width %s, "
1341 "which requires only one segment."),
1342 var_get_name (var), length_s);
1345 if (idx + segment_cnt > dict_get_var_cnt (dict))
1346 sys_error (r, _("Very long string %s overflows dictionary."),
1347 var_get_name (var));
1349 /* Get the short names from the segments and check their
1351 for (i = 0; i < segment_cnt; i++)
1353 struct variable *seg = dict_get_var (dict, idx + i);
1354 int alloc_width = sfm_segment_alloc_width (length, i);
1355 int width = var_get_width (seg);
1358 var_set_short_name (var, i, var_get_short_name (seg, 0));
1359 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1360 sys_error (r, _("Very long string with width %ld has segment %d "
1361 "of width %d (expected %d)"),
1362 length, i, width, alloc_width);
1364 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1365 var_set_width (var, length);
1367 close_text_record (r, text);
1368 dict_compact_values (dict);
1371 /* Reads value labels from sysfile R and inserts them into the
1372 associated dictionary. */
1374 read_value_labels (struct sfm_reader *r,
1375 struct dictionary *dict, struct variable **var_by_value_idx)
1377 struct pool *subpool;
1381 uint8_t raw_value[8]; /* Value as uninterpreted bytes. */
1382 union value value; /* Value. */
1383 char *label; /* Null-terminated label string. */
1386 struct label *labels = NULL;
1387 int label_cnt; /* Number of labels. */
1389 struct variable **var = NULL; /* Associated variables. */
1390 int var_cnt; /* Number of associated variables. */
1391 int max_width; /* Maximum width of string variables. */
1395 subpool = pool_create_subpool (r->pool);
1397 /* Read the type 3 record and record its contents. We can't do
1398 much with the data yet because we don't know whether it is
1399 of numeric or string type. */
1401 /* Read number of labels. */
1402 label_cnt = read_int (r);
1404 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1406 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1411 /* Read each value/label tuple into labels[]. */
1412 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1413 for (i = 0; i < label_cnt; i++)
1415 struct label *label = labels + i;
1416 unsigned char label_len;
1420 read_bytes (r, label->raw_value, sizeof label->raw_value);
1422 /* Read label length. */
1423 read_bytes (r, &label_len, sizeof label_len);
1424 padded_len = ROUND_UP (label_len + 1, 8);
1426 /* Read label, padding. */
1427 label->label = pool_alloc (subpool, padded_len + 1);
1428 read_bytes (r, label->label, padded_len - 1);
1429 label->label[label_len] = 0;
1432 /* Now, read the type 4 record that has the list of variables
1433 to which the value labels are to be applied. */
1435 /* Read record type of type 4 record. */
1436 if (read_int (r) != 4)
1437 sys_error (r, _("Variable index record (type 4) does not immediately "
1438 "follow value label record (type 3) as it should."));
1440 /* Read number of variables associated with value label from type 4
1442 var_cnt = read_int (r);
1443 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1444 sys_error (r, _("Number of variables associated with a value label (%d) "
1445 "is not between 1 and the number of variables (%zu)."),
1446 var_cnt, dict_get_var_cnt (dict));
1448 /* Read the list of variables. */
1449 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1451 for (i = 0; i < var_cnt; i++)
1453 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1454 if (var_get_width (var[i]) > 8)
1455 sys_error (r, _("Value labels may not be added to long string "
1456 "variables (e.g. %s) using records types 3 and 4."),
1457 var_get_name (var[i]));
1458 max_width = MAX (max_width, var_get_width (var[i]));
1461 /* Type check the variables. */
1462 for (i = 1; i < var_cnt; i++)
1463 if (var_get_type (var[i]) != var_get_type (var[0]))
1464 sys_error (r, _("Variables associated with value label are not all of "
1465 "identical type. Variable %s is %s, but variable "
1467 var_get_name (var[0]),
1468 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1469 var_get_name (var[i]),
1470 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1472 /* Fill in labels[].value, now that we know the desired type. */
1473 for (i = 0; i < label_cnt; i++)
1475 struct label *label = labels + i;
1477 value_init_pool (subpool, &label->value, max_width);
1478 if (var_is_alpha (var[0]))
1479 u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
1480 label->raw_value, sizeof label->raw_value, ' ');
1482 label->value.f = float_get_double (r->float_format, label->raw_value);
1485 /* Assign the `value_label's to each variable. */
1486 for (i = 0; i < var_cnt; i++)
1488 struct variable *v = var[i];
1491 /* Add each label to the variable. */
1492 for (j = 0; j < label_cnt; j++)
1494 struct label *label = &labels[j];
1495 if (!var_add_value_label (v, &label->value, label->label))
1497 if (var_is_numeric (var[0]))
1498 sys_warn (r, _("Duplicate value label for %g on %s."),
1499 label->value.f, var_get_name (v));
1501 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1502 max_width, value_str (&label->value, max_width),
1508 pool_destroy (subpool);
1511 /* Reads a set of custom attributes from TEXT into ATTRS.
1512 ATTRS may be a null pointer, in which case the attributes are
1513 read but discarded. */
1515 read_attributes (struct sfm_reader *r, struct text_record *text,
1516 struct attrset *attrs)
1520 struct attribute *attr;
1524 /* Parse the key. */
1525 key = text_get_token (text, ss_cstr ("("), NULL);
1529 attr = attribute_create (key);
1530 for (index = 1; ; index++)
1532 /* Parse the value. */
1536 value = text_get_token (text, ss_cstr ("\n"), NULL);
1539 text_warn (r, text, _("Error parsing attribute value %s[%d]"),
1544 length = strlen (value);
1545 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1547 value[length - 1] = '\0';
1548 attribute_add_value (attr, value + 1);
1553 _("Attribute value %s[%d] is not quoted: %s"),
1555 attribute_add_value (attr, value);
1558 /* Was this the last value for this attribute? */
1559 if (text_match (text, ')'))
1563 attrset_add (attrs, attr);
1565 attribute_destroy (attr);
1567 while (!text_match (text, '/'));
/* Handles record type 7, subtype 17, which carries the custom
   attributes of the data file itself.  The record's SIZE * COUNT
   bytes are read from R as text, and the attributes parsed from
   that text are stored into DICT's attribute set. */
static void
read_data_file_attributes (struct sfm_reader *r,
                           size_t size, size_t count,
                           struct dictionary *dict)
{
  struct text_record *record;
  struct attrset *file_attrs;

  record = open_text_record (r, size * count);
  file_attrs = dict_get_attributes (dict);
  read_attributes (r, record, file_attrs);
  close_text_record (r, record);
}
/* Reads and discards N_LABELS (value, label) pairs from R.  Each
   item of a pair is a 32-bit length followed by that many bytes. */
static void
skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
{
  size_t remaining = n_labels;

  while (remaining-- > 0)
    {
      size_t n_bytes;

      /* The value. */
      n_bytes = read_int (r);
      skip_bytes (r, n_bytes);

      /* Its label. */
      n_bytes = read_int (r);
      skip_bytes (r, n_bytes);
    }
}
1599 read_long_string_value_labels (struct sfm_reader *r,
1600 size_t size, size_t count,
1601 struct dictionary *d)
1603 const off_t start = ftello (r->file);
1604 while (ftello (r->file) - start < size * count)
1606 char var_name[VAR_NAME_LEN + 1];
1614 var_name_len = read_int (r);
1615 if (var_name_len > VAR_NAME_LEN)
1616 sys_error (r, _("Variable name length in long string value label "
1617 "record (%d) exceeds %d-byte limit."),
1618 var_name_len, VAR_NAME_LEN);
1619 read_string (r, var_name, var_name_len + 1);
1620 width = read_int (r);
1621 n_labels = read_int (r);
1623 v = dict_lookup_var (d, var_name);
1626 sys_warn (r, _("Ignoring long string value record for "
1627 "unknown variable %s."), var_name);
1628 skip_long_string_value_labels (r, n_labels);
1631 if (var_is_numeric (v))
1633 sys_warn (r, _("Ignoring long string value record for "
1634 "numeric variable %s."), var_name);
1635 skip_long_string_value_labels (r, n_labels);
1638 if (width != var_get_width (v))
1640 sys_warn (r, _("Ignoring long string value record for variable %s "
1641 "because the record's width (%d) does not match the "
1642 "variable's width (%d)"),
1643 var_name, width, var_get_width (v));
1644 skip_long_string_value_labels (r, n_labels);
1649 value_init_pool (r->pool, &value, width);
1650 for (i = 0; i < n_labels; i++)
1652 size_t value_length, label_length;
1657 value_length = read_int (r);
1658 if (value_length == width)
1659 read_bytes (r, value_str_rw (&value, width), width);
1662 sys_warn (r, _("Ignoring long string value %zu for variable %s, "
1663 "with width %d, that has bad value width %zu."),
1664 i, var_get_name (v), width, value_length);
1665 skip_bytes (r, value_length);
1670 label_length = read_int (r);
1671 read_string (r, label, MIN (sizeof label, label_length + 1));
1672 if (label_length >= sizeof label)
1674 /* Skip and silently ignore label text after the
1675 first 255 bytes. The maximum documented length
1676 of a label is 120 bytes so this is more than
1678 skip_bytes (r, sizeof label - (label_length + 1));
1681 if (!skip && !var_add_value_label (v, &value, label))
1682 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1683 width, value_str (&value, width), var_get_name (v));
/* Handles record type 7, subtype 18, which carries custom
   attributes for individual variables.  The record's SIZE * COUNT
   bytes are read from R as text.  Each entry starts with a short
   variable name terminated by a colon, followed by that variable's
   attributes.  When the name lookup leaves no variable, the
   attributes are still parsed but are discarded. */
static void
read_variable_attributes (struct sfm_reader *r,
                          size_t size, size_t count,
                          struct dictionary *dict)
{
  struct text_record *record = open_text_record (r, size * count);
  struct variable *var;

  while (text_read_short_name (r, dict, record, ss_cstr (":"), &var))
    {
      struct attrset *attrs = var != NULL ? var_get_attributes (var) : NULL;
      read_attributes (r, record, attrs);
    }
  close_text_record (r, record);
}
1710 static void partial_record (struct sfm_reader *r)
1713 static void read_error (struct casereader *, const struct sfm_reader *);
1715 static bool read_case_number (struct sfm_reader *, double *);
1716 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1717 static int read_opcode (struct sfm_reader *);
1718 static bool read_compressed_number (struct sfm_reader *, double *);
1719 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1720 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1721 static bool skip_whole_strings (struct sfm_reader *, size_t);
1723 /* Reads and returns one case from READER's file. Returns a null
1724 pointer if not successful. */
1725 static struct ccase *
1726 sys_file_casereader_read (struct casereader *reader, void *r_)
1728 struct sfm_reader *r = r_;
1729 struct ccase *volatile c;
1735 c = case_create (r->proto);
1736 if (setjmp (r->bail_out))
1738 casereader_force_error (reader);
1743 for (i = 0; i < r->sfm_var_cnt; i++)
1745 struct sfm_var *sv = &r->sfm_vars[i];
1746 union value *v = case_data_rw_idx (c, sv->case_index);
1748 if (sv->var_width == 0)
1750 if (!read_case_number (r, &v->f))
1755 uint8_t *s = value_str_rw (v, sv->var_width);
1756 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1758 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1768 if (r->case_cnt != -1)
1769 read_error (reader, r);
1773 /* Issues an error that R ends in a partial record. */
1775 partial_record (struct sfm_reader *r)
1777 sys_error (r, _("File ends in partial case."));
1780 /* Issues an error that an unspecified error occurred on SFM, and
1783 read_error (struct casereader *r, const struct sfm_reader *sfm)
1785 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1786 casereader_force_error (r);
1789 /* Reads a number from R and stores its value in *D.
1790 If R is compressed, reads a compressed number;
1791 otherwise, reads a number in the regular way.
1792 Returns true if successful, false if end of file is
1793 reached immediately. */
1795 read_case_number (struct sfm_reader *r, double *d)
1800 if (!try_read_bytes (r, number, sizeof number))
1802 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1806 return read_compressed_number (r, d);
1809 /* Reads LENGTH string bytes from R into S.
1810 Always reads a multiple of 8 bytes; if LENGTH is not a
1811 multiple of 8, then extra bytes are read and discarded without
1813 Reads compressed strings if R is compressed.
1814 Returns true if successful, false if end of file is
1815 reached immediately. */
1817 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
1819 size_t whole = ROUND_DOWN (length, 8);
1820 size_t partial = length % 8;
1824 if (!read_whole_strings (r, s, whole))
1831 if (!read_whole_strings (r, bounce, sizeof bounce))
1837 memcpy (s + whole, bounce, partial);
1843 /* Reads and returns the next compression opcode from R. */
1845 read_opcode (struct sfm_reader *r)
1847 assert (r->compressed);
1851 if (r->opcode_idx >= sizeof r->opcodes)
1853 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1857 opcode = r->opcodes[r->opcode_idx++];
1864 /* Reads a compressed number from R and stores its value in D.
1865 Returns true if successful, false if end of file is
1866 reached immediately. */
1868 read_compressed_number (struct sfm_reader *r, double *d)
1870 int opcode = read_opcode (r);
1878 *d = read_float (r);
1882 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1883 if (!r->corruption_warning)
1885 r->corruption_warning = true;
1886 sys_warn (r, _("Possible compressed data corruption: "
1887 "compressed spaces appear in numeric field."));
1896 *d = opcode - r->bias;
1903 /* Reads a compressed 8-byte string segment from R and stores it
1905 Returns true if successful, false if end of file is
1906 reached immediately. */
1908 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
1910 int opcode = read_opcode (r);
1918 read_bytes (r, dst, 8);
1922 memset (dst, ' ', 8);
1927 double value = opcode - r->bias;
1928 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1931 /* This has actually been seen "in the wild". The submitter of the
1932 file that showed that the contents decoded as spaces, but they
1933 were at the end of the field so it's possible that the null
1934 bytes just acted as null terminators. */
1936 else if (!r->corruption_warning)
1938 r->corruption_warning = true;
1939 sys_warn (r, _("Possible compressed data corruption: "
1940 "string contains compressed integer (opcode %d)"),
1950 /* Reads LENGTH string bytes from R into S.
1951 LENGTH must be a multiple of 8.
1952 Reads compressed strings if R is compressed.
1953 Returns true if successful, false if end of file is
1954 reached immediately. */
1956 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
1958 assert (length % 8 == 0);
1960 return try_read_bytes (r, s, length);
1964 for (ofs = 0; ofs < length; ofs += 8)
1965 if (!read_compressed_string (r, s + ofs))
/* Skips LENGTH string bytes from R.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, but that's only because the
   current caller never needs more than that many bytes.)
   Returns true if successful, false if end of file is
   reached immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];

  /* A LENGTH of exactly sizeof buffer still fits, so the limit is
     inclusive, as the comment above states. */
  assert (length <= sizeof buffer);
  return read_whole_strings (r, buffer, length);
}
1989 /* Creates and returns a table that can be used for translating a value
1990 index into a case to a "struct variable *" for DICT. Multiple
1991 system file fields reference variables this way.
1993 This table must be created before processing the very long
1994 string extension record, because that record causes some
1995 values to be deleted from the case and the dictionary to be
1997 static struct variable **
1998 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
2000 struct variable **var_by_value_idx;
2004 var_by_value_idx = pool_nmalloc (r->pool,
2005 r->oct_cnt, sizeof *var_by_value_idx);
2006 for (i = 0; i < dict_get_var_cnt (dict); i++)
2008 struct variable *v = dict_get_var (dict, i);
2009 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
2012 var_by_value_idx[value_idx++] = v;
2013 for (j = 1; j < nv; j++)
2014 var_by_value_idx[value_idx++] = NULL;
2016 assert (value_idx == r->oct_cnt);
2018 return var_by_value_idx;
2021 /* Returns the "struct variable" corresponding to the given
2022 1-based VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
2024 static struct variable *
2025 lookup_var_by_value_idx (struct sfm_reader *r,
2026 struct variable **var_by_value_idx, int value_idx)
2028 struct variable *var;
2030 if (value_idx < 1 || value_idx > r->oct_cnt)
2031 sys_error (r, _("Variable index %d not in valid range 1...%d."),
2032 value_idx, r->oct_cnt);
2034 var = var_by_value_idx[value_idx - 1];
2036 sys_error (r, _("Variable index %d refers to long string "
/* Searches D for a variable whose first short name equals
   SHORT_NAME, ignoring case.  Returns that variable, or a null
   pointer if no variable in D has that short name. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  struct variable *candidate;
  size_t n_vars;
  size_t idx;

  /* Fast path: look the name up as a full variable name and accept
     the hit only if its short name matches too. */
  candidate = dict_lookup_var (d, short_name);
  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
    return candidate;

  /* Slow path: compare against every variable's short name. */
  n_vars = dict_get_var_cnt (d);
  idx = 0;
  while (idx < n_vars)
    {
      candidate = dict_get_var (d, idx++);
      if (strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
        return candidate;
    }

  return NULL;
}
2069 /* Helpers for reading records that contain structured text
2072 /* Maximum number of warnings to issue for a single text
2074 #define MAX_TEXT_WARNINGS 5
2079 struct substring buffer; /* Record contents. */
2080 size_t pos; /* Current position in buffer. */
2081 int n_warnings; /* Number of warnings issued or suppressed. */
2084 /* Reads SIZE bytes into a text record for R,
2085 and returns the new text record. */
2086 static struct text_record *
2087 open_text_record (struct sfm_reader *r, size_t size)
2089 struct text_record *text = pool_alloc (r->pool, sizeof *text);
2090 char *buffer = pool_malloc (r->pool, size + 1);
2091 read_bytes (r, buffer, size);
2092 text->buffer = ss_buffer (buffer, size);
2094 text->n_warnings = 0;
2098 /* Closes TEXT, frees its storage, and issues a final warning
2099 about suppressed warnings if necesary. */
2101 close_text_record (struct sfm_reader *r, struct text_record *text)
2103 if (text->n_warnings > MAX_TEXT_WARNINGS)
2104 sys_warn (r, _("Suppressed %d additional related warnings."),
2105 text->n_warnings - MAX_TEXT_WARNINGS);
2106 pool_free (r->pool, ss_data (text->buffer));
2109 /* Reads a variable=value pair from TEXT.
2110 Looks up the variable in DICT and stores it into *VAR.
2111 Stores a null-terminated value into *VALUE. */
2113 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2114 struct text_record *text,
2115 struct variable **var, char **value)
2119 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2122 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2126 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2127 ss_buffer ("\t\0", 2));
2135 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2136 struct text_record *text, struct substring delimiters,
2137 struct variable **var)
2139 char *short_name = text_get_token (text, delimiters, NULL);
2140 if (short_name == NULL)
2143 *var = lookup_var_by_short_name (dict, short_name);
2145 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2150 /* Displays a warning for the current file position, limiting the
2151 number to MAX_TEXT_WARNINGS for TEXT. */
2153 text_warn (struct sfm_reader *r, struct text_record *text,
2154 const char *format, ...)
2156 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2160 va_start (args, format);
2161 sys_msg (r, MW, format, args);
2167 text_get_token (struct text_record *text, struct substring delimiters,
2170 struct substring token;
2173 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2176 end = &ss_data (token)[ss_length (token)];
2177 if (delimiter != NULL)
2180 return ss_data (token);
2183 /* Reads an integer value expressed in decimal, then a space, then a string that
2184 consists of exactly as many bytes as specified by the integer, then a space,
2185 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2186 buffer (so the caller should not free the string). */
2188 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2198 int c = text->buffer.string[text->pos];
2199 if (c < '0' || c > '9')
2201 n = (n * 10) + (c - '0');
2204 if (start == text->pos)
2206 sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
2211 if (!text_match (text, ' '))
2213 sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
2218 if (text->pos + n > text->buffer.length)
2220 sys_warn (r, _("%zu-byte string starting at offset %zu "
2221 "exceeds record length %zu."),
2222 n, text->pos, text->buffer.length);
2226 s = &text->buffer.string[text->pos];
2230 _("Expecting space at offset %zu following %zu-byte string."),
2240 text_match (struct text_record *text, char c)
2242 if (text->buffer.string[text->pos] == c)
2251 /* Returns the current byte offset inside the TEXT's string. */
2253 text_pos (const struct text_record *text)
2260 /* Displays a corruption message. */
2262 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
2267 ds_init_empty (&text);
2268 ds_put_format (&text, "\"%s\" near offset 0x%llx: ",
2269 fh_get_file_name (r->fh), (long long int) ftello (r->file));
2270 ds_put_vformat (&text, format, args);
2272 m.category = msg_class_to_category (class);
2273 m.severity = msg_class_to_severity (class);
2274 m.where.file_name = NULL;
2275 m.where.line_number = 0;
2276 m.text = ds_cstr (&text);
2281 /* Displays a warning for the current file position. */
2283 sys_warn (struct sfm_reader *r, const char *format, ...)
2287 va_start (args, format);
2288 sys_msg (r, MW, format, args);
2292 /* Displays an error for the current file position,
2293 marks it as in an error state,
2294 and aborts reading it using longjmp. */
2296 sys_error (struct sfm_reader *r, const char *format, ...)
2300 va_start (args, format);
2301 sys_msg (r, ME, format, args);
2305 longjmp (r->bail_out, 1);
2308 /* Reads BYTE_CNT bytes into BUF.
2309 Returns true if exactly BYTE_CNT bytes are successfully read.
2310 Aborts if an I/O error or a partial read occurs.
2311 If EOF_IS_OK, then an immediate end-of-file causes false to be
2312 returned; otherwise, immediate end-of-file causes an abort
2315 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2316 void *buf, size_t byte_cnt)
2318 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2319 if (bytes_read == byte_cnt)
2321 else if (ferror (r->file))
2322 sys_error (r, _("System error: %s."), strerror (errno));
2323 else if (!eof_is_ok || bytes_read != 0)
2324 sys_error (r, _("Unexpected end of file."));
2329 /* Reads BYTE_CNT into BUF.
2330 Aborts upon I/O error or if end-of-file is encountered. */
2332 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2334 read_bytes_internal (r, false, buf, byte_cnt);
2337 /* Reads BYTE_CNT bytes into BUF.
2338 Returns true if exactly BYTE_CNT bytes are successfully read.
2339 Returns false if an immediate end-of-file is encountered.
2340 Aborts if an I/O error or a partial read occurs. */
2342 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2344 return read_bytes_internal (r, true, buf, byte_cnt);
2347 /* Reads a 32-bit signed integer from R and returns its value in
2350 read_int (struct sfm_reader *r)
2353 read_bytes (r, integer, sizeof integer);
2354 return integer_get (r->integer_format, integer, sizeof integer);
2357 /* Reads a 64-bit floating-point number from R and returns its
2358 value in host format. */
2360 read_float (struct sfm_reader *r)
2363 read_bytes (r, number, sizeof number);
2364 return float_get_double (r->float_format, number);
/* Fills BUFFER, which has room for SIZE bytes, with a
   null-terminated string: SIZE - 1 bytes read from R followed by a
   null byte.  SIZE must be positive. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t n_text;

  assert (size > 0);
  n_text = size - 1;
  read_bytes (r, buffer, n_text);
  buffer[n_text] = '\0';
}
/* Advances R by BYTES bytes by reading them into a scratch buffer,
   at most one buffer's worth at a time, and discarding them. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t left;

  for (left = bytes; left > 0; )
    {
      size_t n = left < sizeof scratch ? left : sizeof scratch;
      read_bytes (r, scratch, n);
      left -= n;
    }
}
2390 static const struct casereader_class sys_file_casereader_class =
2392 sys_file_casereader_read,
2393 sys_file_casereader_destroy,