1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/i18n.h"
46 #include "libpspp/message.h"
47 #include "libpspp/misc.h"
48 #include "libpspp/pool.h"
49 #include "libpspp/str.h"
50 #include "libpspp/stringi-set.h"
52 #include "gl/c-ctype.h"
53 #include "gl/inttostr.h"
54 #include "gl/minmax.h"
55 #include "gl/unlocked-io.h"
56 #include "gl/xalloc.h"
60 #define _(msgid) gettext (msgid)
61 #define N_(msgid) (msgid)
63 /* System file reader. */
66 /* Resource tracking. */
67 struct pool *pool; /* All system file state. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
71 struct file_handle *fh; /* File handle. */
72 struct fh_lock *lock; /* Mutual exclusion for file handle. */
73 FILE *file; /* File stream. */
74 bool error; /* I/O or corruption error? */
75 struct caseproto *proto; /* Format of output cases. */
78 enum integer_format integer_format; /* On-disk integer format. */
79 enum float_format float_format; /* On-disk floating point format. */
80 int oct_cnt; /* Number of 8-byte units per case. */
81 struct sfm_var *sfm_vars; /* Variables. */
82 size_t sfm_var_cnt; /* Number of variables. */
83 casenumber case_cnt; /* Number of cases */
84 bool has_long_var_names; /* File has a long variable name map */
87 bool compressed; /* File is compressed? */
88 double bias; /* Compression bias, usually 100.0. */
89 uint8_t opcodes[8]; /* Current block of opcodes. */
90 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 bool corruption_warning; /* Warned about possible corruption? */
94 static const struct casereader_class sys_file_casereader_class;
96 static bool close_reader (struct sfm_reader *);
98 static struct variable **make_var_by_value_idx (struct sfm_reader *,
100 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
103 static struct variable *lookup_var_by_short_name (struct dictionary *,
104 const char *short_name);
106 static void sys_msg (struct sfm_reader *r, int class,
107 const char *format, va_list args)
108 PRINTF_FORMAT (3, 0);
109 static void sys_warn (struct sfm_reader *, const char *, ...)
110 PRINTF_FORMAT (2, 3);
111 static void sys_error (struct sfm_reader *, const char *, ...)
115 static void read_bytes (struct sfm_reader *, void *, size_t);
116 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
117 static int read_int (struct sfm_reader *);
118 static double read_float (struct sfm_reader *);
119 static void read_string (struct sfm_reader *, char *, size_t);
120 static void skip_bytes (struct sfm_reader *, size_t);
122 static struct text_record *open_text_record (struct sfm_reader *, size_t size);
123 static void close_text_record (struct sfm_reader *r,
124 struct text_record *);
125 static bool read_variable_to_value_pair (struct sfm_reader *,
127 struct text_record *,
128 struct variable **var, char **value);
129 static void text_warn (struct sfm_reader *r, struct text_record *text,
130 const char *format, ...)
131 PRINTF_FORMAT (3, 4);
132 static char *text_get_token (struct text_record *,
133 struct substring delimiters, char *delimiter);
134 static bool text_match (struct text_record *, char c);
135 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
136 struct text_record *,
137 struct substring delimiters,
139 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
140 struct text_record *,
141 struct substring delimiters,
143 static const char *text_parse_counted_string (struct sfm_reader *,
144 struct text_record *);
145 static size_t text_pos (const struct text_record *);
147 static bool close_reader (struct sfm_reader *r);
149 /* Dictionary reader. */
157 static void read_header (struct sfm_reader *, struct dictionary *,
158 int *weight_idx, int *claimed_oct_cnt,
159 struct sfm_read_info *);
160 static void read_variable_record (struct sfm_reader *, struct dictionary *,
161 int *format_warning_cnt);
162 static void parse_format_spec (struct sfm_reader *, unsigned int,
163 enum which_format, struct variable *,
164 int *format_warning_cnt);
165 static void setup_weight (struct sfm_reader *, int weight_idx,
166 struct variable **var_by_value_idx,
167 struct dictionary *);
168 static void read_documents (struct sfm_reader *, struct dictionary *);
169 static void read_value_labels (struct sfm_reader *, struct dictionary *,
170 struct variable **var_by_value_idx);
172 static void read_extension_record (struct sfm_reader *, struct dictionary *,
173 struct sfm_read_info *);
174 static void read_machine_integer_info (struct sfm_reader *,
175 size_t size, size_t count,
176 struct sfm_read_info *,
179 static void read_machine_float_info (struct sfm_reader *,
180 size_t size, size_t count);
181 static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
182 struct dictionary *);
183 static void read_display_parameters (struct sfm_reader *,
184 size_t size, size_t count,
185 struct dictionary *);
186 static void read_long_var_name_map (struct sfm_reader *,
187 size_t size, size_t count,
188 struct dictionary *);
189 static void read_long_string_map (struct sfm_reader *,
190 size_t size, size_t count,
191 struct dictionary *);
192 static void read_data_file_attributes (struct sfm_reader *,
193 size_t size, size_t count,
194 struct dictionary *);
195 static void read_variable_attributes (struct sfm_reader *,
196 size_t size, size_t count,
197 struct dictionary *);
198 static void read_long_string_value_labels (struct sfm_reader *,
199 size_t size, size_t count,
200 struct dictionary *);
202 /* Recodes the strings attached to DICT's variables -- variable names,
   variable labels and value labels -- to UTF8 with recode_string(),
   taking the source encoding from dict_get_encoding (DICT). */
204 recode_strings (struct dictionary *dict)
208 const char *enc = dict_get_encoding (dict);
/* Presumably the fallback used when DICT reports no encoding of its
   own -- TODO confirm against the guarding condition. */
211 enc = get_default_encoding ();
213 for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
215 /* Convert the long variable name */
216 struct variable *var = dict_get_var (dict, i);
217 const char *native_name = var_get_name (var);
218 char *utf8_name = recode_string (UTF8, enc, native_name, -1);
/* Rename only when recoding actually changed the name, and only when
   no variable in DICT already carries the recoded name; the message
   text below covers the case where the recoded name is taken. */
219 if ( 0 != strcmp (utf8_name, native_name))
221 if ( NULL == dict_lookup_var (dict, utf8_name))
222 dict_rename_var (dict, var, utf8_name);
225 _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
230 /* Convert the variable label */
231 if (var_has_label (var))
233 char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
234 var_set_label (var, utf8_label);
/* Convert the value labels: each label's text is recoded and then put
   back on VAR under the same value. */
238 if (var_has_value_labels (var))
240 const struct val_lab *vl = NULL;
241 const struct val_labs *vlabs = var_get_value_labels (var);
/* NOTE(review): labels are replaced while VLABS is being iterated;
   presumably var_replace_value_label() keeps the iteration valid
   because the value key is unchanged -- confirm. */
243 for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
245 const union value *val = val_lab_get_value (vl);
246 const char *label = val_lab_get_label (vl);
247 char *new_label = NULL;
249 new_label = recode_string (UTF8, enc, label, -1);
251 var_replace_value_label (var, val, new_label);
258 /* Opens the system file designated by file handle FH for
259 reading. Reads the system file's dictionary into *DICT.
260 If INFO is non-null, then it receives additional info about the system file. */
263 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
264 struct sfm_read_info *volatile info)
266 struct sfm_reader *volatile r = NULL;
267 struct variable **var_by_value_idx;
268 struct sfm_read_info local_info;
269 int format_warning_cnt = 0;
274 *dict = dict_create ();
276 /* Create and initialize reader. */
277 r = pool_create_container (struct sfm_reader, pool);
283 r->has_long_var_names = false;
284 r->opcode_idx = sizeof r->opcodes;
285 r->corruption_warning = false;
287 /* TRANSLATORS: this fragment will be interpolated into
288 messages in fh_lock() that identify types of files. */
289 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
293 r->file = fn_open (fh_get_file_name (fh), "rb");
296 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
297 fh_get_file_name (r->fh), strerror (errno));
301 /* Initialize info. */
304 memset (info, 0, sizeof *info);
306 if (setjmp (r->bail_out))
311 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
313 /* Read all the variable definition records. */
314 rec_type = read_int (r);
315 while (rec_type == 2)
317 read_variable_record (r, *dict, &format_warning_cnt);
318 rec_type = read_int (r);
321 /* Figure out the case format. */
322 var_by_value_idx = make_var_by_value_idx (r, *dict);
323 setup_weight (r, weight_idx, var_by_value_idx, *dict);
325 /* Read all the rest of the dictionary records. */
326 while (rec_type != 999)
331 read_value_labels (r, *dict, var_by_value_idx);
335 sys_error (r, _("Misplaced type 4 record."));
338 read_documents (r, *dict);
342 read_extension_record (r, *dict, info);
346 sys_error (r, _("Unrecognized record type %d."), rec_type);
348 rec_type = read_int (r);
352 if ( ! r->has_long_var_names )
355 for (i = 0; i < dict_get_var_cnt (*dict); i++)
357 struct variable *var = dict_get_var (*dict, i);
358 char short_name[SHORT_NAME_LEN + 1];
359 char long_name[SHORT_NAME_LEN + 1];
361 strcpy (short_name, var_get_name (var));
363 strcpy (long_name, short_name);
364 str_lowercase (long_name);
366 /* Set long name. Renaming a variable may clear the short
367 name, but we want to retain it, so re-set it afterward. */
369 dict_rename_var (*dict, var, long_name);
370 var_set_short_name (var, 0, short_name);
373 r->has_long_var_names = true;
376 recode_strings (*dict);
378 /* Read record 999 data, which is just filler. */
381 /* Warn if the actual amount of data per case differs from the
382 amount that the header claims. SPSS version 13 gets this
383 wrong when very long strings are involved, so don't warn for version 13 files. */
385 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
386 && info->version_major != 13)
387 sys_warn (r, _("File header claims %d variable positions but "
388 "%d were read from file."),
389 claimed_oct_cnt, r->oct_cnt);
391 /* Create an index of dictionary variable widths for
392 sfm_read_case to use. We cannot use the `struct variable's
393 from the dictionary we created, because the caller owns the
394 dictionary and may destroy or modify its variables. */
395 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
396 pool_register (r->pool, free, r->sfm_vars);
397 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
399 pool_free (r->pool, var_by_value_idx);
400 return casereader_create_sequential
402 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
403 &sys_file_casereader_class, r);
407 dict_destroy (*dict);
412 /* Closes a system file after we're done with it.
413 Returns true if an I/O error has occurred on READER, false otherwise. */
416 close_reader (struct sfm_reader *r)
/* A failed close of the underlying file is reported to the user
   together with the system error text for errno. */
425 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
427 msg (ME, _("Error closing system file `%s': %s."),
428 fh_get_file_name (r->fh), strerror (errno));
/* Destroys R's pool.  NOTE(review): sfm_open_reader creates R with
   pool_create_container(), so R itself presumably lives in this pool
   and must not be used after this call -- confirm. */
438 pool_destroy (r->pool);
443 /* Destroys READER.  The casereader argument itself is unused; the
   work is done on the reader state passed as R_. */
445 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
/* R_ is untyped; presumably it is the struct sfm_reader that
   sfm_open_reader handed to casereader_create_sequential() -- confirm. */
447 struct sfm_reader *r = r_;
451 /* Returns true if FILE is an SPSS system file, false otherwise. */
454 sfm_detect (FILE *file)
/* Read the first 4 bytes of FILE as one 4-byte item; fread() returns
   something other than 1 if they are not all available. */
458 if (fread (rec_type, 4, 1, file) != 1)
/* "$FL2" is the same record type string that read_header() insists on.
   NOTE(review): strcmp() needs REC_TYPE to be null-terminated after the
   4 bytes read; the terminating store is not visible here -- confirm. */
462 return !strcmp ("$FL2", rec_type);
465 /* Reads the global header of the system file.
466 Sets DICT's file label to the system file's label.
467 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
468 or to the value index of the weight variable otherwise.
469 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
470 per case that the file claims to have (although that claim is not necessarily correct).
472 Initializes INFO with header information. */
474 read_header (struct sfm_reader *r, struct dictionary *dict,
475 int *weight_idx, int *claimed_oct_cnt,
476 struct sfm_read_info *info)
479 char eye_catcher[61];
480 uint8_t raw_layout_code[4];
482 char creation_date[10];
483 char creation_time[9];
485 struct substring file_label_ss;
486 struct substring product;
488 read_string (r, rec_type, sizeof rec_type);
489 read_string (r, eye_catcher, sizeof eye_catcher);
/* The record type must be exactly "$FL2"; anything else is rejected
   through sys_error. */
491 if (strcmp ("$FL2", rec_type) != 0)
492 sys_error (r, _("This is not an SPSS system file."));
494 /* Identify integer format. */
495 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
496 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
498 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
500 || (r->integer_format != INTEGER_MSB_FIRST
501 && r->integer_format != INTEGER_LSB_FIRST))
502 sys_error (r, _("This is not an SPSS system file."));
/* A negative or implausibly large claimed count is replaced by -1;
   sfm_open_reader skips its claimed-versus-actual warning when the
   claimed count is -1. */
504 *claimed_oct_cnt = read_int (r);
505 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
506 *claimed_oct_cnt = -1;
508 r->compressed = read_int (r) != 0;
510 *weight_idx = read_int (r);
/* Implausibly large case counts are special-cased below.
   sfm_open_reader treats a case count of -1 as "unknown" and passes
   CASENUMBER_MAX to the casereader instead. */
512 r->case_cnt = read_int (r);
513 if ( r->case_cnt > INT_MAX / 2)
516 /* Identify floating-point format and obtain compression bias. */
517 read_bytes (r, raw_bias, sizeof raw_bias);
518 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
520 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
522 if (memcmp (raw_bias, zero_bias, 8))
523 sys_warn (r, _("Compression bias is not the usual "
524 "value of 100, or system file uses unrecognized "
525 "floating-point format."));
528 /* Some software is known to write all-zeros to this
529 field. Such software also writes floating-point
530 numbers in the format that we expect by default
531 (it seems that all software most likely does, in
532 reality), so don't warn in this case. */
/* Choose the IEEE double format whose byte order matches the integer
   format identified above. */
535 if (r->integer_format == INTEGER_MSB_FIRST)
536 r->float_format = FLOAT_IEEE_DOUBLE_BE;
538 r->float_format = FLOAT_IEEE_DOUBLE_LE;
540 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
542 read_string (r, creation_date, sizeof creation_date);
543 read_string (r, creation_time, sizeof creation_time);
544 read_string (r, file_label, sizeof file_label);
/* Trim spaces from the file label; a label that is empty after
   trimming is not stored in DICT. */
547 file_label_ss = ss_cstr (file_label);
548 ss_trim (&file_label_ss, ss_cstr (" "));
549 if (!ss_is_empty (file_label_ss))
/* Terminate at the trimmed length so the substring can be handed to
   dict_set_label() as a C string. */
551 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
552 dict_set_label (dict, ss_data (file_label_ss));
/* Copy the header fields gathered above into INFO. */
555 strcpy (info->creation_date, creation_date);
556 strcpy (info->creation_time, creation_time);
557 info->integer_format = r->integer_format;
558 info->float_format = r->float_format;
559 info->compressed = r->compressed;
560 info->case_cnt = r->case_cnt;
/* The product name is the eye-catcher with the fixed
   "@(#) SPSS DATA FILE" prefix (when present) and surrounding spaces
   removed, truncated to fit INFO's product field. */
562 product = ss_cstr (eye_catcher);
563 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
564 ss_trim (&product, ss_cstr (" "));
565 str_copy_buf_trunc (info->product, sizeof info->product,
566 ss_data (product), ss_length (product));
569 /* Reads a variable (type 2) record from R and adds the
570 corresponding variable to DICT.
571 Also skips past additional variable records for long string variables. */
574 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
575 int *format_warning_cnt)
578 int has_variable_label;
579 int missing_value_code;
584 struct variable *var;
/* Fixed part of the record: five integers followed by the name. */
587 width = read_int (r);
588 has_variable_label = read_int (r);
589 missing_value_code = read_int (r);
590 print_format = read_int (r);
591 write_format = read_int (r);
592 read_string (r, name, sizeof name);
/* Cut the name at its first space, dropping any space padding. */
593 name[strcspn (name, " ")] = '\0';
595 /* Check variable name. */
596 if (name[0] == '$' || name[0] == '#')
597 sys_error (r, _("Variable name begins with invalid character `%c'."),
599 if (!var_is_plausible_name (name, false))
600 sys_error (r, _("Invalid variable name `%s'."), name);
602 /* Create variable. */
603 if (width < 0 || width > 255)
604 sys_error (r, _("Bad width %d for variable %s."), width, name);
605 var = dict_create_var (dict, name, width);
607 sys_error (r, _("Duplicate variable name `%s'."), name);
609 /* Set the short name the same as the long name. */
610 var_set_short_name (var, 0, var_get_name (var));
612 /* Get variable label, if any. */
613 if (has_variable_label != 0 && has_variable_label != 1)
614 sys_error (r, _("Variable label indicator field is not 0 or 1."));
615 if (has_variable_label == 1)
617 size_t len, read_len;
622 /* Read up to 255 bytes of label. */
623 read_len = MIN (sizeof label - 1, len);
624 read_string (r, label, read_len + 1);
625 var_set_label (var, label);
627 /* Skip unread label bytes. */
628 skip_bytes (r, len - read_len);
630 /* Skip label padding up to multiple of 4 bytes. */
631 skip_bytes (r, ROUND_UP (len, 4) - len);
634 /* Set missing values. */
635 if (missing_value_code != 0)
637 struct missing_values mv;
640 mv_init_pool (r->pool, &mv, var_get_width (var));
641 if (var_is_numeric (var))
643 if (missing_value_code < -3 || missing_value_code > 3
644 || missing_value_code == -1)
645 sys_error (r, _("Numeric missing value indicator field is not "
646 "-3, -2, 0, 1, 2, or 3."));
647 if (missing_value_code < 0)
649 double low = read_float (r);
650 double high = read_float (r);
651 mv_add_range (&mv, low, high);
/* A negative code means a range was present: -2 leaves 0 further
   discrete values to read and -3 leaves 1. */
652 missing_value_code = -missing_value_code - 2;
654 for (i = 0; i < missing_value_code; i++)
655 mv_add_num (&mv, read_float (r));
/* Each string missing value is read as 8 bytes, so the scratch value
   is made at least 8 bytes wide. */
659 int mv_width = MAX (width, 8);
662 if (missing_value_code < 1 || missing_value_code > 3)
663 sys_error (r, _("String missing value indicator field is not "
666 value_init (&value, mv_width);
667 value_set_missing (&value, mv_width);
668 for (i = 0; i < missing_value_code; i++)
670 uint8_t *s = value_str_rw (&value, mv_width);
671 read_bytes (r, s, 8);
674 value_destroy (&value, mv_width);
676 var_set_missing_values (var, &mv);
/* Both format words go through the same translation; the shared
   counter limits how many invalid-format warnings get issued. */
680 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
681 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
683 /* Account for values.
684 Skip long string continuation records, if any. */
/* Width 0 (presumably a numeric variable) counts as one 8-byte value;
   otherwise one value per 8 bytes of width, rounded up. */
685 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
/* The loop starts at 1 because the record just read accounts for the
   first value; each further value needs its own continuation record. */
691 for (i = 1; i < nv; i++)
693 /* Check for record type 2 and width -1. */
694 if (read_int (r) != 2 || read_int (r) != -1)
695 sys_error (r, _("Missing string continuation record."));
697 /* Skip and ignore remaining continuation data. */
698 has_variable_label = read_int (r);
699 missing_value_code = read_int (r);
700 print_format = read_int (r);
701 write_format = read_int (r);
702 read_string (r, name, sizeof name);
704 /* Variable label fields on continuation records have
705 been spotted in system files created by "SPSS Power
706 Macintosh Release 6.1". */
707 if (has_variable_label)
708 skip_bytes (r, ROUND_UP (read_int (r), 4));
713 /* Translates the format spec from sysfile format to internal format. */
716 parse_format_spec (struct sfm_reader *r, unsigned int s,
717 enum which_format which, struct variable *v,
718 int *format_warning_cnt)
/* Upper limit on the invalid-format warnings issued through
   *FORMAT_WARNING_CNT, which sfm_open_reader starts at 0. */
720 const int max_format_warnings = 8;
/* The raw format type is the byte of S starting at bit 16; the uint8_t
   conversion discards any higher bits. */
722 uint8_t raw_type = s >> 16;
728 if (!fmt_from_io (raw_type, &f.type))
729 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
/* The format must be a valid output format and must also be compatible
   with V's width. */
734 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
739 if (which == PRINT_FORMAT)
740 var_set_print_format (v, &f);
742 var_set_write_format (v, &f);
744 else if (++*format_warning_cnt <= max_format_warnings)
746 char fmt_string[FMT_STRING_LEN_MAX + 1];
747 sys_warn (r, _("%s variable %s has invalid %s format %s."),
748 var_is_numeric (v) ? _("Numeric") : _("String"),
750 which == PRINT_FORMAT ? _("print") : _("write"),
751 fmt_to_string (&f, fmt_string));
/* The warning that reaches the limit is followed by a notice that
   later ones will be suppressed; the counter keeps increasing, so the
   notice is issued only once. */
753 if (*format_warning_cnt == max_format_warnings)
754 sys_warn (r, _("Suppressing further invalid format warnings."));
758 /* Sets the weighting variable in DICT to the variable
759 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is nonzero. */
762 setup_weight (struct sfm_reader *r, int weight_idx,
763 struct variable **var_by_value_idx, struct dictionary *dict)
/* Translate the value index from the file header into the variable
   that occupies that position. */
767 struct variable *weight_var
768 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
/* Only a numeric variable may be the weight; a string variable is
   reported through sys_error with the variable's name. */
769 if (var_is_numeric (weight_var))
770 dict_set_weight (dict, weight_var);
772 sys_error (r, _("Weighting variable must be numeric "
773 "(not string variable `%s')."),
774 var_get_name (weight_var));
778 /* Reads a document record, type 6, from system file R, and sets up
779 the documents and n_documents fields in the associated dictionary. */
782 read_documents (struct sfm_reader *r, struct dictionary *dict)
/* At most one document record is accepted per dictionary. */
787 if (dict_get_documents (dict) != NULL)
788 sys_error (r, _("Multiple type 6 (document) records."));
790 line_cnt = read_int (r);
792 sys_error (r, _("Number of document lines (%d) "
793 "must be greater than 0."), line_cnt);
/* Space for LINE_CNT + 1 lines is allocated from R's pool; the read
   below needs DOC_LINE_LENGTH * LINE_CNT + 1 bytes.
   NOTE(review): no upper bound on LINE_CNT is visible before these
   size computations, so a very large count taken from the file could
   overflow them -- confirm and bound if so. */
795 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
796 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
/* A string shorter than the full document text means a null byte was
   embedded somewhere in the lines read. */
797 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
798 dict_set_documents (dict, documents);
800 sys_error (r, _("Document line contains null byte."));
/* The scratch buffer is released here.  NOTE(review): this assumes
   dict_set_documents() keeps its own copy of the text -- confirm. */
801 pool_free (r->pool, documents);
804 /* Read a type 7 extension record. */
806 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
807 struct sfm_read_info *info)
809 int subtype = read_int (r);
810 size_t size = read_int (r);
811 size_t count = read_int (r);
812 size_t bytes = size * count;
814 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
815 allows an extra byte for a null terminator, used by some
816 extension processing routines. */
817 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
818 sys_error (r, "Record type 7 subtype %d too large.", subtype);
823 read_machine_integer_info (r, size, count, info, dict);
827 read_machine_float_info (r, size, count);
831 /* Variable sets information. We don't use these yet.
832 They only apply to GUIs; see VARSETS on the APPLY
833 DICTIONARY command in SPSS documentation. */
837 /* DATE variable information. We don't use it yet. */
843 read_mrsets (r, size, count, dict);
847 /* Used by the SPSS Data Entry software. */
851 read_display_parameters (r, size, count, dict);
855 read_long_var_name_map (r, size, count, dict);
859 read_long_string_map (r, size, count, dict);
863 /* Extended number of cases. Not important. */
867 read_data_file_attributes (r, size, count, dict);
871 read_variable_attributes (r, size, count, dict);
875 /* New in SPSS 16. Contains a single string that describes
876 the character encoding, e.g. "windows-1252". */
878 char *encoding = pool_calloc (r->pool, size, count + 1);
879 read_string (r, encoding, count + 1);
880 dict_set_encoding (dict, encoding);
885 /* New in SPSS 16. Encodes value labels for long string variables. */
887 read_long_string_value_labels (r, size, count, dict);
891 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send "
892 "a copy of this file, and the syntax which created it "
894 subtype, PACKAGE_BUGREPORT);
898 skip_bytes (r, bytes);
901 /* Read record type 7, subtype 3. */
903 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
904 struct sfm_read_info *info,
905 struct dictionary *dict)
907 int version_major = read_int (r);
908 int version_minor = read_int (r);
909 int version_revision = read_int (r);
910 int machine_code UNUSED = read_int (r);
911 int float_representation = read_int (r);
912 int compression_code UNUSED = read_int (r);
913 int integer_representation = read_int (r);
914 int character_code = read_int (r);
916 int expected_float_format;
917 int expected_integer_format;
919 if (size != 4 || count != 8)
920 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
924 /* Save version info. */
925 info->version_major = version_major;
926 info->version_minor = version_minor;
927 info->version_revision = version_revision;
929 /* Check floating point format. */
930 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
931 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
932 expected_float_format = 1;
933 else if (r->float_format == FLOAT_Z_LONG)
934 expected_float_format = 2;
935 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
936 expected_float_format = 3;
939 if (float_representation != expected_float_format)
940 sys_error (r, _("Floating-point representation indicated by "
941 "system file (%d) differs from expected (%d)."),
942 float_representation, expected_float_format);
944 /* Check integer format. */
945 if (r->integer_format == INTEGER_MSB_FIRST)
946 expected_integer_format = 1;
947 else if (r->integer_format == INTEGER_LSB_FIRST)
948 expected_integer_format = 2;
951 if (integer_representation != expected_integer_format)
952 sys_warn (r, _("Integer format indicated by system file (%d) "
953 "differs from expected (%d)."),
954 integer_representation, expected_integer_format);
957 /* Record 7 (20) provides a much more reliable way of
958 setting the encoding.
959 The character_code is used as a fallback only. */
961 if ( NULL == dict_get_encoding (dict))
963 switch (character_code)
966 dict_set_encoding (dict, "EBCDIC-US");
970 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
971 respectively. However, there are known to be many files
972 in the wild with character code 2 whose data are not
974 actually ASCII. Therefore we ignore these values. */
978 dict_set_encoding (dict, "MS_KANJI");
981 dict_set_encoding (dict, "UTF-7");
984 dict_set_encoding (dict, "UTF-8");
989 snprintf (enc, 100, "CP%d", character_code);
990 dict_set_encoding (dict, enc);
997 /* Read record type 7, subtype 4: the values the file uses for SYSMIS,
   HIGHEST and LOWEST.  SIZE must be 8 and COUNT must be 3, otherwise
   the record is rejected through sys_error.  A value that differs from
   this program's own constant only produces a warning. */
999 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
/* NOTE(review): the three floats are consumed before SIZE and COUNT
   are validated below. */
1001 double sysmis = read_float (r);
1002 double highest = read_float (r);
1003 double lowest = read_float (r);
1005 if (size != 8 || count != 3)
1006 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
1009 if (sysmis != SYSMIS)
1010 sys_warn (r, _("File specifies unexpected value %g as %s."),
1013 if (highest != HIGHEST)
1014 sys_warn (r, _("File specifies unexpected value %g as %s."),
1015 highest, "HIGHEST");
1017 if (lowest != LOWEST)
1018 sys_warn (r, _("File specifies unexpected value %g as %s."),
1022 /* Read record type 7, subtype 7 or 19: multiple response sets.
   The record is parsed as text.  Each set consists of a name ending at
   `=', a type letter (`C', `D' or `E'), optional counted strings, and a
   list of variable short names ending at a new-line.  Sets that parse
   successfully and have at least 2 variables are added to DICT;
   malformed input produces warnings through sys_warn. */
1024 read_mrsets (struct sfm_reader *r, size_t size, size_t count,
1025 struct dictionary *dict)
1027 struct text_record *text;
1028 struct mrset *mrset;
1030 text = open_text_record (r, size * count);
1033 const char *name, *label, *counted;
1034 struct stringi_set var_names;
1035 size_t allocated_vars;
/* Start from a zero-filled set so that unset fields (label, vars,
   n_vars, ...) are null or zero. */
1039 mrset = xzalloc (sizeof *mrset);
1041 name = text_get_token (text, ss_cstr ("="), NULL);
1044 mrset->name = xstrdup (name);
1046 if (mrset->name[0] != '$')
1048 sys_warn (r, _("`%s' does not begin with `$' at offset %zu "
1049 "in MRSETS record."), mrset->name, text_pos (text));
/* Type letter: `C' selects MRSET_MC; `D' and `E' both select MRSET_MD,
   with cat_source MRSET_VARLABELS and MRSET_COUNTEDVALUES
   respectively. */
1053 if (text_match (text, 'C'))
1055 mrset->type = MRSET_MC;
1056 if (!text_match (text, ' '))
1058 sys_warn (r, _("Missing space following `%c' at offset %zu "
1059 "in MRSETS record."), 'C', text_pos (text));
1063 else if (text_match (text, 'D'))
1065 mrset->type = MRSET_MD;
1066 mrset->cat_source = MRSET_VARLABELS;
1068 else if (text_match (text, 'E'))
1072 mrset->type = MRSET_MD;
1073 mrset->cat_source = MRSET_COUNTEDVALUES;
1074 if (!text_match (text, ' '))
1076 sys_warn (r, _("Missing space following `%c' at offset %zu "
1077 "in MRSETS record."), 'E', text_pos (text));
/* After `E ' comes a label source number: "11" means the set label is
   taken from a variable label, "1" is accepted silently, and anything
   else draws a warning.
   NOTE(review): NUMBER is passed to strcmp() with no NULL check on the
   adjacent lines, although the result of text_get_token() is tested
   against NULL where variable names are parsed further down.  A record
   that ends right after `E ' may therefore dereference NULL -- confirm
   and guard. */
1081 number = text_get_token (text, ss_cstr (" "), NULL);
1082 if (!strcmp (number, "11"))
1083 mrset->label_from_var_label = true;
1084 else if (strcmp (number, "1"))
1085 sys_warn (r, _("Unexpected label source value `%s' "
1086 "following `E' at offset %zu in MRSETS record."),
1087 number, text_pos (text));
1091 sys_warn (r, _("Missing `C', `D', or `E' at offset %zu "
1092 "in MRSETS record."),
/* Only MRSET_MD sets carry a counted value ahead of the label. */
1097 if (mrset->type == MRSET_MD)
1099 counted = text_parse_counted_string (r, text);
1100 if (counted == NULL)
1104 label = text_parse_counted_string (r, text);
/* An empty label string is stored as a null label. */
1107 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
/* VAR_NAMES records the names seen so far so that duplicates can be
   detected. */
1109 stringi_set_init (&var_names);
1114 struct variable *var;
1115 const char *var_name;
1117 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1118 if (var_name == NULL)
1120 sys_warn (r, _("Missing new-line parsing variable names "
1121 "at offset %zu in MRSETS record."),
1126 var = lookup_var_by_short_name (dict, var_name);
1129 if (!stringi_set_insert (&var_names, var_name))
1131 sys_warn (r, _("Duplicate variable name %s "
1132 "at offset %zu in MRSETS record."),
1133 var_name, text_pos (text));
/* When the label is to come from a variable label and none has been
   set yet, take it from the first member that has one. */
1137 if (mrset->label == NULL && mrset->label_from_var_label
1138 && var_has_label (var))
1139 mrset->label = xstrdup (var_get_label (var));
1142 && var_get_type (var) != var_get_type (mrset->vars[0]))
1144 sys_warn (r, _("MRSET %s contains both string and "
1145 "numeric variables."), name);
/* WIDTH ends up as the narrowest width among the member variables. */
1148 width = MIN (width, var_get_width (var));
/* Grow the member array when it is full, then append VAR. */
1150 if (mrset->n_vars >= allocated_vars)
1151 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1152 sizeof *mrset->vars);
1153 mrset->vars[mrset->n_vars++] = var;
1155 while (delimiter != '\n');
/* A set needs at least 2 variables; smaller ones are warned about and
   destroyed. */
1157 if (mrset->n_vars < 2)
1159 sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
1161 mrset_destroy (mrset);
1165 if (mrset->type == MRSET_MD)
1167 mrset->width = width;
1168 value_init (&mrset->counted, width);
/* The counted value is stored either as a number parsed by strtod() or
   as a string right-padded with spaces to WIDTH. */
1170 mrset->counted.f = strtod (counted, NULL);
1172 value_copy_str_rpad (&mrset->counted, width,
1173 (const uint8_t *) counted, ' ');
1176 dict_add_mrset (dict, mrset);
1178 stringi_set_destroy (&var_names);
1180 mrset_destroy (mrset);
1181 close_text_record (r, text);
1184 /* Read record type 7, subtype 11, which specifies how variables
1185 should be displayed in GUI environments. */
1187 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
1188 struct dictionary *dict)
1191 bool includes_width;
1192 bool warned = false;
1197 sys_warn (r, _("Bad size %zu on extension 11."), size);
1198 skip_bytes (r, size * count);
1202 n_vars = dict_get_var_cnt (dict);
1203 if (count == 3 * n_vars)
1204 includes_width = true;
1205 else if (count == 2 * n_vars)
1206 includes_width = false;
1209 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
1211 skip_bytes (r, size * count);
1215 for (i = 0; i < n_vars; ++i)
1217 struct variable *v = dict_get_var (dict, i);
1218 int measure = read_int (r);
1219 int width = includes_width ? read_int (r) : 0;
1220 int align = read_int (r);
1222 /* SPSS 14 sometimes seems to set string variables' measure to zero. */
1224 if (0 == measure && var_is_alpha (v))
1227 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1230 sys_warn (r, _("Invalid variable display parameters "
1231 "for variable %zu (%s). "
1232 "Default parameters substituted."),
1233 i, var_get_name (v));
1238 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1239 : measure == 2 ? MEASURE_ORDINAL
1241 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1242 : align == 1 ? ALIGN_RIGHT
1245 /* Older versions (SPSS 9.0) sometimes set the display
1246 width to zero. This causes confusion in the GUI, so
1247 only set the width if it is nonzero. */
1249 var_set_display_width (v, width);
1253 /* Reads record type 7, subtype 13, which gives the long name
1254 that corresponds to each short name. Modifies variable names
1255 in DICT accordingly. */
1257 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
1258 struct dictionary *dict)
1260 struct text_record *text;
1261 struct variable *var;
1264 text = open_text_record (r, size * count);
1265 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1268 size_t short_name_cnt;
1271 /* Validate long name. */
1272 if (!var_is_valid_name (long_name, false))
1274 sys_warn (r, _("Long variable mapping from %s to invalid "
1275 "variable name `%s'."),
1276 var_get_name (var), long_name);
1280 /* Identify any duplicates. */
1281 if (strcasecmp (var_get_short_name (var, 0), long_name)
1282 && dict_lookup_var (dict, long_name) != NULL)
1284 sys_warn (r, _("Duplicate long variable name `%s'."), long_name);
1288 /* Renaming a variable may clear its short names, but we
1289 want to retain them, so we save them and re-set them
1291 short_name_cnt = var_get_short_name_cnt (var);
1292 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
1293 for (i = 0; i < short_name_cnt; i++)
1295 const char *s = var_get_short_name (var, i);
1296 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1299 /* Set long name. */
1300 dict_rename_var (dict, var, long_name);
1302 /* Restore short names. */
1303 for (i = 0; i < short_name_cnt; i++)
1305 var_set_short_name (var, i, short_names[i]);
1306 free (short_names[i]);
1310 close_text_record (r, text);
1311 r->has_long_var_names = true;
1314 /* Reads record type 7, subtype 14, which gives the real length
1315 of each very long string. Rearranges DICT accordingly. */
1317 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1318 struct dictionary *dict)
1320 struct text_record *text;
1321 struct variable *var;
1324 text = open_text_record (r, size * count);
1325 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1327 size_t idx = var_get_dict_index (var);
1333 length = strtol (length_s, NULL, 10);
1334 if (length < 1 || length > MAX_STRING)
1336 sys_warn (r, _("%s listed as string of invalid length %s "
1337 "in very long string record."),
1338 var_get_name (var), length_s);
1342 /* Check segments. */
1343 segment_cnt = sfm_width_to_segments (length);
1344 if (segment_cnt == 1)
1346 sys_warn (r, _("%s listed in very long string record with width %s, "
1347 "which requires only one segment."),
1348 var_get_name (var), length_s);
1351 if (idx + segment_cnt > dict_get_var_cnt (dict))
1352 sys_error (r, _("Very long string %s overflows dictionary."),
1353 var_get_name (var));
1355 /* Get the short names from the segments and check their
1357 for (i = 0; i < segment_cnt; i++)
1359 struct variable *seg = dict_get_var (dict, idx + i);
1360 int alloc_width = sfm_segment_alloc_width (length, i);
1361 int width = var_get_width (seg);
1364 var_set_short_name (var, i, var_get_short_name (seg, 0));
1365 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1366 sys_error (r, _("Very long string with width %ld has segment %d "
1367 "of width %d (expected %d)."),
1368 length, i, width, alloc_width);
1370 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1371 var_set_width (var, length);
1373 close_text_record (r, text);
1374 dict_compact_values (dict);
1377 /* Reads value labels from sysfile H and inserts them into the
1378 associated dictionary. */
1380 read_value_labels (struct sfm_reader *r,
1381 struct dictionary *dict, struct variable **var_by_value_idx)
1383 struct pool *subpool;
1387 uint8_t raw_value[8]; /* Value as uninterpreted bytes. */
1388 union value value; /* Value. */
1389 char *label; /* Null-terminated label string. */
1392 struct label *labels = NULL;
1393 int label_cnt; /* Number of labels. */
1395 struct variable **var = NULL; /* Associated variables. */
1396 int var_cnt; /* Number of associated variables. */
1397 int max_width; /* Maximum width of string variables. */
1401 subpool = pool_create_subpool (r->pool);
1403 /* Read the type 3 record and record its contents. We can't do
1404 much with the data yet because we don't know whether it is
1405 of numeric or string type. */
1407 /* Read number of labels. */
1408 label_cnt = read_int (r);
1410 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1412 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1417 /* Read each value/label tuple into labels[]. */
1418 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1419 for (i = 0; i < label_cnt; i++)
1421 struct label *label = labels + i;
1422 unsigned char label_len;
1426 read_bytes (r, label->raw_value, sizeof label->raw_value);
1428 /* Read label length. */
1429 read_bytes (r, &label_len, sizeof label_len);
1430 padded_len = ROUND_UP (label_len + 1, 8);
1432 /* Read label, padding. */
1433 label->label = pool_alloc (subpool, padded_len + 1);
1434 read_bytes (r, label->label, padded_len - 1);
1435 label->label[label_len] = 0;
1438 /* Now, read the type 4 record that has the list of variables
1439 to which the value labels are to be applied. */
1441 /* Read record type of type 4 record. */
1442 if (read_int (r) != 4)
1443 sys_error (r, _("Variable index record (type 4) does not immediately "
1444 "follow value label record (type 3) as it should."));
1446 /* Read number of variables associated with value label from type 4
1448 var_cnt = read_int (r);
1449 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1450 sys_error (r, _("Number of variables associated with a value label (%d) "
1451 "is not between 1 and the number of variables (%zu)."),
1452 var_cnt, dict_get_var_cnt (dict));
1454 /* Read the list of variables. */
1455 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1457 for (i = 0; i < var_cnt; i++)
1459 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1460 if (var_get_width (var[i]) > 8)
1461 sys_error (r, _("Value labels may not be added to long string "
1462 "variables (e.g. %s) using records types 3 and 4."),
1463 var_get_name (var[i]));
1464 max_width = MAX (max_width, var_get_width (var[i]));
1467 /* Type check the variables. */
1468 for (i = 1; i < var_cnt; i++)
1469 if (var_get_type (var[i]) != var_get_type (var[0]))
1470 sys_error (r, _("Variables associated with value label are not all of "
1471 "identical type. Variable %s is %s, but variable "
1473 var_get_name (var[0]),
1474 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1475 var_get_name (var[i]),
1476 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1478 /* Fill in labels[].value, now that we know the desired type. */
1479 for (i = 0; i < label_cnt; i++)
1481 struct label *label = labels + i;
1483 value_init_pool (subpool, &label->value, max_width);
1484 if (var_is_alpha (var[0]))
1485 u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
1486 label->raw_value, sizeof label->raw_value, ' ');
1488 label->value.f = float_get_double (r->float_format, label->raw_value);
1491 /* Assign the `value_label's to each variable. */
1492 for (i = 0; i < var_cnt; i++)
1494 struct variable *v = var[i];
1497 /* Add each label to the variable. */
1498 for (j = 0; j < label_cnt; j++)
1500 struct label *label = &labels[j];
1501 if (!var_add_value_label (v, &label->value, label->label))
1503 if (var_is_numeric (var[0]))
1504 sys_warn (r, _("Duplicate value label for %g on %s."),
1505 label->value.f, var_get_name (v));
1507 sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
1508 max_width, value_str (&label->value, max_width),
1514 pool_destroy (subpool);
/* Parses custom attributes from TEXT, starting at its current
   position, and adds them to ATTRS.  If ATTRS is a null pointer
   the attributes are still parsed but each one is destroyed
   instead of stored.

   Each attribute is a key followed by '(' and one or more
   newline-terminated values, closed by ')'.  Parsing continues
   with further attributes until a '/' follows one, or until no
   further key can be read. */
static void
read_attributes (struct sfm_reader *r, struct text_record *text,
                 struct attrset *attrs)
{
  do
    {
      struct attribute *attr;
      char *name;
      int value_idx;

      name = text_get_token (text, ss_cstr ("("), NULL);
      if (name == NULL)
        return;

      attr = attribute_create (name);
      value_idx = 1;
      for (;;)
        {
          char *raw = text_get_token (text, ss_cstr ("\n"), NULL);
          size_t raw_len;
          bool quoted;

          if (raw == NULL)
            {
              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
                         name, value_idx);
              break;
            }

          /* Values are expected to be enclosed in single quotes,
             which are stripped.  An unquoted value is kept as is,
             with a warning. */
          raw_len = strlen (raw);
          quoted = (raw_len >= 2
                    && raw[0] == '\''
                    && raw[raw_len - 1] == '\'');
          if (!quoted)
            {
              text_warn (r, text,
                         _("Attribute value %s[%d] is not quoted: %s."),
                         name, value_idx, raw);
              attribute_add_value (attr, raw);
            }
          else
            {
              raw[raw_len - 1] = '\0';
              attribute_add_value (attr, raw + 1);
            }

          /* A ')' ends the list of values for this attribute. */
          if (text_match (text, ')'))
            break;
          value_idx++;
        }

      if (attrs == NULL)
        attribute_destroy (attr);
      else
        attrset_add (attrs, attr);
    }
  while (!text_match (text, '/'));
}
/* Handles record type 7, subtype 17: custom attributes that
   belong to the data file as a whole.  The record's SIZE * COUNT
   bytes are read as text and parsed into DICT's attribute set. */
static void
read_data_file_attributes (struct sfm_reader *r,
                           size_t size, size_t count,
                           struct dictionary *dict)
{
  struct attrset *file_attrs = dict_get_attributes (dict);
  struct text_record *text = open_text_record (r, size * count);

  read_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
/* Skips past N_LABELS value/label pairs in a long string value
   label record.  Each pair is a value length, that many bytes of
   value, a label length, and that many bytes of label. */
static void
skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
{
  size_t remaining;

  for (remaining = n_labels; remaining > 0; remaining--)
    {
      size_t value_length;
      size_t label_length;

      value_length = read_int (r);
      skip_bytes (r, value_length);

      label_length = read_int (r);
      skip_bytes (r, label_length);
    }
}
1605 read_long_string_value_labels (struct sfm_reader *r,
1606 size_t size, size_t count,
1607 struct dictionary *d)
1609 const off_t start = ftello (r->file);
1610 while (ftello (r->file) - start < size * count)
1612 char var_name[VAR_NAME_LEN + 1];
1620 var_name_len = read_int (r);
1621 if (var_name_len > VAR_NAME_LEN)
1622 sys_error (r, _("Variable name length in long string value label "
1623 "record (%d) exceeds %d-byte limit."),
1624 var_name_len, VAR_NAME_LEN);
1625 read_string (r, var_name, var_name_len + 1);
1626 width = read_int (r);
1627 n_labels = read_int (r);
1629 v = dict_lookup_var (d, var_name);
1632 sys_warn (r, _("Ignoring long string value record for "
1633 "unknown variable %s."), var_name);
1634 skip_long_string_value_labels (r, n_labels);
1637 if (var_is_numeric (v))
1639 sys_warn (r, _("Ignoring long string value record for "
1640 "numeric variable %s."), var_name);
1641 skip_long_string_value_labels (r, n_labels);
1644 if (width != var_get_width (v))
1646 sys_warn (r, _("Ignoring long string value record for variable %s "
1647 "because the record's width (%d) does not match the "
1648 "variable's width (%d)."),
1649 var_name, width, var_get_width (v));
1650 skip_long_string_value_labels (r, n_labels);
1655 value_init_pool (r->pool, &value, width);
1656 for (i = 0; i < n_labels; i++)
1658 size_t value_length, label_length;
1663 value_length = read_int (r);
1664 if (value_length == width)
1665 read_bytes (r, value_str_rw (&value, width), width);
1668 sys_warn (r, _("Ignoring long string value %zu for variable %s, "
1669 "with width %d, that has bad value width %zu."),
1670 i, var_get_name (v), width, value_length);
1671 skip_bytes (r, value_length);
1676 label_length = read_int (r);
1677 read_string (r, label, MIN (sizeof label, label_length + 1));
1678 if (label_length >= sizeof label)
1680 /* Skip and silently ignore label text after the
1681 first 255 bytes. The maximum documented length
1682 of a label is 120 bytes so this is more than
1684 skip_bytes (r, (label_length + 1) - sizeof label);
1687 if (!skip && !var_add_value_label (v, &value, label))
1688 sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
1689 width, value_str (&value, width), var_get_name (v));
/* Handles record type 7, subtype 18: custom attributes attached
   to individual variables.  The record's SIZE * COUNT bytes are
   read as text.  Each entry is a variable name terminated by ':'
   followed by that variable's attributes.  Reading stops at the
   first entry whose variable name cannot be read or looked up in
   DICT. */
static void
read_variable_attributes (struct sfm_reader *r,
                          size_t size, size_t count,
                          struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, size * count);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      struct attrset *var_attrs = NULL;

      if (var != NULL)
        var_attrs = var_get_attributes (var);
      read_attributes (r, text, var_attrs);
    }

  close_text_record (r, text);
}
1716 static void partial_record (struct sfm_reader *r)
1719 static void read_error (struct casereader *, const struct sfm_reader *);
1721 static bool read_case_number (struct sfm_reader *, double *);
1722 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1723 static int read_opcode (struct sfm_reader *);
1724 static bool read_compressed_number (struct sfm_reader *, double *);
1725 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1726 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1727 static bool skip_whole_strings (struct sfm_reader *, size_t);
1729 /* Reads and returns one case from READER's file. Returns a null
1730 pointer if not successful. */
1731 static struct ccase *
1732 sys_file_casereader_read (struct casereader *reader, void *r_)
1734 struct sfm_reader *r = r_;
1735 struct ccase *volatile c;
1741 c = case_create (r->proto);
1742 if (setjmp (r->bail_out))
1744 casereader_force_error (reader);
1749 for (i = 0; i < r->sfm_var_cnt; i++)
1751 struct sfm_var *sv = &r->sfm_vars[i];
1752 union value *v = case_data_rw_idx (c, sv->case_index);
1754 if (sv->var_width == 0)
1756 if (!read_case_number (r, &v->f))
1761 uint8_t *s = value_str_rw (v, sv->var_width);
1762 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1764 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1773 if (r->case_cnt != -1)
1774 read_error (reader, r);
/* Reports, through sys_error(), that the file read by R ends in
   the middle of a case. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1786 /* Issues an error that an unspecified error occurred SFM, and
1789 read_error (struct casereader *r, const struct sfm_reader *sfm)
1791 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1792 casereader_force_error (r);
1795 /* Reads a number from R and stores its value in *D.
1796 If R is compressed, reads a compressed number;
1797 otherwise, reads a number in the regular way.
1798 Returns true if successful, false if end of file is
1799 reached immediately. */
1801 read_case_number (struct sfm_reader *r, double *d)
1806 if (!try_read_bytes (r, number, sizeof number))
1808 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1812 return read_compressed_number (r, d);
/* Reads LENGTH bytes of string data from R into S.
   Data is always consumed in 8-byte units: if LENGTH is not a
   multiple of 8, a final full unit is read and only its leading
   LENGTH % 8 bytes are stored in S.
   Compressed data is read if R is compressed.
   Returns true if successful, false if end of file is reached
   immediately. */
static bool
read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
{
  size_t full_bytes = ROUND_DOWN (length, 8);
  size_t tail_bytes = length % 8;
  uint8_t last_unit[8];

  if (full_bytes != 0 && !read_whole_strings (r, s, full_bytes))
    return false;

  if (tail_bytes == 0)
    return true;

  if (!read_whole_strings (r, last_unit, sizeof last_unit))
    {
      /* End of file after part of the string was already read
         means that the case is truncated. */
      if (full_bytes != 0)
        partial_record (r);
      return false;
    }
  memcpy (s + full_bytes, last_unit, tail_bytes);

  return true;
}
1849 /* Reads and returns the next compression opcode from R. */
1851 read_opcode (struct sfm_reader *r)
1853 assert (r->compressed);
1857 if (r->opcode_idx >= sizeof r->opcodes)
1859 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1863 opcode = r->opcodes[r->opcode_idx++];
1870 /* Reads a compressed number from R and stores its value in D.
1871 Returns true if successful, false if end of file is
1872 reached immediately. */
1874 read_compressed_number (struct sfm_reader *r, double *d)
1876 int opcode = read_opcode (r);
1884 *d = read_float (r);
1888 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1889 if (!r->corruption_warning)
1891 r->corruption_warning = true;
1892 sys_warn (r, _("Possible compressed data corruption: "
1893 "compressed spaces appear in numeric field."));
1902 *d = opcode - r->bias;
1909 /* Reads a compressed 8-byte string segment from R and stores it
1911 Returns true if successful, false if end of file is
1912 reached immediately. */
1914 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
1916 int opcode = read_opcode (r);
1924 read_bytes (r, dst, 8);
1928 memset (dst, ' ', 8);
1933 double value = opcode - r->bias;
1934 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1937 /* This has actually been seen "in the wild". The submitter of the
1938 file that showed that the contents decoded as spaces, but they
1939 were at the end of the field so it's possible that the null
1940 bytes just acted as null terminators. */
1942 else if (!r->corruption_warning)
1944 r->corruption_warning = true;
1945 sys_warn (r, _("Possible compressed data corruption: "
1946 "string contains compressed integer (opcode %d)."),
1956 /* Reads LENGTH string bytes from R into S.
1957 LENGTH must be a multiple of 8.
1958 Reads compressed strings if S is compressed.
1959 Returns true if successful, false if end of file is
1960 reached immediately. */
1962 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
1964 assert (length % 8 == 0);
1966 return try_read_bytes (r, s, length);
1970 for (ofs = 0; ofs < length; ofs += 8)
1971 if (!read_compressed_string (r, s + ofs))
/* Skips LENGTH bytes of string data in R by reading them into a
   scratch buffer and discarding them.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, the size of the scratch
   buffer, but that's only because the current caller never needs
   more than that many bytes.)
   Returns true if successful, false if end of file is reached
   immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];

  /* A LENGTH of exactly sizeof buffer still fits. */
  assert (length <= sizeof buffer);
  return read_whole_strings (r, buffer, length);
}
1995 /* Creates and returns a table that can be used for translating a value
1996 index into a case to a "struct variable *" for DICT. Multiple
1997 system file fields reference variables this way.
1999 This table must be created before processing the very long
2000 string extension record, because that record causes some
2001 values to be deleted from the case and the dictionary to be
2003 static struct variable **
2004 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
2006 struct variable **var_by_value_idx;
2010 var_by_value_idx = pool_nmalloc (r->pool,
2011 r->oct_cnt, sizeof *var_by_value_idx);
2012 for (i = 0; i < dict_get_var_cnt (dict); i++)
2014 struct variable *v = dict_get_var (dict, i);
2015 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
2018 var_by_value_idx[value_idx++] = v;
2019 for (j = 1; j < nv; j++)
2020 var_by_value_idx[value_idx++] = NULL;
2022 assert (value_idx == r->oct_cnt);
2024 return var_by_value_idx;
2027 /* Returns the "struct variable" corresponding to the given
2028 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
2030 static struct variable *
2031 lookup_var_by_value_idx (struct sfm_reader *r,
2032 struct variable **var_by_value_idx, int value_idx)
2034 struct variable *var;
2036 if (value_idx < 1 || value_idx > r->oct_cnt)
2037 sys_error (r, _("Variable index %d not in valid range 1...%d."),
2038 value_idx, r->oct_cnt);
2040 var = var_by_value_idx[value_idx - 1];
2042 sys_error (r, _("Variable index %d refers to long string "
/* Returns the variable in D whose first short name matches
   SHORT_NAME, ignoring case, or a null pointer if there is no such
   variable. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  struct variable *candidate;
  size_t n_vars;
  size_t idx;

  /* Fast path: a variable's full name is often also its short
     name, so try a lookup by name first. */
  candidate = dict_lookup_var (d, short_name);
  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
    return candidate;

  /* Slow path: compare against every variable's short name. */
  n_vars = dict_get_var_cnt (d);
  for (idx = 0; idx < n_vars; idx++)
    {
      candidate = dict_get_var (d, idx);
      if (strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
        return candidate;
    }

  return NULL;
}
2075 /* Helpers for reading records that contain structured text
2078 /* Maximum number of warnings to issue for a single text
2080 #define MAX_TEXT_WARNINGS 5
2085 struct substring buffer; /* Record contents. */
2086 size_t pos; /* Current position in buffer. */
2087 int n_warnings; /* Number of warnings issued or suppressed. */
2090 /* Reads SIZE bytes into a text record for R,
2091 and returns the new text record. */
2092 static struct text_record *
2093 open_text_record (struct sfm_reader *r, size_t size)
2095 struct text_record *text = pool_alloc (r->pool, sizeof *text);
2096 char *buffer = pool_malloc (r->pool, size + 1);
2097 read_bytes (r, buffer, size);
2098 text->buffer = ss_buffer (buffer, size);
2100 text->n_warnings = 0;
2104 /* Closes TEXT, frees its storage, and issues a final warning
2105 about suppressed warnings if necesary. */
2107 close_text_record (struct sfm_reader *r, struct text_record *text)
2109 if (text->n_warnings > MAX_TEXT_WARNINGS)
2110 sys_warn (r, _("Suppressed %d additional related warnings."),
2111 text->n_warnings - MAX_TEXT_WARNINGS);
2112 pool_free (r->pool, ss_data (text->buffer));
2115 /* Reads a variable=value pair from TEXT.
2116 Looks up the variable in DICT and stores it into *VAR.
2117 Stores a null-terminated value into *VALUE. */
2119 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2120 struct text_record *text,
2121 struct variable **var, char **value)
2125 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2128 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2132 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2133 ss_buffer ("\t\0", 2));
2141 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2142 struct text_record *text, struct substring delimiters,
2143 struct variable **var)
2147 name = text_get_token (text, delimiters, NULL);
2151 *var = dict_lookup_var (dict, name);
2155 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2162 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2163 struct text_record *text, struct substring delimiters,
2164 struct variable **var)
2166 char *short_name = text_get_token (text, delimiters, NULL);
2167 if (short_name == NULL)
2170 *var = lookup_var_by_short_name (dict, short_name);
2172 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2177 /* Displays a warning for the current file position, limiting the
2178 number to MAX_TEXT_WARNINGS for TEXT. */
2180 text_warn (struct sfm_reader *r, struct text_record *text,
2181 const char *format, ...)
2183 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2187 va_start (args, format);
2188 sys_msg (r, MW, format, args);
2194 text_get_token (struct text_record *text, struct substring delimiters,
2197 struct substring token;
2200 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2203 end = &ss_data (token)[ss_length (token)];
2204 if (delimiter != NULL)
2207 return ss_data (token);
2210 /* Reads a integer value expressed in decimal, then a space, then a string that
2211 consists of exactly as many bytes as specified by the integer, then a space,
2212 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2213 buffer (so the caller should not free the string). */
2215 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2225 int c = text->buffer.string[text->pos];
2226 if (c < '0' || c > '9')
2228 n = (n * 10) + (c - '0');
2231 if (start == text->pos)
2233 sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
2238 if (!text_match (text, ' '))
2240 sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
2245 if (text->pos + n > text->buffer.length)
2247 sys_warn (r, _("%zu-byte string starting at offset %zu "
2248 "exceeds record length %zu."),
2249 n, text->pos, text->buffer.length);
2253 s = &text->buffer.string[text->pos];
2257 _("Expecting space at offset %zu following %zu-byte string."),
2267 text_match (struct text_record *text, char c)
2269 if (text->buffer.string[text->pos] == c)
2278 /* Returns the current byte offset inside the TEXT's string. */
2280 text_pos (const struct text_record *text)
2287 /* Displays a corruption message. */
2289 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
2294 ds_init_empty (&text);
2295 ds_put_format (&text, "`%s' near offset 0x%llx: ",
2296 fh_get_file_name (r->fh), (long long int) ftello (r->file));
2297 ds_put_vformat (&text, format, args);
2299 m.category = msg_class_to_category (class);
2300 m.severity = msg_class_to_severity (class);
2301 m.where.file_name = NULL;
2302 m.where.line_number = 0;
2303 m.where.first_column = 0;
2304 m.where.last_column = 0;
2305 m.text = ds_cstr (&text);
2310 /* Displays a warning for the current file position. */
2312 sys_warn (struct sfm_reader *r, const char *format, ...)
2316 va_start (args, format);
2317 sys_msg (r, MW, format, args);
2321 /* Displays an error for the current file position,
2322 marks it as in an error state,
2323 and aborts reading it using longjmp. */
2325 sys_error (struct sfm_reader *r, const char *format, ...)
2329 va_start (args, format);
2330 sys_msg (r, ME, format, args);
2334 longjmp (r->bail_out, 1);
2337 /* Reads BYTE_CNT bytes into BUF.
2338 Returns true if exactly BYTE_CNT bytes are successfully read.
2339 Aborts if an I/O error or a partial read occurs.
2340 If EOF_IS_OK, then an immediate end-of-file causes false to be
2341 returned; otherwise, immediate end-of-file causes an abort
2344 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2345 void *buf, size_t byte_cnt)
2347 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2348 if (bytes_read == byte_cnt)
2350 else if (ferror (r->file))
2351 sys_error (r, _("System error: %s."), strerror (errno));
2352 else if (!eof_is_ok || bytes_read != 0)
2353 sys_error (r, _("Unexpected end of file."));
/* Reads BYTE_CNT bytes from R into BUF.  An I/O error or end of
   file, even an immediate one, is reported as an error rather than
   returned to the caller. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = false;

  read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
/* Reads BYTE_CNT bytes from R into BUF.
   Returns true if exactly BYTE_CNT bytes were read, false if end
   of file was reached before any byte was read.  An I/O error or a
   read that stops partway through is reported as an error rather
   than returned to the caller. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;

  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
2376 /* Reads a 32-bit signed integer from R and returns its value in
2379 read_int (struct sfm_reader *r)
2382 read_bytes (r, integer, sizeof integer);
2383 return integer_get (r->integer_format, integer, sizeof integer);
2386 /* Reads a 64-bit floating-point number from R and returns its
2387 value in host format. */
2389 read_float (struct sfm_reader *r)
2392 read_bytes (r, number, sizeof number);
2393 return float_get_double (r->float_format, number);
/* Fills BUFFER, which has room for SIZE bytes, with SIZE - 1 bytes
   read from R followed by a null terminator.  SIZE must be
   positive. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t n_data;

  assert (size > 0);
  n_data = size - 1;
  read_bytes (r, buffer, n_data);
  buffer[n_data] = '\0';
}
/* Advances R by BYTES bytes by reading them, in pieces of at most
   1024 bytes, into a scratch buffer and discarding them. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t remaining;

  for (remaining = bytes; remaining > 0; )
    {
      size_t piece = MIN (sizeof scratch, remaining);

      read_bytes (r, scratch, piece);
      remaining -= piece;
    }
}
2419 static const struct casereader_class sys_file_casereader_class =
2421 sys_file_casereader_read,
2422 sys_file_casereader_destroy,