1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/i18n.h"
46 #include "libpspp/message.h"
47 #include "libpspp/misc.h"
48 #include "libpspp/pool.h"
49 #include "libpspp/str.h"
50 #include "libpspp/stringi-set.h"
52 #include "gl/c-ctype.h"
53 #include "gl/inttostr.h"
54 #include "gl/minmax.h"
55 #include "gl/unlocked-io.h"
56 #include "gl/xalloc.h"
60 #define _(msgid) gettext (msgid)
61 #define N_(msgid) (msgid)
63 /* System file reader. */
66 /* Resource tracking. */
67 struct pool *pool; /* All system file state. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
71 struct file_handle *fh; /* File handle. */
72 struct fh_lock *lock; /* Mutual exclusion for file handle. */
73 FILE *file; /* File stream. */
74 bool error; /* I/O or corruption error? */
75 struct caseproto *proto; /* Format of output cases. */
78 enum integer_format integer_format; /* On-disk integer format. */
79 enum float_format float_format; /* On-disk floating point format. */
80 int oct_cnt; /* Number of 8-byte units per case. */
81 struct sfm_var *sfm_vars; /* Variables. */
82 size_t sfm_var_cnt; /* Number of variables. */
83 casenumber case_cnt; /* Number of cases */
84 bool has_long_var_names; /* File has a long variable name map */
87 bool compressed; /* File is compressed? */
88 double bias; /* Compression bias, usually 100.0. */
89 uint8_t opcodes[8]; /* Current block of opcodes. */
90 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 bool corruption_warning; /* Warned about possible corruption? */
94 static const struct casereader_class sys_file_casereader_class;
96 static bool close_reader (struct sfm_reader *);
98 static struct variable **make_var_by_value_idx (struct sfm_reader *,
100 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
103 static struct variable *lookup_var_by_short_name (struct dictionary *,
104 const char *short_name);
106 static void sys_msg (struct sfm_reader *r, int class,
107 const char *format, va_list args)
108 PRINTF_FORMAT (3, 0);
109 static void sys_warn (struct sfm_reader *, const char *, ...)
110 PRINTF_FORMAT (2, 3);
111 static void sys_error (struct sfm_reader *, const char *, ...)
115 static void read_bytes (struct sfm_reader *, void *, size_t);
116 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
117 static int read_int (struct sfm_reader *);
118 static double read_float (struct sfm_reader *);
119 static void read_string (struct sfm_reader *, char *, size_t);
120 static void skip_bytes (struct sfm_reader *, size_t);
122 static struct text_record *open_text_record (struct sfm_reader *, size_t size);
123 static void close_text_record (struct sfm_reader *r,
124 struct text_record *);
125 static bool read_variable_to_value_pair (struct sfm_reader *,
127 struct text_record *,
128 struct variable **var, char **value);
129 static void text_warn (struct sfm_reader *r, struct text_record *text,
130 const char *format, ...)
131 PRINTF_FORMAT (3, 4);
132 static char *text_get_token (struct text_record *,
133 struct substring delimiters, char *delimiter);
134 static bool text_match (struct text_record *, char c);
135 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
136 struct text_record *,
137 struct substring delimiters,
139 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
140 struct text_record *,
141 struct substring delimiters,
143 static const char *text_parse_counted_string (struct sfm_reader *,
144 struct text_record *);
145 static size_t text_pos (const struct text_record *);
147 static bool close_reader (struct sfm_reader *r);
149 /* Dictionary reader. */
157 static void read_header (struct sfm_reader *, struct dictionary *,
158 int *weight_idx, int *claimed_oct_cnt,
159 struct sfm_read_info *);
160 static void read_variable_record (struct sfm_reader *, struct dictionary *,
161 int *format_warning_cnt);
162 static void parse_format_spec (struct sfm_reader *, unsigned int,
163 enum which_format, struct variable *,
164 int *format_warning_cnt);
165 static void setup_weight (struct sfm_reader *, int weight_idx,
166 struct variable **var_by_value_idx,
167 struct dictionary *);
168 static void read_documents (struct sfm_reader *, struct dictionary *);
169 static void read_value_labels (struct sfm_reader *, struct dictionary *,
170 struct variable **var_by_value_idx);
172 static void read_extension_record (struct sfm_reader *, struct dictionary *,
173 struct sfm_read_info *);
174 static void read_machine_integer_info (struct sfm_reader *,
175 size_t size, size_t count,
176 struct sfm_read_info *,
179 static void read_machine_float_info (struct sfm_reader *,
180 size_t size, size_t count);
181 static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
182 struct dictionary *);
183 static void read_display_parameters (struct sfm_reader *,
184 size_t size, size_t count,
185 struct dictionary *);
186 static void read_long_var_name_map (struct sfm_reader *,
187 size_t size, size_t count,
188 struct dictionary *);
189 static void read_long_string_map (struct sfm_reader *,
190 size_t size, size_t count,
191 struct dictionary *);
192 static void read_data_file_attributes (struct sfm_reader *,
193 size_t size, size_t count,
194 struct dictionary *);
195 static void read_variable_attributes (struct sfm_reader *,
196 size_t size, size_t count,
197 struct dictionary *);
198 static void read_long_string_value_labels (struct sfm_reader *,
199 size_t size, size_t count,
200 struct dictionary *);
202 /* Convert all the strings in DICT from the dict encoding to UTF8 */
204 recode_strings (struct dictionary *dict)
208 const char *enc = dict_get_encoding (dict);
211 enc = get_default_encoding ();
213 for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
215 /* Convert the long variable name */
216 struct variable *var = dict_get_var (dict, i);
217 const char *native_name = var_get_name (var);
218 char *utf8_name = recode_string (UTF8, enc, native_name, -1);
219 if ( 0 != strcmp (utf8_name, native_name))
221 if ( NULL == dict_lookup_var (dict, utf8_name))
222 dict_rename_var (dict, var, utf8_name);
225 _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
230 /* Convert the variable label */
231 if (var_has_label (var))
233 char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
234 var_set_label (var, utf8_label);
238 if (var_has_value_labels (var))
240 const struct val_lab *vl = NULL;
241 const struct val_labs *vlabs = var_get_value_labels (var);
243 for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
245 const union value *val = val_lab_get_value (vl);
246 const char *label = val_lab_get_label (vl);
247 char *new_label = NULL;
249 new_label = recode_string (UTF8, enc, label, -1);
251 var_replace_value_label (var, val, new_label);
258 /* Opens the system file designated by file handle FH for
259 reading. Reads the system file's dictionary into *DICT.
260 If INFO is non-null, then it receives additional info about the
263 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
264 struct sfm_read_info *volatile info)
266 struct sfm_reader *volatile r = NULL;
267 struct variable **var_by_value_idx;
268 struct sfm_read_info local_info;
269 int format_warning_cnt = 0;
274 *dict = dict_create ();
276 /* Create and initialize reader. */
277 r = pool_create_container (struct sfm_reader, pool);
283 r->has_long_var_names = false;
284 r->opcode_idx = sizeof r->opcodes;
285 r->corruption_warning = false;
287 /* TRANSLATORS: this fragment will be interpolated into
288 messages in fh_lock() that identify types of files. */
289 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
293 r->file = fn_open (fh_get_file_name (fh), "rb");
296 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
297 fh_get_file_name (r->fh), strerror (errno));
301 /* Initialize info. */
304 memset (info, 0, sizeof *info);
306 if (setjmp (r->bail_out))
311 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
313 /* Read all the variable definition records. */
314 rec_type = read_int (r);
315 while (rec_type == 2)
317 read_variable_record (r, *dict, &format_warning_cnt);
318 rec_type = read_int (r);
321 /* Figure out the case format. */
322 var_by_value_idx = make_var_by_value_idx (r, *dict);
323 setup_weight (r, weight_idx, var_by_value_idx, *dict);
325 /* Read all the rest of the dictionary records. */
326 while (rec_type != 999)
331 read_value_labels (r, *dict, var_by_value_idx);
335 sys_error (r, _("Misplaced type 4 record."));
338 read_documents (r, *dict);
342 read_extension_record (r, *dict, info);
346 sys_error (r, _("Unrecognized record type %d."), rec_type);
348 rec_type = read_int (r);
352 if ( ! r->has_long_var_names )
355 for (i = 0; i < dict_get_var_cnt (*dict); i++)
357 struct variable *var = dict_get_var (*dict, i);
358 char short_name[SHORT_NAME_LEN + 1];
359 char long_name[SHORT_NAME_LEN + 1];
361 strcpy (short_name, var_get_name (var));
363 strcpy (long_name, short_name);
364 str_lowercase (long_name);
366 /* Set long name. Renaming a variable may clear the short
367 name, but we want to retain it, so re-set it
369 dict_rename_var (*dict, var, long_name);
370 var_set_short_name (var, 0, short_name);
373 r->has_long_var_names = true;
376 recode_strings (*dict);
378 /* Read record 999 data, which is just filler. */
381 /* Warn if the actual amount of data per case differs from the
382 amount that the header claims. SPSS version 13 gets this
383 wrong when very long strings are involved, so don't warn in
385 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
386 && info->version_major != 13)
387 sys_warn (r, _("File header claims %d variable positions but "
388 "%d were read from file."),
389 claimed_oct_cnt, r->oct_cnt);
391 /* Create an index of dictionary variable widths for
392 sfm_read_case to use. We cannot use the `struct variable's
393 from the dictionary we created, because the caller owns the
394 dictionary and may destroy or modify its variables. */
395 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
396 pool_register (r->pool, free, r->sfm_vars);
397 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
399 pool_free (r->pool, var_by_value_idx);
400 return casereader_create_sequential
402 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
403 &sys_file_casereader_class, r);
407 dict_destroy (*dict);
412 /* Closes a system file after we're done with it.
413 Returns true if an I/O error has occurred on READER, false
416 close_reader (struct sfm_reader *r)
425 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
427 msg (ME, _("Error closing system file `%s': %s."),
428 fh_get_file_name (r->fh), strerror (errno));
438 pool_destroy (r->pool);
443 /* Destroys READER. */
445 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
447 struct sfm_reader *r = r_;
451 /* Returns true if FILE is an SPSS system file,
454 sfm_detect (FILE *file)
458 if (fread (rec_type, 4, 1, file) != 1)
462 return !strcmp ("$FL2", rec_type);
465 /* Reads the global header of the system file.
466 Sets DICT's file label to the system file's label.
467 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
468 or to the value index of the weight variable otherwise.
469 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
470 per case that the file claims to have (although it is not
472 Initializes INFO with header information. */
474 read_header (struct sfm_reader *r, struct dictionary *dict,
475 int *weight_idx, int *claimed_oct_cnt,
476 struct sfm_read_info *info)
479 char eye_catcher[61];
480 uint8_t raw_layout_code[4];
482 char creation_date[10];
483 char creation_time[9];
485 struct substring file_label_ss;
486 struct substring product;
488 read_string (r, rec_type, sizeof rec_type);
489 read_string (r, eye_catcher, sizeof eye_catcher);
491 if (strcmp ("$FL2", rec_type) != 0)
492 sys_error (r, _("This is not an SPSS system file."));
494 /* Identify integer format. */
495 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
496 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
498 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
500 || (r->integer_format != INTEGER_MSB_FIRST
501 && r->integer_format != INTEGER_LSB_FIRST))
502 sys_error (r, _("This is not an SPSS system file."));
504 *claimed_oct_cnt = read_int (r);
505 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
506 *claimed_oct_cnt = -1;
508 r->compressed = read_int (r) != 0;
510 *weight_idx = read_int (r);
512 r->case_cnt = read_int (r);
513 if ( r->case_cnt > INT_MAX / 2)
516 /* Identify floating-point format and obtain compression bias. */
517 read_bytes (r, raw_bias, sizeof raw_bias);
518 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
520 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
522 if (memcmp (raw_bias, zero_bias, 8))
523 sys_warn (r, _("Compression bias is not the usual "
524 "value of 100, or system file uses unrecognized "
525 "floating-point format."));
528 /* Some software is known to write all-zeros to this
529 field. Such software also writes floating-point
530 numbers in the format that we expect by default
531 (it seems that all software most likely does, in
532 reality), so don't warn in this case. */
535 if (r->integer_format == INTEGER_MSB_FIRST)
536 r->float_format = FLOAT_IEEE_DOUBLE_BE;
538 r->float_format = FLOAT_IEEE_DOUBLE_LE;
540 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
542 read_string (r, creation_date, sizeof creation_date);
543 read_string (r, creation_time, sizeof creation_time);
544 read_string (r, file_label, sizeof file_label);
547 file_label_ss = ss_cstr (file_label);
548 ss_trim (&file_label_ss, ss_cstr (" "));
549 if (!ss_is_empty (file_label_ss))
551 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
552 dict_set_label (dict, ss_data (file_label_ss));
555 strcpy (info->creation_date, creation_date);
556 strcpy (info->creation_time, creation_time);
557 info->integer_format = r->integer_format;
558 info->float_format = r->float_format;
559 info->compressed = r->compressed;
560 info->case_cnt = r->case_cnt;
562 product = ss_cstr (eye_catcher);
563 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
564 ss_trim (&product, ss_cstr (" "));
565 str_copy_buf_trunc (info->product, sizeof info->product,
566 ss_data (product), ss_length (product));
569 /* Reads a variable (type 2) record from R and adds the
570 corresponding variable to DICT.
571 Also skips past additional variable records for long string
574 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
575 int *format_warning_cnt)
578 int has_variable_label;
579 int missing_value_code;
584 struct variable *var;
587 width = read_int (r);
588 has_variable_label = read_int (r);
589 missing_value_code = read_int (r);
590 print_format = read_int (r);
591 write_format = read_int (r);
592 read_string (r, name, sizeof name);
593 name[strcspn (name, " ")] = '\0';
595 /* Check variable name. */
596 if (name[0] == '$' || name[0] == '#')
597 sys_error (r, _("Variable name begins with invalid character `%c'."),
599 if (!var_is_plausible_name (name, false))
600 sys_error (r, _("Invalid variable name `%s'."), name);
602 /* Create variable. */
603 if (width < 0 || width > 255)
604 sys_error (r, _("Bad width %d for variable %s."), width, name);
605 var = dict_create_var (dict, name, width);
608 _("Duplicate variable name `%s' within system file."),
611 /* Set the short name the same as the long name. */
612 var_set_short_name (var, 0, var_get_name (var));
614 /* Get variable label, if any. */
615 if (has_variable_label != 0 && has_variable_label != 1)
616 sys_error (r, _("Variable label indicator field is not 0 or 1."));
617 if (has_variable_label == 1)
619 size_t len, read_len;
624 /* Read up to 255 bytes of label. */
625 read_len = MIN (sizeof label - 1, len);
626 read_string (r, label, read_len + 1);
627 var_set_label (var, label);
629 /* Skip unread label bytes. */
630 skip_bytes (r, len - read_len);
632 /* Skip label padding up to multiple of 4 bytes. */
633 skip_bytes (r, ROUND_UP (len, 4) - len);
636 /* Set missing values. */
637 if (missing_value_code != 0)
639 struct missing_values mv;
642 mv_init_pool (r->pool, &mv, var_get_width (var));
643 if (var_is_numeric (var))
645 if (missing_value_code < -3 || missing_value_code > 3
646 || missing_value_code == -1)
647 sys_error (r, _("Numeric missing value indicator field is not "
648 "-3, -2, 0, 1, 2, or 3."));
649 if (missing_value_code < 0)
651 double low = read_float (r);
652 double high = read_float (r);
653 mv_add_range (&mv, low, high);
654 missing_value_code = -missing_value_code - 2;
656 for (i = 0; i < missing_value_code; i++)
657 mv_add_num (&mv, read_float (r));
661 int mv_width = MAX (width, 8);
664 if (missing_value_code < 1 || missing_value_code > 3)
665 sys_error (r, _("String missing value indicator field is not "
668 value_init (&value, mv_width);
669 value_set_missing (&value, mv_width);
670 for (i = 0; i < missing_value_code; i++)
672 uint8_t *s = value_str_rw (&value, mv_width);
673 read_bytes (r, s, 8);
676 value_destroy (&value, mv_width);
678 var_set_missing_values (var, &mv);
682 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
683 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
685 /* Account for values.
686 Skip long string continuation records, if any. */
687 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
693 for (i = 1; i < nv; i++)
695 /* Check for record type 2 and width -1. */
696 if (read_int (r) != 2 || read_int (r) != -1)
697 sys_error (r, _("Missing string continuation record."));
699 /* Skip and ignore remaining continuation data. */
700 has_variable_label = read_int (r);
701 missing_value_code = read_int (r);
702 print_format = read_int (r);
703 write_format = read_int (r);
704 read_string (r, name, sizeof name);
706 /* Variable label fields on continuation records have
707 been spotted in system files created by "SPSS Power
708 Macintosh Release 6.1". */
709 if (has_variable_label)
710 skip_bytes (r, ROUND_UP (read_int (r), 4));
715 /* Translates the format spec from sysfile format to internal
718 parse_format_spec (struct sfm_reader *r, unsigned int s,
719 enum which_format which, struct variable *v,
720 int *format_warning_cnt)
722 const int max_format_warnings = 8;
724 uint8_t raw_type = s >> 16;
730 if (!fmt_from_io (raw_type, &f.type))
731 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
736 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
741 if (which == PRINT_FORMAT)
742 var_set_print_format (v, &f);
744 var_set_write_format (v, &f);
746 else if (++*format_warning_cnt <= max_format_warnings)
748 char fmt_string[FMT_STRING_LEN_MAX + 1];
749 sys_warn (r, _("%s variable %s has invalid %s format %s."),
750 var_is_numeric (v) ? _("Numeric") : _("String"),
752 which == PRINT_FORMAT ? _("print") : _("write"),
753 fmt_to_string (&f, fmt_string));
755 if (*format_warning_cnt == max_format_warnings)
756 sys_warn (r, _("Suppressing further invalid format warnings."));
760 /* Sets the weighting variable in DICT to the variable
761 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
764 setup_weight (struct sfm_reader *r, int weight_idx,
765 struct variable **var_by_value_idx, struct dictionary *dict)
769 struct variable *weight_var
770 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
771 if (var_is_numeric (weight_var))
772 dict_set_weight (dict, weight_var);
774 sys_error (r, _("Weighting variable must be numeric "
775 "(not string variable `%s')."),
776 var_get_name (weight_var));
780 /* Reads a document record, type 6, from system file R, and sets up
781 the documents and n_documents fields in the associated
784 read_documents (struct sfm_reader *r, struct dictionary *dict)
789 if (dict_get_documents (dict) != NULL)
790 sys_error (r, _("Multiple type 6 (document) records."));
792 line_cnt = read_int (r);
794 sys_error (r, _("Number of document lines (%d) "
795 "must be greater than 0."), line_cnt);
797 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
798 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
799 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
800 dict_set_documents (dict, documents);
802 sys_error (r, _("Document line contains null byte."));
803 pool_free (r->pool, documents);
806 /* Read a type 7 extension record. */
808 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
809 struct sfm_read_info *info)
811 int subtype = read_int (r);
812 size_t size = read_int (r);
813 size_t count = read_int (r);
814 size_t bytes = size * count;
816 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
817 allows an extra byte for a null terminator, used by some
818 extension processing routines. */
819 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
820 sys_error (r, "Record type 7 subtype %d too large.", subtype);
825 read_machine_integer_info (r, size, count, info, dict);
829 read_machine_float_info (r, size, count);
833 /* Variable sets information. We don't use these yet.
834 They only apply to GUIs; see VARSETS on the APPLY
835 DICTIONARY command in SPSS documentation. */
839 /* DATE variable information. We don't use it yet, but we
845 read_mrsets (r, size, count, dict);
849 /* Used by the SPSS Data Entry software. */
853 read_display_parameters (r, size, count, dict);
857 read_long_var_name_map (r, size, count, dict);
861 read_long_string_map (r, size, count, dict);
865 /* Extended number of cases. Not important. */
869 read_data_file_attributes (r, size, count, dict);
873 read_variable_attributes (r, size, count, dict);
877 /* New in SPSS 16. Contains a single string that describes
878 the character encoding, e.g. "windows-1252". */
880 char *encoding = pool_calloc (r->pool, size, count + 1);
881 read_string (r, encoding, count + 1);
882 dict_set_encoding (dict, encoding);
887 /* New in SPSS 16. Encodes value labels for long string
889 read_long_string_value_labels (r, size, count, dict);
893 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send "
894 "a copy of this file, and the syntax which created it "
896 subtype, PACKAGE_BUGREPORT);
900 skip_bytes (r, bytes);
903 /* Read record type 7, subtype 3. */
905 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
906 struct sfm_read_info *info,
907 struct dictionary *dict)
909 int version_major = read_int (r);
910 int version_minor = read_int (r);
911 int version_revision = read_int (r);
912 int machine_code UNUSED = read_int (r);
913 int float_representation = read_int (r);
914 int compression_code UNUSED = read_int (r);
915 int integer_representation = read_int (r);
916 int character_code = read_int (r);
918 int expected_float_format;
919 int expected_integer_format;
921 if (size != 4 || count != 8)
922 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
926 /* Save version info. */
927 info->version_major = version_major;
928 info->version_minor = version_minor;
929 info->version_revision = version_revision;
931 /* Check floating point format. */
932 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
933 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
934 expected_float_format = 1;
935 else if (r->float_format == FLOAT_Z_LONG)
936 expected_float_format = 2;
937 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
938 expected_float_format = 3;
941 if (float_representation != expected_float_format)
942 sys_error (r, _("Floating-point representation indicated by "
943 "system file (%d) differs from expected (%d)."),
944 float_representation, expected_float_format);
946 /* Check integer format. */
947 if (r->integer_format == INTEGER_MSB_FIRST)
948 expected_integer_format = 1;
949 else if (r->integer_format == INTEGER_LSB_FIRST)
950 expected_integer_format = 2;
953 if (integer_representation != expected_integer_format)
954 sys_warn (r, _("Integer format indicated by system file (%d) "
955 "differs from expected (%d)."),
956 integer_representation, expected_integer_format);
959 Record 7 (20) provides a much more reliable way of
960 setting the encoding.
961 The character_code is used as a fallback only.
963 if ( NULL == dict_get_encoding (dict))
965 switch (character_code)
968 dict_set_encoding (dict, "EBCDIC-US");
972 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
973 respectively. However, there are known to be many files
974 in the wild with character code 2, yet have data which are
976 Therefore we ignore these values.
980 dict_set_encoding (dict, "MS_KANJI");
983 dict_set_encoding (dict, "UTF-7");
986 dict_set_encoding (dict, "UTF-8");
991 snprintf (enc, 100, "CP%d", character_code);
992 dict_set_encoding (dict, enc);
999 /* Read record type 7, subtype 4. */
1001 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
1003 double sysmis = read_float (r);
1004 double highest = read_float (r);
1005 double lowest = read_float (r);
1007 if (size != 8 || count != 3)
1008 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
1011 if (sysmis != SYSMIS)
1012 sys_warn (r, _("File specifies unexpected value %g as %s."),
1015 if (highest != HIGHEST)
1016 sys_warn (r, _("File specifies unexpected value %g as %s."),
1017 highest, "HIGHEST");
1019 if (lowest != LOWEST)
1020 sys_warn (r, _("File specifies unexpected value %g as %s."),
1024 /* Read record type 7, subtype 7 or 19. */
1026 read_mrsets (struct sfm_reader *r, size_t size, size_t count,
1027 struct dictionary *dict)
1029 struct text_record *text;
1030 struct mrset *mrset;
1032 text = open_text_record (r, size * count);
1035 const char *name, *label, *counted;
1036 struct stringi_set var_names;
1037 size_t allocated_vars;
1041 mrset = xzalloc (sizeof *mrset);
1043 name = text_get_token (text, ss_cstr ("="), NULL);
1046 mrset->name = xstrdup (name);
1048 if (mrset->name[0] != '$')
1050 sys_warn (r, _("`%s' does not begin with `$' at offset %zu "
1051 "in MRSETS record."), mrset->name, text_pos (text));
1055 if (text_match (text, 'C'))
1057 mrset->type = MRSET_MC;
1058 if (!text_match (text, ' '))
1060 sys_warn (r, _("Missing space following `%c' at offset %zu "
1061 "in MRSETS record."), 'C', text_pos (text));
1065 else if (text_match (text, 'D'))
1067 mrset->type = MRSET_MD;
1068 mrset->cat_source = MRSET_VARLABELS;
1070 else if (text_match (text, 'E'))
1074 mrset->type = MRSET_MD;
1075 mrset->cat_source = MRSET_COUNTEDVALUES;
1076 if (!text_match (text, ' '))
1078 sys_warn (r, _("Missing space following `%c' at offset %zu "
1079 "in MRSETS record."), 'E', text_pos (text));
1083 number = text_get_token (text, ss_cstr (" "), NULL);
1084 if (!strcmp (number, "11"))
1085 mrset->label_from_var_label = true;
1086 else if (strcmp (number, "1"))
1087 sys_warn (r, _("Unexpected label source value `%s' "
1088 "following `E' at offset %zu in MRSETS record."),
1089 number, text_pos (text));
1093 sys_warn (r, _("Missing `C', `D', or `E' at offset %zu "
1094 "in MRSETS record."),
1099 if (mrset->type == MRSET_MD)
1101 counted = text_parse_counted_string (r, text);
1102 if (counted == NULL)
1106 label = text_parse_counted_string (r, text);
1109 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
1111 stringi_set_init (&var_names);
1116 struct variable *var;
1117 const char *var_name;
1119 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1120 if (var_name == NULL)
1122 sys_warn (r, _("Missing new-line parsing variable names "
1123 "at offset %zu in MRSETS record."),
1128 var = lookup_var_by_short_name (dict, var_name);
1131 if (!stringi_set_insert (&var_names, var_name))
1133 sys_warn (r, _("Duplicate variable name %s "
1134 "at offset %zu in MRSETS record."),
1135 var_name, text_pos (text));
1139 if (mrset->label == NULL && mrset->label_from_var_label
1140 && var_has_label (var))
1141 mrset->label = xstrdup (var_get_label (var));
1144 && var_get_type (var) != var_get_type (mrset->vars[0]))
1146 sys_warn (r, _("MRSET %s contains both string and "
1147 "numeric variables."), name);
1150 width = MIN (width, var_get_width (var));
1152 if (mrset->n_vars >= allocated_vars)
1153 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1154 sizeof *mrset->vars);
1155 mrset->vars[mrset->n_vars++] = var;
1157 while (delimiter != '\n');
1159 if (mrset->n_vars < 2)
1161 sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
1163 mrset_destroy (mrset);
1167 if (mrset->type == MRSET_MD)
1169 mrset->width = width;
1170 value_init (&mrset->counted, width);
1172 mrset->counted.f = strtod (counted, NULL);
1174 value_copy_str_rpad (&mrset->counted, width,
1175 (const uint8_t *) counted, ' ');
1178 dict_add_mrset (dict, mrset);
1180 stringi_set_destroy (&var_names);
1182 mrset_destroy (mrset);
1183 close_text_record (r, text);
1186 /* Read record type 7, subtype 11, which specifies how variables
1187 should be displayed in GUI environments. */
1189 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
1190 struct dictionary *dict)
1193 bool includes_width;
1194 bool warned = false;
1199 sys_warn (r, _("Bad size %zu on extension 11."), size);
1200 skip_bytes (r, size * count);
1204 n_vars = dict_get_var_cnt (dict);
1205 if (count == 3 * n_vars)
1206 includes_width = true;
1207 else if (count == 2 * n_vars)
1208 includes_width = false;
1211 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
1213 skip_bytes (r, size * count);
1217 for (i = 0; i < n_vars; ++i)
1219 struct variable *v = dict_get_var (dict, i);
1220 int measure = read_int (r);
1221 int width = includes_width ? read_int (r) : 0;
1222 int align = read_int (r);
1224 /* SPSS 14 sometimes seems to set string variables' measure
1226 if (0 == measure && var_is_alpha (v))
1229 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1232 sys_warn (r, _("Invalid variable display parameters "
1233 "for variable %zu (%s). "
1234 "Default parameters substituted."),
1235 i, var_get_name (v));
1240 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1241 : measure == 2 ? MEASURE_ORDINAL
1243 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1244 : align == 1 ? ALIGN_RIGHT
1247 /* Older versions (SPSS 9.0) sometimes set the display
1248 width to zero. This causes confusion in the GUI, so
1249 only set the width if it is nonzero. */
1251 var_set_display_width (v, width);
1255 /* Reads record type 7, subtype 13, which gives the long name
1256 that corresponds to each short name. Modifies variable names
1257 in DICT accordingly. */
1259 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
1260 struct dictionary *dict)
1262 struct text_record *text;
1263 struct variable *var;
1266 text = open_text_record (r, size * count);
1267 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1270 size_t short_name_cnt;
1273 /* Validate long name. */
1274 if (!var_is_valid_name (long_name, false))
1276 sys_warn (r, _("Long variable mapping from %s to invalid "
1277 "variable name `%s'."),
1278 var_get_name (var), long_name);
1282 /* Identify any duplicates. */
1283 if (strcasecmp (var_get_short_name (var, 0), long_name)
1284 && dict_lookup_var (dict, long_name) != NULL)
1286 sys_warn (r, _("Duplicate long variable name `%s' "
1287 "within system file."), long_name);
1291 /* Renaming a variable may clear its short names, but we
1292 want to retain them, so we save them and re-set them
1294 short_name_cnt = var_get_short_name_cnt (var);
1295 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
1296 for (i = 0; i < short_name_cnt; i++)
1298 const char *s = var_get_short_name (var, i);
1299 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1302 /* Set long name. */
1303 dict_rename_var (dict, var, long_name);
1305 /* Restore short names. */
1306 for (i = 0; i < short_name_cnt; i++)
1308 var_set_short_name (var, i, short_names[i]);
1309 free (short_names[i]);
1313 close_text_record (r, text);
1314 r->has_long_var_names = true;
1317 /* Reads record type 7, subtype 14, which gives the real length
1318 of each very long string. Rearranges DICT accordingly. */
1320 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1321 struct dictionary *dict)
1323 struct text_record *text;
1324 struct variable *var;
1327 text = open_text_record (r, size * count);
1328 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1330 size_t idx = var_get_dict_index (var);
1336 length = strtol (length_s, NULL, 10);
1337 if (length < 1 || length > MAX_STRING)
1339 sys_warn (r, _("%s listed as string of invalid length %s "
1340 "in very long string record."),
1341 var_get_name (var), length_s);
1345 /* Check segments. */
1346 segment_cnt = sfm_width_to_segments (length);
1347 if (segment_cnt == 1)
1349 sys_warn (r, _("%s listed in very long string record with width %s, "
1350 "which requires only one segment."),
1351 var_get_name (var), length_s);
1354 if (idx + segment_cnt > dict_get_var_cnt (dict))
1355 sys_error (r, _("Very long string %s overflows dictionary."),
1356 var_get_name (var));
1358 /* Get the short names from the segments and check their
1360 for (i = 0; i < segment_cnt; i++)
1362 struct variable *seg = dict_get_var (dict, idx + i);
1363 int alloc_width = sfm_segment_alloc_width (length, i);
1364 int width = var_get_width (seg);
1367 var_set_short_name (var, i, var_get_short_name (seg, 0));
1368 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1369 sys_error (r, _("Very long string with width %ld has segment %d "
1370 "of width %d (expected %d)."),
1371 length, i, width, alloc_width);
1373 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1374 var_set_width (var, length);
1376 close_text_record (r, text);
1377 dict_compact_values (dict);
1380 /* Reads value labels from sysfile H and inserts them into the
1381 associated dictionary. */
1383 read_value_labels (struct sfm_reader *r,
1384 struct dictionary *dict, struct variable **var_by_value_idx)
1386 struct pool *subpool;
1390 uint8_t raw_value[8]; /* Value as uninterpreted bytes. */
1391 union value value; /* Value. */
1392 char *label; /* Null-terminated label string. */
1395 struct label *labels = NULL;
1396 int label_cnt; /* Number of labels. */
1398 struct variable **var = NULL; /* Associated variables. */
1399 int var_cnt; /* Number of associated variables. */
1400 int max_width; /* Maximum width of string variables. */
1404 subpool = pool_create_subpool (r->pool);
1406 /* Read the type 3 record and record its contents. We can't do
1407 much with the data yet because we don't know whether it is
1408 of numeric or string type. */
1410 /* Read number of labels. */
1411 label_cnt = read_int (r);
1413 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1415 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1420 /* Read each value/label tuple into labels[]. */
1421 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1422 for (i = 0; i < label_cnt; i++)
1424 struct label *label = labels + i;
1425 unsigned char label_len;
1429 read_bytes (r, label->raw_value, sizeof label->raw_value);
1431 /* Read label length. */
1432 read_bytes (r, &label_len, sizeof label_len);
1433 padded_len = ROUND_UP (label_len + 1, 8);
1435 /* Read label, padding. */
1436 label->label = pool_alloc (subpool, padded_len + 1);
1437 read_bytes (r, label->label, padded_len - 1);
1438 label->label[label_len] = 0;
1441 /* Now, read the type 4 record that has the list of variables
1442 to which the value labels are to be applied. */
1444 /* Read record type of type 4 record. */
1445 if (read_int (r) != 4)
1446 sys_error (r, _("Variable index record (type 4) does not immediately "
1447 "follow value label record (type 3) as it should."));
1449 /* Read number of variables associated with value label from type 4
1451 var_cnt = read_int (r);
1452 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1453 sys_error (r, _("Number of variables associated with a value label (%d) "
1454 "is not between 1 and the number of variables (%zu)."),
1455 var_cnt, dict_get_var_cnt (dict));
1457 /* Read the list of variables. */
1458 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1460 for (i = 0; i < var_cnt; i++)
1462 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1463 if (var_get_width (var[i]) > 8)
1464 sys_error (r, _("Value labels may not be added to long string "
1465 "variables (e.g. %s) using records types 3 and 4."),
1466 var_get_name (var[i]));
1467 max_width = MAX (max_width, var_get_width (var[i]));
1470 /* Type check the variables. */
1471 for (i = 1; i < var_cnt; i++)
1472 if (var_get_type (var[i]) != var_get_type (var[0]))
1473 sys_error (r, _("Variables associated with value label are not all of "
1474 "identical type. Variable %s is %s, but variable "
1476 var_get_name (var[0]),
1477 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1478 var_get_name (var[i]),
1479 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1481 /* Fill in labels[].value, now that we know the desired type. */
1482 for (i = 0; i < label_cnt; i++)
1484 struct label *label = labels + i;
1486 value_init_pool (subpool, &label->value, max_width);
1487 if (var_is_alpha (var[0]))
1488 u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
1489 label->raw_value, sizeof label->raw_value, ' ');
1491 label->value.f = float_get_double (r->float_format, label->raw_value);
1494 /* Assign the `value_label's to each variable. */
1495 for (i = 0; i < var_cnt; i++)
1497 struct variable *v = var[i];
1500 /* Add each label to the variable. */
1501 for (j = 0; j < label_cnt; j++)
1503 struct label *label = &labels[j];
1504 if (!var_add_value_label (v, &label->value, label->label))
1506 if (var_is_numeric (var[0]))
1507 sys_warn (r, _("Duplicate value label for %g on %s."),
1508 label->value.f, var_get_name (v));
1510 sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
1511 max_width, value_str (&label->value, max_width),
1517 pool_destroy (subpool);
1520 /* Reads a set of custom attributes from TEXT into ATTRS.
1521 ATTRS may be a null pointer, in which case the attributes are
1522 read but discarded. */
1524 read_attributes (struct sfm_reader *r, struct text_record *text,
1525 struct attrset *attrs)
1529 struct attribute *attr;
1533 /* Parse the key. */
1534 key = text_get_token (text, ss_cstr ("("), NULL);
1538 attr = attribute_create (key);
1539 for (index = 1; ; index++)
1541 /* Parse the value. */
1545 value = text_get_token (text, ss_cstr ("\n"), NULL);
1548 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1553 length = strlen (value);
1554 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1556 value[length - 1] = '\0';
1557 attribute_add_value (attr, value + 1);
1562 _("Attribute value %s[%d] is not quoted: %s."),
1564 attribute_add_value (attr, value);
1567 /* Was this the last value for this attribute? */
1568 if (text_match (text, ')'))
1572 attrset_add (attrs, attr);
1574 attribute_destroy (attr);
1576 while (!text_match (text, '/'));
1579 /* Reads record type 7, subtype 17, which lists custom
1580 attributes on the data file. */
1582 read_data_file_attributes (struct sfm_reader *r,
1583 size_t size, size_t count,
1584 struct dictionary *dict)
1586 struct text_record *text = open_text_record (r, size * count);
1587 read_attributes (r, text, dict_get_attributes (dict));
1588 close_text_record (r, text);
1592 skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
1596 for (i = 0; i < n_labels; i++)
1598 size_t value_length, label_length;
1600 value_length = read_int (r);
1601 skip_bytes (r, value_length);
1602 label_length = read_int (r);
1603 skip_bytes (r, label_length);
1608 read_long_string_value_labels (struct sfm_reader *r,
1609 size_t size, size_t count,
1610 struct dictionary *d)
1612 const off_t start = ftello (r->file);
1613 while (ftello (r->file) - start < size * count)
1615 char var_name[VAR_NAME_LEN + 1];
1623 var_name_len = read_int (r);
1624 if (var_name_len > VAR_NAME_LEN)
1625 sys_error (r, _("Variable name length in long string value label "
1626 "record (%d) exceeds %d-byte limit."),
1627 var_name_len, VAR_NAME_LEN);
1628 read_string (r, var_name, var_name_len + 1);
1629 width = read_int (r);
1630 n_labels = read_int (r);
1632 v = dict_lookup_var (d, var_name);
1635 sys_warn (r, _("Ignoring long string value record for "
1636 "unknown variable %s."), var_name);
1637 skip_long_string_value_labels (r, n_labels);
1640 if (var_is_numeric (v))
1642 sys_warn (r, _("Ignoring long string value record for "
1643 "numeric variable %s."), var_name);
1644 skip_long_string_value_labels (r, n_labels);
1647 if (width != var_get_width (v))
1649 sys_warn (r, _("Ignoring long string value record for variable %s "
1650 "because the record's width (%d) does not match the "
1651 "variable's width (%d)."),
1652 var_name, width, var_get_width (v));
1653 skip_long_string_value_labels (r, n_labels);
1658 value_init_pool (r->pool, &value, width);
1659 for (i = 0; i < n_labels; i++)
1661 size_t value_length, label_length;
1666 value_length = read_int (r);
1667 if (value_length == width)
1668 read_bytes (r, value_str_rw (&value, width), width);
1671 sys_warn (r, _("Ignoring long string value %zu for variable %s, "
1672 "with width %d, that has bad value width %zu."),
1673 i, var_get_name (v), width, value_length);
1674 skip_bytes (r, value_length);
1679 label_length = read_int (r);
1680 read_string (r, label, MIN (sizeof label, label_length + 1));
1681 if (label_length >= sizeof label)
1683 /* Skip and silently ignore label text after the
1684 first 255 bytes. The maximum documented length
1685 of a label is 120 bytes so this is more than
1687 skip_bytes (r, (label_length + 1) - sizeof label);
1690 if (!skip && !var_add_value_label (v, &value, label))
1691 sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
1692 width, value_str (&value, width), var_get_name (v));
1698 /* Reads record type 7, subtype 18, which lists custom
1699 attributes on individual variables. */
1701 read_variable_attributes (struct sfm_reader *r,
1702 size_t size, size_t count,
1703 struct dictionary *dict)
1705 struct text_record *text = open_text_record (r, size * count);
1708 struct variable *var;
1709 if (!text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1711 read_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1713 close_text_record (r, text);
1719 static void partial_record (struct sfm_reader *r)
1722 static void read_error (struct casereader *, const struct sfm_reader *);
1724 static bool read_case_number (struct sfm_reader *, double *);
1725 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1726 static int read_opcode (struct sfm_reader *);
1727 static bool read_compressed_number (struct sfm_reader *, double *);
1728 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1729 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1730 static bool skip_whole_strings (struct sfm_reader *, size_t);
1732 /* Reads and returns one case from READER's file. Returns a null
1733 pointer if not successful. */
1734 static struct ccase *
1735 sys_file_casereader_read (struct casereader *reader, void *r_)
1737 struct sfm_reader *r = r_;
1738 struct ccase *volatile c;
1744 c = case_create (r->proto);
1745 if (setjmp (r->bail_out))
1747 casereader_force_error (reader);
1752 for (i = 0; i < r->sfm_var_cnt; i++)
1754 struct sfm_var *sv = &r->sfm_vars[i];
1755 union value *v = case_data_rw_idx (c, sv->case_index);
1757 if (sv->var_width == 0)
1759 if (!read_case_number (r, &v->f))
1764 uint8_t *s = value_str_rw (v, sv->var_width);
1765 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1767 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1776 if (r->case_cnt != -1)
1777 read_error (reader, r);
1782 /* Issues an error that R ends in a partial record. */
1784 partial_record (struct sfm_reader *r)
1786 sys_error (r, _("File ends in partial case."));
1789 /* Issues an error that an unspecified error occurred SFM, and
1792 read_error (struct casereader *r, const struct sfm_reader *sfm)
1794 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1795 casereader_force_error (r);
1798 /* Reads a number from R and stores its value in *D.
1799 If R is compressed, reads a compressed number;
1800 otherwise, reads a number in the regular way.
1801 Returns true if successful, false if end of file is
1802 reached immediately. */
1804 read_case_number (struct sfm_reader *r, double *d)
1809 if (!try_read_bytes (r, number, sizeof number))
1811 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1815 return read_compressed_number (r, d);
1818 /* Reads LENGTH string bytes from R into S.
1819 Always reads a multiple of 8 bytes; if LENGTH is not a
1820 multiple of 8, then extra bytes are read and discarded without
1822 Reads compressed strings if S is compressed.
1823 Returns true if successful, false if end of file is
1824 reached immediately. */
1826 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
1828 size_t whole = ROUND_DOWN (length, 8);
1829 size_t partial = length % 8;
1833 if (!read_whole_strings (r, s, whole))
1840 if (!read_whole_strings (r, bounce, sizeof bounce))
1846 memcpy (s + whole, bounce, partial);
1852 /* Reads and returns the next compression opcode from R. */
1854 read_opcode (struct sfm_reader *r)
1856 assert (r->compressed);
1860 if (r->opcode_idx >= sizeof r->opcodes)
1862 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1866 opcode = r->opcodes[r->opcode_idx++];
1873 /* Reads a compressed number from R and stores its value in D.
1874 Returns true if successful, false if end of file is
1875 reached immediately. */
1877 read_compressed_number (struct sfm_reader *r, double *d)
1879 int opcode = read_opcode (r);
1887 *d = read_float (r);
1891 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1892 if (!r->corruption_warning)
1894 r->corruption_warning = true;
1895 sys_warn (r, _("Possible compressed data corruption: "
1896 "compressed spaces appear in numeric field."));
1905 *d = opcode - r->bias;
1912 /* Reads a compressed 8-byte string segment from R and stores it
1914 Returns true if successful, false if end of file is
1915 reached immediately. */
1917 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
1919 int opcode = read_opcode (r);
1927 read_bytes (r, dst, 8);
1931 memset (dst, ' ', 8);
1936 double value = opcode - r->bias;
1937 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1940 /* This has actually been seen "in the wild". The submitter of the
1941 file that showed that the contents decoded as spaces, but they
1942 were at the end of the field so it's possible that the null
1943 bytes just acted as null terminators. */
1945 else if (!r->corruption_warning)
1947 r->corruption_warning = true;
1948 sys_warn (r, _("Possible compressed data corruption: "
1949 "string contains compressed integer (opcode %d)."),
1959 /* Reads LENGTH string bytes from R into S.
1960 LENGTH must be a multiple of 8.
1961 Reads compressed strings if S is compressed.
1962 Returns true if successful, false if end of file is
1963 reached immediately. */
1965 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
1967 assert (length % 8 == 0);
1969 return try_read_bytes (r, s, length);
1973 for (ofs = 0; ofs < length; ofs += 8)
1974 if (!read_compressed_string (r, s + ofs))
1984 /* Skips LENGTH string bytes from R.
1985 LENGTH must be a multiple of 8.
1986 (LENGTH is also limited to 1024, but that's only because the
1987 current caller never needs more than that many bytes.)
1988 Returns true if successful, false if end of file is
1989 reached immediately. */
1991 skip_whole_strings (struct sfm_reader *r, size_t length)
1993 uint8_t buffer[1024];
1994 assert (length < sizeof buffer);
1995 return read_whole_strings (r, buffer, length);
1998 /* Creates and returns a table that can be used for translating a value
1999 index into a case to a "struct variable *" for DICT. Multiple
2000 system file fields reference variables this way.
2002 This table must be created before processing the very long
2003 string extension record, because that record causes some
2004 values to be deleted from the case and the dictionary to be
2006 static struct variable **
2007 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
2009 struct variable **var_by_value_idx;
2013 var_by_value_idx = pool_nmalloc (r->pool,
2014 r->oct_cnt, sizeof *var_by_value_idx);
2015 for (i = 0; i < dict_get_var_cnt (dict); i++)
2017 struct variable *v = dict_get_var (dict, i);
2018 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
2021 var_by_value_idx[value_idx++] = v;
2022 for (j = 1; j < nv; j++)
2023 var_by_value_idx[value_idx++] = NULL;
2025 assert (value_idx == r->oct_cnt);
2027 return var_by_value_idx;
2030 /* Returns the "struct variable" corresponding to the given
2031 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
2033 static struct variable *
2034 lookup_var_by_value_idx (struct sfm_reader *r,
2035 struct variable **var_by_value_idx, int value_idx)
2037 struct variable *var;
2039 if (value_idx < 1 || value_idx > r->oct_cnt)
2040 sys_error (r, _("Variable index %d not in valid range 1...%d."),
2041 value_idx, r->oct_cnt);
2043 var = var_by_value_idx[value_idx - 1];
2045 sys_error (r, _("Variable index %d refers to long string "
2052 /* Returns the variable in D with the given SHORT_NAME,
2053 or a null pointer if there is none. */
2054 static struct variable *
2055 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
2057 struct variable *var;
2061 /* First try looking up by full name. This often succeeds. */
2062 var = dict_lookup_var (d, short_name);
2063 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
2066 /* Iterate through the whole dictionary as a fallback. */
2067 var_cnt = dict_get_var_cnt (d);
2068 for (i = 0; i < var_cnt; i++)
2070 var = dict_get_var (d, i);
2071 if (!strcasecmp (var_get_short_name (var, 0), short_name))
2078 /* Helpers for reading records that contain structured text
2081 /* Maximum number of warnings to issue for a single text
2083 #define MAX_TEXT_WARNINGS 5
2088 struct substring buffer; /* Record contents. */
2089 size_t pos; /* Current position in buffer. */
2090 int n_warnings; /* Number of warnings issued or suppressed. */
2093 /* Reads SIZE bytes into a text record for R,
2094 and returns the new text record. */
2095 static struct text_record *
2096 open_text_record (struct sfm_reader *r, size_t size)
2098 struct text_record *text = pool_alloc (r->pool, sizeof *text);
2099 char *buffer = pool_malloc (r->pool, size + 1);
2100 read_bytes (r, buffer, size);
2101 text->buffer = ss_buffer (buffer, size);
2103 text->n_warnings = 0;
2107 /* Closes TEXT, frees its storage, and issues a final warning
2108 about suppressed warnings if necesary. */
2110 close_text_record (struct sfm_reader *r, struct text_record *text)
2112 if (text->n_warnings > MAX_TEXT_WARNINGS)
2113 sys_warn (r, _("Suppressed %d additional related warnings."),
2114 text->n_warnings - MAX_TEXT_WARNINGS);
2115 pool_free (r->pool, ss_data (text->buffer));
2118 /* Reads a variable=value pair from TEXT.
2119 Looks up the variable in DICT and stores it into *VAR.
2120 Stores a null-terminated value into *VALUE. */
2122 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2123 struct text_record *text,
2124 struct variable **var, char **value)
2128 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2131 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2135 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2136 ss_buffer ("\t\0", 2));
2144 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2145 struct text_record *text, struct substring delimiters,
2146 struct variable **var)
2150 name = text_get_token (text, delimiters, NULL);
2154 *var = dict_lookup_var (dict, name);
2158 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2165 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2166 struct text_record *text, struct substring delimiters,
2167 struct variable **var)
2169 char *short_name = text_get_token (text, delimiters, NULL);
2170 if (short_name == NULL)
2173 *var = lookup_var_by_short_name (dict, short_name);
2175 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2180 /* Displays a warning for the current file position, limiting the
2181 number to MAX_TEXT_WARNINGS for TEXT. */
2183 text_warn (struct sfm_reader *r, struct text_record *text,
2184 const char *format, ...)
2186 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2190 va_start (args, format);
2191 sys_msg (r, MW, format, args);
2197 text_get_token (struct text_record *text, struct substring delimiters,
2200 struct substring token;
2203 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2206 end = &ss_data (token)[ss_length (token)];
2207 if (delimiter != NULL)
2210 return ss_data (token);
2213 /* Reads a integer value expressed in decimal, then a space, then a string that
2214 consists of exactly as many bytes as specified by the integer, then a space,
2215 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2216 buffer (so the caller should not free the string). */
2218 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2228 int c = text->buffer.string[text->pos];
2229 if (c < '0' || c > '9')
2231 n = (n * 10) + (c - '0');
2234 if (start == text->pos)
2236 sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
2241 if (!text_match (text, ' '))
2243 sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
2248 if (text->pos + n > text->buffer.length)
2250 sys_warn (r, _("%zu-byte string starting at offset %zu "
2251 "exceeds record length %zu."),
2252 n, text->pos, text->buffer.length);
2256 s = &text->buffer.string[text->pos];
2260 _("Expecting space at offset %zu following %zu-byte string."),
2270 text_match (struct text_record *text, char c)
2272 if (text->buffer.string[text->pos] == c)
2281 /* Returns the current byte offset inside the TEXT's string. */
2283 text_pos (const struct text_record *text)
2290 /* Displays a corruption message. */
2292 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
2297 ds_init_empty (&text);
2298 ds_put_format (&text, "`%s' near offset 0x%llx: ",
2299 fh_get_file_name (r->fh), (long long int) ftello (r->file));
2300 ds_put_vformat (&text, format, args);
2302 m.category = msg_class_to_category (class);
2303 m.severity = msg_class_to_severity (class);
2304 m.where.file_name = NULL;
2305 m.where.line_number = 0;
2306 m.where.first_column = 0;
2307 m.where.last_column = 0;
2308 m.text = ds_cstr (&text);
2313 /* Displays a warning for the current file position. */
2315 sys_warn (struct sfm_reader *r, const char *format, ...)
2319 va_start (args, format);
2320 sys_msg (r, MW, format, args);
2324 /* Displays an error for the current file position,
2325 marks it as in an error state,
2326 and aborts reading it using longjmp. */
2328 sys_error (struct sfm_reader *r, const char *format, ...)
2332 va_start (args, format);
2333 sys_msg (r, ME, format, args);
2337 longjmp (r->bail_out, 1);
2340 /* Reads BYTE_CNT bytes into BUF.
2341 Returns true if exactly BYTE_CNT bytes are successfully read.
2342 Aborts if an I/O error or a partial read occurs.
2343 If EOF_IS_OK, then an immediate end-of-file causes false to be
2344 returned; otherwise, immediate end-of-file causes an abort
2347 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2348 void *buf, size_t byte_cnt)
2350 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2351 if (bytes_read == byte_cnt)
2353 else if (ferror (r->file))
2354 sys_error (r, _("System error: %s."), strerror (errno));
2355 else if (!eof_is_ok || bytes_read != 0)
2356 sys_error (r, _("Unexpected end of file."));
2361 /* Reads BYTE_CNT into BUF.
2362 Aborts upon I/O error or if end-of-file is encountered. */
2364 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2366 read_bytes_internal (r, false, buf, byte_cnt);
2369 /* Reads BYTE_CNT bytes into BUF.
2370 Returns true if exactly BYTE_CNT bytes are successfully read.
2371 Returns false if an immediate end-of-file is encountered.
2372 Aborts if an I/O error or a partial read occurs. */
2374 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2376 return read_bytes_internal (r, true, buf, byte_cnt);
2379 /* Reads a 32-bit signed integer from R and returns its value in
2382 read_int (struct sfm_reader *r)
2385 read_bytes (r, integer, sizeof integer);
2386 return integer_get (r->integer_format, integer, sizeof integer);
2389 /* Reads a 64-bit floating-point number from R and returns its
2390 value in host format. */
2392 read_float (struct sfm_reader *r)
2395 read_bytes (r, number, sizeof number);
2396 return float_get_double (r->float_format, number);
2399 /* Reads exactly SIZE - 1 bytes into BUFFER
2400 and stores a null byte into BUFFER[SIZE - 1]. */
2402 read_string (struct sfm_reader *r, char *buffer, size_t size)
2405 read_bytes (r, buffer, size - 1);
2406 buffer[size - 1] = '\0';
2409 /* Skips BYTES bytes forward in R. */
2411 skip_bytes (struct sfm_reader *r, size_t bytes)
2416 size_t chunk = MIN (sizeof buffer, bytes);
2417 read_bytes (r, buffer, chunk);
2422 static const struct casereader_class sys_file_casereader_class =
2424 sys_file_casereader_read,
2425 sys_file_casereader_destroy,