1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/hash.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/minmax.h"
56 #include "gl/unlocked-io.h"
57 #include "gl/xalloc.h"
61 #define _(msgid) gettext (msgid)
62 #define N_(msgid) (msgid)
64 /* System file reader. */
67 /* Resource tracking. */
68 struct pool *pool; /* All system file state. */
69 jmp_buf bail_out; /* longjmp() target for error handling. */
72 struct file_handle *fh; /* File handle. */
73 struct fh_lock *lock; /* Mutual exclusion for file handle. */
74 FILE *file; /* File stream. */
75 bool error; /* I/O or corruption error? */
76 struct caseproto *proto; /* Format of output cases. */
79 enum integer_format integer_format; /* On-disk integer format. */
80 enum float_format float_format; /* On-disk floating point format. */
81 int oct_cnt; /* Number of 8-byte units per case. */
82 struct sfm_var *sfm_vars; /* Variables. */
83 size_t sfm_var_cnt; /* Number of variables. */
84 casenumber case_cnt; /* Number of cases */
85 bool has_long_var_names; /* File has a long variable name map */
88 bool compressed; /* File is compressed? */
89 double bias; /* Compression bias, usually 100.0. */
90 uint8_t opcodes[8]; /* Current block of opcodes. */
91 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
92 bool corruption_warning; /* Warned about possible corruption? */
95 static const struct casereader_class sys_file_casereader_class;
97 static bool close_reader (struct sfm_reader *);
99 static struct variable **make_var_by_value_idx (struct sfm_reader *,
100 struct dictionary *);
101 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
104 static struct variable *lookup_var_by_short_name (struct dictionary *,
105 const char *short_name);
107 static void sys_msg (struct sfm_reader *r, int class,
108 const char *format, va_list args)
109 PRINTF_FORMAT (3, 0);
110 static void sys_warn (struct sfm_reader *, const char *, ...)
111 PRINTF_FORMAT (2, 3);
112 static void sys_error (struct sfm_reader *, const char *, ...)
116 static void read_bytes (struct sfm_reader *, void *, size_t);
117 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
118 static int read_int (struct sfm_reader *);
119 static double read_float (struct sfm_reader *);
120 static void read_string (struct sfm_reader *, char *, size_t);
121 static void skip_bytes (struct sfm_reader *, size_t);
123 static struct text_record *open_text_record (struct sfm_reader *, size_t size);
124 static void close_text_record (struct sfm_reader *r,
125 struct text_record *);
126 static bool read_variable_to_value_pair (struct sfm_reader *,
128 struct text_record *,
129 struct variable **var, char **value);
130 static void text_warn (struct sfm_reader *r, struct text_record *text,
131 const char *format, ...)
132 PRINTF_FORMAT (3, 4);
133 static char *text_get_token (struct text_record *,
134 struct substring delimiters, char *delimiter);
135 static bool text_match (struct text_record *, char c);
136 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
137 struct text_record *,
138 struct substring delimiters,
140 static const char *text_parse_counted_string (struct sfm_reader *,
141 struct text_record *);
142 static size_t text_pos (const struct text_record *);
144 static bool close_reader (struct sfm_reader *r);
146 /* Dictionary reader. */
154 static void read_header (struct sfm_reader *, struct dictionary *,
155 int *weight_idx, int *claimed_oct_cnt,
156 struct sfm_read_info *);
157 static void read_variable_record (struct sfm_reader *, struct dictionary *,
158 int *format_warning_cnt);
159 static void parse_format_spec (struct sfm_reader *, unsigned int,
160 enum which_format, struct variable *,
161 int *format_warning_cnt);
162 static void setup_weight (struct sfm_reader *, int weight_idx,
163 struct variable **var_by_value_idx,
164 struct dictionary *);
165 static void read_documents (struct sfm_reader *, struct dictionary *);
166 static void read_value_labels (struct sfm_reader *, struct dictionary *,
167 struct variable **var_by_value_idx);
169 static void read_extension_record (struct sfm_reader *, struct dictionary *,
170 struct sfm_read_info *);
171 static void read_machine_integer_info (struct sfm_reader *,
172 size_t size, size_t count,
173 struct sfm_read_info *,
176 static void read_machine_float_info (struct sfm_reader *,
177 size_t size, size_t count);
178 static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
179 struct dictionary *);
180 static void read_display_parameters (struct sfm_reader *,
181 size_t size, size_t count,
182 struct dictionary *);
183 static void read_long_var_name_map (struct sfm_reader *,
184 size_t size, size_t count,
185 struct dictionary *);
186 static void read_long_string_map (struct sfm_reader *,
187 size_t size, size_t count,
188 struct dictionary *);
189 static void read_data_file_attributes (struct sfm_reader *,
190 size_t size, size_t count,
191 struct dictionary *);
192 static void read_variable_attributes (struct sfm_reader *,
193 size_t size, size_t count,
194 struct dictionary *);
195 static void read_long_string_value_labels (struct sfm_reader *,
196 size_t size, size_t count,
197 struct dictionary *);
199 /* Convert all the strings in DICT from the dict encoding to UTF8 */
201 recode_strings (struct dictionary *dict)
205 const char *enc = dict_get_encoding (dict);
208 enc = get_default_encoding ();
210 for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
212 /* Convert the long variable name */
213 struct variable *var = dict_get_var (dict, i);
214 const char *native_name = var_get_name (var);
215 char *utf8_name = recode_string (UTF8, enc, native_name, -1);
216 if ( 0 != strcmp (utf8_name, native_name))
218 if ( NULL == dict_lookup_var (dict, utf8_name))
219 dict_rename_var (dict, var, utf8_name);
222 _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
227 /* Convert the variable label */
228 if (var_has_label (var))
230 char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
231 var_set_label (var, utf8_label);
235 if (var_has_value_labels (var))
237 const struct val_lab *vl = NULL;
238 const struct val_labs *vlabs = var_get_value_labels (var);
240 for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
242 const union value *val = val_lab_get_value (vl);
243 const char *label = val_lab_get_label (vl);
244 char *new_label = NULL;
246 new_label = recode_string (UTF8, enc, label, -1);
248 var_replace_value_label (var, val, new_label);
255 /* Opens the system file designated by file handle FH for
256 reading. Reads the system file's dictionary into *DICT.
257 If INFO is non-null, then it receives additional info about the
260 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
261 struct sfm_read_info *volatile info)
263 struct sfm_reader *volatile r = NULL;
264 struct variable **var_by_value_idx;
265 struct sfm_read_info local_info;
266 int format_warning_cnt = 0;
271 *dict = dict_create ();
273 /* Create and initialize reader. */
274 r = pool_create_container (struct sfm_reader, pool);
280 r->has_long_var_names = false;
281 r->opcode_idx = sizeof r->opcodes;
282 r->corruption_warning = false;
284 /* TRANSLATORS: this fragment will be interpolated into
285 messages in fh_lock() that identify types of files. */
286 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
290 r->file = fn_open (fh_get_file_name (fh), "rb");
293 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
294 fh_get_file_name (r->fh), strerror (errno));
298 /* Initialize info. */
301 memset (info, 0, sizeof *info);
303 if (setjmp (r->bail_out))
308 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
310 /* Read all the variable definition records. */
311 rec_type = read_int (r);
312 while (rec_type == 2)
314 read_variable_record (r, *dict, &format_warning_cnt);
315 rec_type = read_int (r);
318 /* Figure out the case format. */
319 var_by_value_idx = make_var_by_value_idx (r, *dict);
320 setup_weight (r, weight_idx, var_by_value_idx, *dict);
322 /* Read all the rest of the dictionary records. */
323 while (rec_type != 999)
328 read_value_labels (r, *dict, var_by_value_idx);
332 sys_error (r, _("Misplaced type 4 record."));
335 read_documents (r, *dict);
339 read_extension_record (r, *dict, info);
343 sys_error (r, _("Unrecognized record type %d."), rec_type);
345 rec_type = read_int (r);
349 if ( ! r->has_long_var_names )
352 for (i = 0; i < dict_get_var_cnt (*dict); i++)
354 struct variable *var = dict_get_var (*dict, i);
355 char short_name[SHORT_NAME_LEN + 1];
356 char long_name[SHORT_NAME_LEN + 1];
358 strcpy (short_name, var_get_name (var));
360 strcpy (long_name, short_name);
361 str_lowercase (long_name);
363 /* Set long name. Renaming a variable may clear the short
364 name, but we want to retain it, so re-set it
366 dict_rename_var (*dict, var, long_name);
367 var_set_short_name (var, 0, short_name);
370 r->has_long_var_names = true;
373 recode_strings (*dict);
375 /* Read record 999 data, which is just filler. */
378 /* Warn if the actual amount of data per case differs from the
379 amount that the header claims. SPSS version 13 gets this
380 wrong when very long strings are involved, so don't warn in
382 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
383 && info->version_major != 13)
384 sys_warn (r, _("File header claims %d variable positions but "
385 "%d were read from file."),
386 claimed_oct_cnt, r->oct_cnt);
388 /* Create an index of dictionary variable widths for
389 sfm_read_case to use. We cannot use the `struct variable's
390 from the dictionary we created, because the caller owns the
391 dictionary and may destroy or modify its variables. */
392 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
393 pool_register (r->pool, free, r->sfm_vars);
394 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
396 pool_free (r->pool, var_by_value_idx);
397 return casereader_create_sequential
399 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
400 &sys_file_casereader_class, r);
404 dict_destroy (*dict);
409 /* Closes a system file after we're done with it.
410 Returns true if an I/O error has occurred on READER, false
413 close_reader (struct sfm_reader *r)
422 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
424 msg (ME, _("Error closing system file \"%s\": %s."),
425 fh_get_file_name (r->fh), strerror (errno));
435 pool_destroy (r->pool);
440 /* Destroys READER. */
442 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
444 struct sfm_reader *r = r_;
448 /* Returns true if FILE is an SPSS system file,
451 sfm_detect (FILE *file)
455 if (fread (rec_type, 4, 1, file) != 1)
459 return !strcmp ("$FL2", rec_type);
462 /* Reads the global header of the system file.
463 Sets DICT's file label to the system file's label.
464 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
465 or to the value index of the weight variable otherwise.
466 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
467 per case that the file claims to have (although it is not
469 Initializes INFO with header information. */
471 read_header (struct sfm_reader *r, struct dictionary *dict,
472 int *weight_idx, int *claimed_oct_cnt,
473 struct sfm_read_info *info)
476 char eye_catcher[61];
477 uint8_t raw_layout_code[4];
479 char creation_date[10];
480 char creation_time[9];
482 struct substring file_label_ss;
483 struct substring product;
485 read_string (r, rec_type, sizeof rec_type);
486 read_string (r, eye_catcher, sizeof eye_catcher);
488 if (strcmp ("$FL2", rec_type) != 0)
489 sys_error (r, _("This is not an SPSS system file."));
491 /* Identify integer format. */
492 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
493 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
495 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
497 || (r->integer_format != INTEGER_MSB_FIRST
498 && r->integer_format != INTEGER_LSB_FIRST))
499 sys_error (r, _("This is not an SPSS system file."));
501 *claimed_oct_cnt = read_int (r);
502 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
503 *claimed_oct_cnt = -1;
505 r->compressed = read_int (r) != 0;
507 *weight_idx = read_int (r);
509 r->case_cnt = read_int (r);
510 if ( r->case_cnt > INT_MAX / 2)
514 /* Identify floating-point format and obtain compression bias. */
515 read_bytes (r, raw_bias, sizeof raw_bias);
516 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
518 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
520 if (memcmp (raw_bias, zero_bias, 8))
521 sys_warn (r, _("Compression bias is not the usual "
522 "value of 100, or system file uses unrecognized "
523 "floating-point format."));
526 /* Some software is known to write all-zeros to this
527 field. Such software also writes floating-point
528 numbers in the format that we expect by default
529 (it seems that all software most likely does, in
530 reality), so don't warn in this case. */
533 if (r->integer_format == INTEGER_MSB_FIRST)
534 r->float_format = FLOAT_IEEE_DOUBLE_BE;
536 r->float_format = FLOAT_IEEE_DOUBLE_LE;
538 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
540 read_string (r, creation_date, sizeof creation_date);
541 read_string (r, creation_time, sizeof creation_time);
542 read_string (r, file_label, sizeof file_label);
545 file_label_ss = ss_cstr (file_label);
546 ss_trim (&file_label_ss, ss_cstr (" "));
547 if (!ss_is_empty (file_label_ss))
549 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
550 dict_set_label (dict, ss_data (file_label_ss));
553 strcpy (info->creation_date, creation_date);
554 strcpy (info->creation_time, creation_time);
555 info->integer_format = r->integer_format;
556 info->float_format = r->float_format;
557 info->compressed = r->compressed;
558 info->case_cnt = r->case_cnt;
560 product = ss_cstr (eye_catcher);
561 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
562 ss_trim (&product, ss_cstr (" "));
563 str_copy_buf_trunc (info->product, sizeof info->product,
564 ss_data (product), ss_length (product));
567 /* Reads a variable (type 2) record from R and adds the
568 corresponding variable to DICT.
569 Also skips past additional variable records for long string
572 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
573 int *format_warning_cnt)
576 int has_variable_label;
577 int missing_value_code;
582 struct variable *var;
585 width = read_int (r);
586 has_variable_label = read_int (r);
587 missing_value_code = read_int (r);
588 print_format = read_int (r);
589 write_format = read_int (r);
590 read_string (r, name, sizeof name);
591 name[strcspn (name, " ")] = '\0';
593 /* Check variable name. */
594 if (name[0] == '$' || name[0] == '#')
595 sys_error (r, "Variable name begins with invalid character `%c'.",
597 if (!var_is_plausible_name (name, false))
598 sys_error (r, _("Invalid variable name `%s'."), name);
600 /* Create variable. */
601 if (width < 0 || width > 255)
602 sys_error (r, _("Bad width %d for variable %s."), width, name);
603 var = dict_create_var (dict, name, width);
606 _("Duplicate variable name `%s' within system file."),
609 /* Set the short name the same as the long name. */
610 var_set_short_name (var, 0, var_get_name (var));
612 /* Get variable label, if any. */
613 if (has_variable_label != 0 && has_variable_label != 1)
614 sys_error (r, _("Variable label indicator field is not 0 or 1."));
615 if (has_variable_label == 1)
621 if (len >= sizeof label)
622 sys_error (r, _("Variable %s has label of invalid length %zu."),
624 read_string (r, label, len + 1);
625 var_set_label (var, label);
627 skip_bytes (r, ROUND_UP (len, 4) - len);
630 /* Set missing values. */
631 if (missing_value_code != 0)
633 struct missing_values mv;
636 mv_init_pool (r->pool, &mv, var_get_width (var));
637 if (var_is_numeric (var))
639 if (missing_value_code < -3 || missing_value_code > 3
640 || missing_value_code == -1)
641 sys_error (r, _("Numeric missing value indicator field is not "
642 "-3, -2, 0, 1, 2, or 3."));
643 if (missing_value_code < 0)
645 double low = read_float (r);
646 double high = read_float (r);
647 mv_add_range (&mv, low, high);
648 missing_value_code = -missing_value_code - 2;
650 for (i = 0; i < missing_value_code; i++)
651 mv_add_num (&mv, read_float (r));
655 int mv_width = MAX (width, 8);
658 if (missing_value_code < 1 || missing_value_code > 3)
659 sys_error (r, _("String missing value indicator field is not "
662 value_init (&value, mv_width);
663 value_set_missing (&value, mv_width);
664 for (i = 0; i < missing_value_code; i++)
666 uint8_t *s = value_str_rw (&value, mv_width);
667 read_bytes (r, s, 8);
670 value_destroy (&value, mv_width);
672 var_set_missing_values (var, &mv);
676 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
677 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
679 /* Account for values.
680 Skip long string continuation records, if any. */
681 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
687 for (i = 1; i < nv; i++)
689 /* Check for record type 2 and width -1. */
690 if (read_int (r) != 2 || read_int (r) != -1)
691 sys_error (r, _("Missing string continuation record."));
693 /* Skip and ignore remaining continuation data. */
694 has_variable_label = read_int (r);
695 missing_value_code = read_int (r);
696 print_format = read_int (r);
697 write_format = read_int (r);
698 read_string (r, name, sizeof name);
700 /* Variable label fields on continuation records have
701 been spotted in system files created by "SPSS Power
702 Macintosh Release 6.1". */
703 if (has_variable_label)
704 skip_bytes (r, ROUND_UP (read_int (r), 4));
709 /* Translates the format spec from sysfile format to internal
712 parse_format_spec (struct sfm_reader *r, unsigned int s,
713 enum which_format which, struct variable *v,
714 int *format_warning_cnt)
716 const int max_format_warnings = 8;
718 uint8_t raw_type = s >> 16;
724 if (!fmt_from_io (raw_type, &f.type))
725 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
730 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
735 if (which == PRINT_FORMAT)
736 var_set_print_format (v, &f);
738 var_set_write_format (v, &f);
740 else if (*++format_warning_cnt <= max_format_warnings)
742 char fmt_string[FMT_STRING_LEN_MAX + 1];
743 sys_warn (r, _("%s variable %s has invalid %s format %s."),
744 var_is_numeric (v) ? _("Numeric") : _("String"),
746 which == PRINT_FORMAT ? _("print") : _("write"),
747 fmt_to_string (&f, fmt_string));
749 if (*format_warning_cnt == max_format_warnings)
750 sys_warn (r, _("Suppressing further invalid format warnings."));
754 /* Sets the weighting variable in DICT to the variable
755 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
758 setup_weight (struct sfm_reader *r, int weight_idx,
759 struct variable **var_by_value_idx, struct dictionary *dict)
763 struct variable *weight_var
764 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
765 if (var_is_numeric (weight_var))
766 dict_set_weight (dict, weight_var);
768 sys_error (r, _("Weighting variable must be numeric."));
772 /* Reads a document record, type 6, from system file R, and sets up
773 the documents and n_documents fields in the associated
776 read_documents (struct sfm_reader *r, struct dictionary *dict)
781 if (dict_get_documents (dict) != NULL)
782 sys_error (r, _("Multiple type 6 (document) records."));
784 line_cnt = read_int (r);
786 sys_error (r, _("Number of document lines (%d) "
787 "must be greater than 0."), line_cnt);
789 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
790 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
791 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
792 dict_set_documents (dict, documents);
794 sys_error (r, _("Document line contains null byte."));
795 pool_free (r->pool, documents);
798 /* Read a type 7 extension record. */
800 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
801 struct sfm_read_info *info)
803 int subtype = read_int (r);
804 size_t size = read_int (r);
805 size_t count = read_int (r);
806 size_t bytes = size * count;
808 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
809 allows an extra byte for a null terminator, used by some
810 extension processing routines. */
811 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
812 sys_error (r, "Record type 7 subtype %d too large.", subtype);
817 read_machine_integer_info (r, size, count, info, dict);
821 read_machine_float_info (r, size, count);
825 /* Variable sets information. We don't use these yet.
826 They only apply to GUIs; see VARSETS on the APPLY
827 DICTIONARY command in SPSS documentation. */
831 /* DATE variable information. We don't use it yet, but we
837 read_mrsets (r, size, count, dict);
841 /* Used by the SPSS Data Entry software. */
845 read_display_parameters (r, size, count, dict);
849 read_long_var_name_map (r, size, count, dict);
853 read_long_string_map (r, size, count, dict);
857 /* Extended number of cases. Not important. */
861 read_data_file_attributes (r, size, count, dict);
865 read_variable_attributes (r, size, count, dict);
869 /* New in SPSS 16. Contains a single string that describes
870 the character encoding, e.g. "windows-1252". */
872 char *encoding = pool_calloc (r->pool, size, count + 1);
873 read_string (r, encoding, count + 1);
874 dict_set_encoding (dict, encoding);
879 /* New in SPSS 16. Encodes value labels for long string
881 read_long_string_value_labels (r, size, count, dict);
885 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"),
886 subtype, PACKAGE_BUGREPORT);
890 skip_bytes (r, bytes);
893 /* Read record type 7, subtype 3. */
895 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
896 struct sfm_read_info *info,
897 struct dictionary *dict)
899 int version_major = read_int (r);
900 int version_minor = read_int (r);
901 int version_revision = read_int (r);
902 int machine_code UNUSED = read_int (r);
903 int float_representation = read_int (r);
904 int compression_code UNUSED = read_int (r);
905 int integer_representation = read_int (r);
906 int character_code = read_int (r);
908 int expected_float_format;
909 int expected_integer_format;
911 if (size != 4 || count != 8)
912 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
916 /* Save version info. */
917 info->version_major = version_major;
918 info->version_minor = version_minor;
919 info->version_revision = version_revision;
921 /* Check floating point format. */
922 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
923 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
924 expected_float_format = 1;
925 else if (r->float_format == FLOAT_Z_LONG)
926 expected_float_format = 2;
927 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
928 expected_float_format = 3;
931 if (float_representation != expected_float_format)
932 sys_error (r, _("Floating-point representation indicated by "
933 "system file (%d) differs from expected (%d)."),
934 r->float_format, expected_float_format);
936 /* Check integer format. */
937 if (r->integer_format == INTEGER_MSB_FIRST)
938 expected_integer_format = 1;
939 else if (r->integer_format == INTEGER_LSB_FIRST)
940 expected_integer_format = 2;
943 if (integer_representation != expected_integer_format)
945 static const char *const endian[] = {N_("Little Endian"), N_("Big Endian")};
946 sys_warn (r, _("Integer format indicated by system file (%s) "
947 "differs from expected (%s)."),
948 gettext (endian[integer_representation == 1]),
949 gettext (endian[expected_integer_format == 1]));
954 Record 7 (20) provides a much more reliable way of
955 setting the encoding.
956 The character_code is used as a fallback only.
958 if ( NULL == dict_get_encoding (dict))
960 switch (character_code)
963 dict_set_encoding (dict, "EBCDIC-US");
967 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
968 respectively. However, there are known to be many files
969 in the wild with character code 2, yet have data which are
971 Therefore we ignore these values.
975 dict_set_encoding (dict, "MS_KANJI");
978 dict_set_encoding (dict, "UTF-7");
981 dict_set_encoding (dict, "UTF-8");
986 snprintf (enc, 100, "CP%d", character_code);
987 dict_set_encoding (dict, enc);
994 /* Read record type 7, subtype 4. */
996 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
998 double sysmis = read_float (r);
999 double highest = read_float (r);
1000 double lowest = read_float (r);
1002 if (size != 8 || count != 3)
1003 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
1006 if (sysmis != SYSMIS)
1007 sys_warn (r, _("File specifies unexpected value %g as %s."),
1010 if (highest != HIGHEST)
1011 sys_warn (r, _("File specifies unexpected value %g as %s."),
1012 highest, "HIGHEST");
1014 if (lowest != LOWEST)
1015 sys_warn (r, _("File specifies unexpected value %g as %s."),
1019 /* Read record type 7, subtype 7 or 19. */
1021 read_mrsets (struct sfm_reader *r, size_t size, size_t count,
1022 struct dictionary *dict)
1024 struct text_record *text;
1025 struct mrset *mrset;
1027 text = open_text_record (r, size * count);
1030 const char *name, *label, *counted;
1031 struct stringi_set var_names;
1032 size_t allocated_vars;
1036 mrset = xzalloc (sizeof *mrset);
1038 name = text_get_token (text, ss_cstr ("="), NULL);
1041 mrset->name = xstrdup (name);
1043 if (text_match (text, 'C'))
1045 mrset->type = MRSET_MC;
1046 if (!text_match (text, ' '))
1048 sys_warn (r, _("Missing space following 'C' at offset %zu "
1049 "in MRSETS record"), text_pos (text));
1053 else if (text_match (text, 'D'))
1055 mrset->type = MRSET_MD;
1056 mrset->cat_source = MRSET_VARLABELS;
1058 else if (text_match (text, 'E'))
1062 mrset->type = MRSET_MD;
1063 mrset->cat_source = MRSET_COUNTEDVALUES;
1064 if (!text_match (text, ' '))
1066 sys_warn (r, _("Missing space following 'E' at offset %zu "
1067 "in MRSETS record"), text_pos (text));
1071 number = text_get_token (text, ss_cstr (" "), NULL);
1072 if (!strcmp (number, "11"))
1073 mrset->label_from_var_label = true;
1074 else if (strcmp (number, "1"))
1075 sys_warn (r, _("Unexpected label source value \"%s\" "
1076 "following 'E' at offset %zu in MRSETS record"),
1077 number, text_pos (text));
1081 sys_warn (r, _("Missing 'C', 'D', or 'E' at offset %zu "
1082 "in MRSETS record."),
1087 if (mrset->type == MRSET_MD)
1089 counted = text_parse_counted_string (r, text);
1090 if (counted == NULL)
1094 label = text_parse_counted_string (r, text);
1097 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
1099 stringi_set_init (&var_names);
1104 struct variable *var;
1105 const char *var_name;
1107 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1108 if (var_name == NULL)
1110 sys_warn (r, _("Missing new-line parsing variable names "
1111 "at offset %zu in MRSETS record."),
1116 var = lookup_var_by_short_name (dict, var_name);
1119 if (!stringi_set_insert (&var_names, var_name))
1121 sys_warn (r, _("Duplicate variable name %s "
1122 "at offset %zu in MRSETS record."),
1123 var_name, text_pos (text));
1127 if (mrset->label == NULL && mrset->label_from_var_label
1128 && var_has_label (var))
1129 mrset->label = xstrdup (var_get_label (var));
1132 && var_get_type (var) != var_get_type (mrset->vars[0]))
1134 sys_warn (r, _("MRSET %s contains both string and "
1135 "numeric variables."), name);
1138 width = MIN (width, var_get_width (var));
1140 if (mrset->n_vars >= allocated_vars)
1141 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1142 sizeof *mrset->vars);
1143 mrset->vars[mrset->n_vars++] = var;
1145 while (delimiter != '\n');
1147 if (mrset->n_vars < 2)
1149 sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
1151 mrset_destroy (mrset);
1155 if (mrset->type == MRSET_MD)
1157 mrset->width = width;
1158 value_init (&mrset->counted, width);
1160 mrset->counted.f = strtod (counted, NULL);
1162 value_copy_str_rpad (&mrset->counted, width,
1163 (const uint8_t *) counted, ' ');
1166 dict_add_mrset (dict, mrset);
1169 mrset_destroy (mrset);
1170 close_text_record (r, text);
1173 /* Read record type 7, subtype 11, which specifies how variables
1174 should be displayed in GUI environments. */
1176 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
1177 struct dictionary *dict)
1180 bool includes_width;
1181 bool warned = false;
1186 sys_warn (r, _("Bad size %zu on extension 11."), size);
1187 skip_bytes (r, size * count);
1191 n_vars = dict_get_var_cnt (dict);
1192 if (count == 3 * n_vars)
1193 includes_width = true;
1194 else if (count == 2 * n_vars)
1195 includes_width = false;
1198 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
1200 skip_bytes (r, size * count);
1204 for (i = 0; i < n_vars; ++i)
1206 struct variable *v = dict_get_var (dict, i);
1207 int measure = read_int (r);
1208 int width = includes_width ? read_int (r) : 0;
1209 int align = read_int (r);
1211 /* SPSS 14 sometimes seems to set string variables' measure
1213 if (0 == measure && var_is_alpha (v))
1216 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1219 sys_warn (r, _("Invalid variable display parameters "
1220 "for variable %zu (%s). "
1221 "Default parameters substituted."),
1222 i, var_get_name (v));
1227 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1228 : measure == 2 ? MEASURE_ORDINAL
1230 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1231 : align == 1 ? ALIGN_RIGHT
1234 /* Older versions (SPSS 9.0) sometimes set the display
1235 width to zero. This causes confusion in the GUI, so
1236 only set the width if it is nonzero. */
1238 var_set_display_width (v, width);
1242 /* Reads record type 7, subtype 13, which gives the long name
1243 that corresponds to each short name. Modifies variable names
1244 in DICT accordingly. */
1246 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
1247 struct dictionary *dict)
1249 struct text_record *text;
1250 struct variable *var;
1253 text = open_text_record (r, size * count);
1254 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1257 size_t short_name_cnt;
1260 /* Validate long name. */
1261 if (!var_is_valid_name (long_name, false))
1263 sys_warn (r, _("Long variable mapping from %s to invalid "
1264 "variable name `%s'."),
1265 var_get_name (var), long_name);
1269 /* Identify any duplicates. */
1270 if (strcasecmp (var_get_short_name (var, 0), long_name)
1271 && dict_lookup_var (dict, long_name) != NULL)
1273 sys_warn (r, _("Duplicate long variable name `%s' "
1274 "within system file."), long_name);
1278 /* Renaming a variable may clear its short names, but we
1279 want to retain them, so we save them and re-set them
1281 short_name_cnt = var_get_short_name_cnt (var);
1282 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
1283 for (i = 0; i < short_name_cnt; i++)
1285 const char *s = var_get_short_name (var, i);
1286 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1289 /* Set long name. */
1290 dict_rename_var (dict, var, long_name);
1292 /* Restore short names. */
1293 for (i = 0; i < short_name_cnt; i++)
1295 var_set_short_name (var, i, short_names[i]);
1296 free (short_names[i]);
1300 close_text_record (r, text);
1301 r->has_long_var_names = true;
1304 /* Reads record type 7, subtype 14, which gives the real length
1305 of each very long string. Rearranges DICT accordingly. */
1307 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1308 struct dictionary *dict)
1310 struct text_record *text;
1311 struct variable *var;
1314 text = open_text_record (r, size * count);
1315 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1317 size_t idx = var_get_dict_index (var);
1323 length = strtol (length_s, NULL, 10);
1324 if (length < 1 || length > MAX_STRING)
1326 sys_warn (r, _("%s listed as string of invalid length %s "
1327 "in very length string record."),
1328 var_get_name (var), length_s);
1332 /* Check segments. */
1333 segment_cnt = sfm_width_to_segments (length);
1334 if (segment_cnt == 1)
1336 sys_warn (r, _("%s listed in very long string record with width %s, "
1337 "which requires only one segment."),
1338 var_get_name (var), length_s);
1341 if (idx + segment_cnt > dict_get_var_cnt (dict))
1342 sys_error (r, _("Very long string %s overflows dictionary."),
1343 var_get_name (var));
1345 /* Get the short names from the segments and check their
1347 for (i = 0; i < segment_cnt; i++)
1349 struct variable *seg = dict_get_var (dict, idx + i);
1350 int alloc_width = sfm_segment_alloc_width (length, i);
1351 int width = var_get_width (seg);
1354 var_set_short_name (var, i, var_get_short_name (seg, 0));
1355 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1356 sys_error (r, _("Very long string with width %ld has segment %d "
1357 "of width %d (expected %d)"),
1358 length, i, width, alloc_width);
1360 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1361 var_set_width (var, length);
1363 close_text_record (r, text);
1364 dict_compact_values (dict);
1367 /* Reads value labels from sysfile H and inserts them into the
1368 associated dictionary. */
1370 read_value_labels (struct sfm_reader *r,
1371 struct dictionary *dict, struct variable **var_by_value_idx)
1373 struct pool *subpool;
1377 uint8_t raw_value[8]; /* Value as uninterpreted bytes. */
1378 union value value; /* Value. */
1379 char *label; /* Null-terminated label string. */
1382 struct label *labels = NULL;
1383 int label_cnt; /* Number of labels. */
1385 struct variable **var = NULL; /* Associated variables. */
1386 int var_cnt; /* Number of associated variables. */
1387 int max_width; /* Maximum width of string variables. */
1391 subpool = pool_create_subpool (r->pool);
1393 /* Read the type 3 record and record its contents. We can't do
1394 much with the data yet because we don't know whether it is
1395 of numeric or string type. */
1397 /* Read number of labels. */
1398 label_cnt = read_int (r);
1400 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1402 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1407 /* Read each value/label tuple into labels[]. */
1408 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1409 for (i = 0; i < label_cnt; i++)
1411 struct label *label = labels + i;
1412 unsigned char label_len;
1416 read_bytes (r, label->raw_value, sizeof label->raw_value);
1418 /* Read label length. */
1419 read_bytes (r, &label_len, sizeof label_len);
1420 padded_len = ROUND_UP (label_len + 1, 8);
1422 /* Read label, padding. */
1423 label->label = pool_alloc (subpool, padded_len + 1);
1424 read_bytes (r, label->label, padded_len - 1);
1425 label->label[label_len] = 0;
1428 /* Now, read the type 4 record that has the list of variables
1429 to which the value labels are to be applied. */
1431 /* Read record type of type 4 record. */
1432 if (read_int (r) != 4)
1433 sys_error (r, _("Variable index record (type 4) does not immediately "
1434 "follow value label record (type 3) as it should."));
1436 /* Read number of variables associated with value label from type 4
1438 var_cnt = read_int (r);
1439 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1440 sys_error (r, _("Number of variables associated with a value label (%d) "
1441 "is not between 1 and the number of variables (%zu)."),
1442 var_cnt, dict_get_var_cnt (dict));
1444 /* Read the list of variables. */
1445 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1447 for (i = 0; i < var_cnt; i++)
1449 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1450 if (var_get_width (var[i]) > 8)
1451 sys_error (r, _("Value labels may not be added to long string "
1452 "variables (e.g. %s) using records types 3 and 4."),
1453 var_get_name (var[i]));
1454 max_width = MAX (max_width, var_get_width (var[i]));
1457 /* Type check the variables. */
1458 for (i = 1; i < var_cnt; i++)
1459 if (var_get_type (var[i]) != var_get_type (var[0]))
1460 sys_error (r, _("Variables associated with value label are not all of "
1461 "identical type. Variable %s is %s, but variable "
1463 var_get_name (var[0]),
1464 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1465 var_get_name (var[i]),
1466 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1468 /* Fill in labels[].value, now that we know the desired type. */
1469 for (i = 0; i < label_cnt; i++)
1471 struct label *label = labels + i;
1473 value_init_pool (subpool, &label->value, max_width);
1474 if (var_is_alpha (var[0]))
1475 u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
1476 label->raw_value, sizeof label->raw_value, ' ');
1478 label->value.f = float_get_double (r->float_format, label->raw_value);
1481 /* Assign the `value_label's to each variable. */
1482 for (i = 0; i < var_cnt; i++)
1484 struct variable *v = var[i];
1487 /* Add each label to the variable. */
1488 for (j = 0; j < label_cnt; j++)
1490 struct label *label = &labels[j];
1491 if (!var_add_value_label (v, &label->value, label->label))
1493 if (var_is_numeric (var[0]))
1494 sys_warn (r, _("Duplicate value label for %g on %s."),
1495 label->value.f, var_get_name (v));
1497 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1498 max_width, value_str (&label->value, max_width),
1504 pool_destroy (subpool);
1507 /* Reads a set of custom attributes from TEXT into ATTRS.
1508 ATTRS may be a null pointer, in which case the attributes are
1509 read but discarded. */
1511 read_attributes (struct sfm_reader *r, struct text_record *text,
1512 struct attrset *attrs)
1516 struct attribute *attr;
1520 /* Parse the key. */
1521 key = text_get_token (text, ss_cstr ("("), NULL);
1525 attr = attribute_create (key);
1526 for (index = 1; ; index++)
1528 /* Parse the value. */
1532 value = text_get_token (text, ss_cstr ("\n"), NULL);
1535 text_warn (r, text, _("Error parsing attribute value %s[%d]"),
1540 length = strlen (value);
1541 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1543 value[length - 1] = '\0';
1544 attribute_add_value (attr, value + 1);
1549 _("Attribute value %s[%d] is not quoted: %s"),
1551 attribute_add_value (attr, value);
1554 /* Was this the last value for this attribute? */
1555 if (text_match (text, ')'))
1559 attrset_add (attrs, attr);
1561 attribute_destroy (attr);
1563 while (!text_match (text, '/'));
1566 /* Reads record type 7, subtype 17, which lists custom
1567 attributes on the data file. */
1569 read_data_file_attributes (struct sfm_reader *r,
1570 size_t size, size_t count,
1571 struct dictionary *dict)
1573 struct text_record *text = open_text_record (r, size * count);
1574 read_attributes (r, text, dict_get_attributes (dict));
1575 close_text_record (r, text);
1579 skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
1583 for (i = 0; i < n_labels; i++)
1585 size_t value_length, label_length;
1587 value_length = read_int (r);
1588 skip_bytes (r, value_length);
1589 label_length = read_int (r);
1590 skip_bytes (r, label_length);
1595 read_long_string_value_labels (struct sfm_reader *r,
1596 size_t size, size_t count,
1597 struct dictionary *d)
1599 const off_t start = ftello (r->file);
1600 while (ftello (r->file) - start < size * count)
1602 char var_name[VAR_NAME_LEN + 1];
1610 var_name_len = read_int (r);
1611 if (var_name_len > VAR_NAME_LEN)
1612 sys_error (r, _("Variable name length in long string value label "
1613 "record (%d) exceeds %d-byte limit."),
1614 var_name_len, VAR_NAME_LEN);
1615 read_string (r, var_name, var_name_len + 1);
1616 width = read_int (r);
1617 n_labels = read_int (r);
1619 v = dict_lookup_var (d, var_name);
1622 sys_warn (r, _("Ignoring long string value record for "
1623 "unknown variable %s."), var_name);
1624 skip_long_string_value_labels (r, n_labels);
1627 if (var_is_numeric (v))
1629 sys_warn (r, _("Ignoring long string value record for "
1630 "numeric variable %s."), var_name);
1631 skip_long_string_value_labels (r, n_labels);
1634 if (width != var_get_width (v))
1636 sys_warn (r, _("Ignoring long string value record for variable %s "
1637 "because the record's width (%d) does not match the "
1638 "variable's width (%d)"),
1639 var_name, width, var_get_width (v));
1640 skip_long_string_value_labels (r, n_labels);
1645 value_init_pool (r->pool, &value, width);
1646 for (i = 0; i < n_labels; i++)
1648 size_t value_length, label_length;
1653 value_length = read_int (r);
1654 if (value_length == width)
1655 read_bytes (r, value_str_rw (&value, width), width);
1658 sys_warn (r, _("Ignoring long string value %zu for variable %s, "
1659 "with width %d, that has bad value width %zu."),
1660 i, var_get_name (v), width, value_length);
1661 skip_bytes (r, value_length);
1666 label_length = read_int (r);
1667 read_string (r, label, MIN (sizeof label, label_length + 1));
1668 if (label_length >= sizeof label)
1670 /* Skip and silently ignore label text after the
1671 first 255 bytes. The maximum documented length
1672 of a label is 120 bytes so this is more than
1674 skip_bytes (r, sizeof label - (label_length + 1));
1677 if (!skip && !var_add_value_label (v, &value, label))
1678 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1679 width, value_str (&value, width), var_get_name (v));
1685 /* Reads record type 7, subtype 18, which lists custom
1686 attributes on individual variables. */
1688 read_variable_attributes (struct sfm_reader *r,
1689 size_t size, size_t count,
1690 struct dictionary *dict)
1692 struct text_record *text = open_text_record (r, size * count);
1695 struct variable *var;
1696 if (!text_read_short_name (r, dict, text, ss_cstr (":"), &var))
1698 read_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1700 close_text_record (r, text);
1706 static void partial_record (struct sfm_reader *r)
1709 static void read_error (struct casereader *, const struct sfm_reader *);
1711 static bool read_case_number (struct sfm_reader *, double *);
1712 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1713 static int read_opcode (struct sfm_reader *);
1714 static bool read_compressed_number (struct sfm_reader *, double *);
1715 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1716 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1717 static bool skip_whole_strings (struct sfm_reader *, size_t);
1719 /* Reads and returns one case from READER's file. Returns a null
1720 pointer if not successful. */
1721 static struct ccase *
1722 sys_file_casereader_read (struct casereader *reader, void *r_)
1724 struct sfm_reader *r = r_;
1725 struct ccase *volatile c;
1731 c = case_create (r->proto);
1732 if (setjmp (r->bail_out))
1734 casereader_force_error (reader);
1739 for (i = 0; i < r->sfm_var_cnt; i++)
1741 struct sfm_var *sv = &r->sfm_vars[i];
1742 union value *v = case_data_rw_idx (c, sv->case_index);
1744 if (sv->var_width == 0)
1746 if (!read_case_number (r, &v->f))
1751 uint8_t *s = value_str_rw (v, sv->var_width);
1752 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1754 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1764 if (r->case_cnt != -1)
1765 read_error (reader, r);
1769 /* Issues an error that R ends in a partial record. */
1771 partial_record (struct sfm_reader *r)
1773 sys_error (r, _("File ends in partial case."));
1776 /* Issues an error that an unspecified error occurred SFM, and
1779 read_error (struct casereader *r, const struct sfm_reader *sfm)
1781 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1782 casereader_force_error (r);
1785 /* Reads a number from R and stores its value in *D.
1786 If R is compressed, reads a compressed number;
1787 otherwise, reads a number in the regular way.
1788 Returns true if successful, false if end of file is
1789 reached immediately. */
1791 read_case_number (struct sfm_reader *r, double *d)
1796 if (!try_read_bytes (r, number, sizeof number))
1798 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1802 return read_compressed_number (r, d);
1805 /* Reads LENGTH string bytes from R into S.
1806 Always reads a multiple of 8 bytes; if LENGTH is not a
1807 multiple of 8, then extra bytes are read and discarded without
1809 Reads compressed strings if S is compressed.
1810 Returns true if successful, false if end of file is
1811 reached immediately. */
1813 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
1815 size_t whole = ROUND_DOWN (length, 8);
1816 size_t partial = length % 8;
1820 if (!read_whole_strings (r, s, whole))
1827 if (!read_whole_strings (r, bounce, sizeof bounce))
1833 memcpy (s + whole, bounce, partial);
1839 /* Reads and returns the next compression opcode from R. */
1841 read_opcode (struct sfm_reader *r)
1843 assert (r->compressed);
1847 if (r->opcode_idx >= sizeof r->opcodes)
1849 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1853 opcode = r->opcodes[r->opcode_idx++];
1860 /* Reads a compressed number from R and stores its value in D.
1861 Returns true if successful, false if end of file is
1862 reached immediately. */
1864 read_compressed_number (struct sfm_reader *r, double *d)
1866 int opcode = read_opcode (r);
1874 *d = read_float (r);
1878 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1879 if (!r->corruption_warning)
1881 r->corruption_warning = true;
1882 sys_warn (r, _("Possible compressed data corruption: "
1883 "compressed spaces appear in numeric field."));
1892 *d = opcode - r->bias;
1899 /* Reads a compressed 8-byte string segment from R and stores it
1901 Returns true if successful, false if end of file is
1902 reached immediately. */
1904 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
1906 int opcode = read_opcode (r);
1914 read_bytes (r, dst, 8);
1918 memset (dst, ' ', 8);
1923 double value = opcode - r->bias;
1924 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1927 /* This has actually been seen "in the wild". The submitter of the
1928 file that showed that the contents decoded as spaces, but they
1929 were at the end of the field so it's possible that the null
1930 bytes just acted as null terminators. */
1932 else if (!r->corruption_warning)
1934 r->corruption_warning = true;
1935 sys_warn (r, _("Possible compressed data corruption: "
1936 "string contains compressed integer (opcode %d)"),
1946 /* Reads LENGTH string bytes from R into S.
1947 LENGTH must be a multiple of 8.
1948 Reads compressed strings if S is compressed.
1949 Returns true if successful, false if end of file is
1950 reached immediately. */
1952 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
1954 assert (length % 8 == 0);
1956 return try_read_bytes (r, s, length);
1960 for (ofs = 0; ofs < length; ofs += 8)
1961 if (!read_compressed_string (r, s + ofs))
1971 /* Skips LENGTH string bytes from R.
1972 LENGTH must be a multiple of 8.
1973 (LENGTH is also limited to 1024, but that's only because the
1974 current caller never needs more than that many bytes.)
1975 Returns true if successful, false if end of file is
1976 reached immediately. */
1978 skip_whole_strings (struct sfm_reader *r, size_t length)
1980 uint8_t buffer[1024];
1981 assert (length < sizeof buffer);
1982 return read_whole_strings (r, buffer, length);
1985 /* Creates and returns a table that can be used for translating a value
1986 index into a case to a "struct variable *" for DICT. Multiple
1987 system file fields reference variables this way.
1989 This table must be created before processing the very long
1990 string extension record, because that record causes some
1991 values to be deleted from the case and the dictionary to be
1993 static struct variable **
1994 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
1996 struct variable **var_by_value_idx;
2000 var_by_value_idx = pool_nmalloc (r->pool,
2001 r->oct_cnt, sizeof *var_by_value_idx);
2002 for (i = 0; i < dict_get_var_cnt (dict); i++)
2004 struct variable *v = dict_get_var (dict, i);
2005 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
2008 var_by_value_idx[value_idx++] = v;
2009 for (j = 1; j < nv; j++)
2010 var_by_value_idx[value_idx++] = NULL;
2012 assert (value_idx == r->oct_cnt);
2014 return var_by_value_idx;
2017 /* Returns the "struct variable" corresponding to the given
2018 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
2020 static struct variable *
2021 lookup_var_by_value_idx (struct sfm_reader *r,
2022 struct variable **var_by_value_idx, int value_idx)
2024 struct variable *var;
2026 if (value_idx < 1 || value_idx > r->oct_cnt)
2027 sys_error (r, _("Variable index %d not in valid range 1...%d."),
2028 value_idx, r->oct_cnt);
2030 var = var_by_value_idx[value_idx - 1];
2032 sys_error (r, _("Variable index %d refers to long string "
2039 /* Returns the variable in D with the given SHORT_NAME,
2040 or a null pointer if there is none. */
2041 static struct variable *
2042 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
2044 struct variable *var;
2048 /* First try looking up by full name. This often succeeds. */
2049 var = dict_lookup_var (d, short_name);
2050 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
2053 /* Iterate through the whole dictionary as a fallback. */
2054 var_cnt = dict_get_var_cnt (d);
2055 for (i = 0; i < var_cnt; i++)
2057 var = dict_get_var (d, i);
2058 if (!strcasecmp (var_get_short_name (var, 0), short_name))
2065 /* Helpers for reading records that contain structured text
2068 /* Maximum number of warnings to issue for a single text
2070 #define MAX_TEXT_WARNINGS 5
2075 struct substring buffer; /* Record contents. */
2076 size_t pos; /* Current position in buffer. */
2077 int n_warnings; /* Number of warnings issued or suppressed. */
2080 /* Reads SIZE bytes into a text record for R,
2081 and returns the new text record. */
2082 static struct text_record *
2083 open_text_record (struct sfm_reader *r, size_t size)
2085 struct text_record *text = pool_alloc (r->pool, sizeof *text);
2086 char *buffer = pool_malloc (r->pool, size + 1);
2087 read_bytes (r, buffer, size);
2088 text->buffer = ss_buffer (buffer, size);
2090 text->n_warnings = 0;
2094 /* Closes TEXT, frees its storage, and issues a final warning
2095 about suppressed warnings if necesary. */
2097 close_text_record (struct sfm_reader *r, struct text_record *text)
2099 if (text->n_warnings > MAX_TEXT_WARNINGS)
2100 sys_warn (r, _("Suppressed %d additional related warnings."),
2101 text->n_warnings - MAX_TEXT_WARNINGS);
2102 pool_free (r->pool, ss_data (text->buffer));
2105 /* Reads a variable=value pair from TEXT.
2106 Looks up the variable in DICT and stores it into *VAR.
2107 Stores a null-terminated value into *VALUE. */
2109 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2110 struct text_record *text,
2111 struct variable **var, char **value)
2115 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2118 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2122 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2123 ss_buffer ("\t\0", 2));
2131 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2132 struct text_record *text, struct substring delimiters,
2133 struct variable **var)
2135 char *short_name = text_get_token (text, delimiters, NULL);
2136 if (short_name == NULL)
2139 *var = lookup_var_by_short_name (dict, short_name);
2141 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2146 /* Displays a warning for the current file position, limiting the
2147 number to MAX_TEXT_WARNINGS for TEXT. */
2149 text_warn (struct sfm_reader *r, struct text_record *text,
2150 const char *format, ...)
2152 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2156 va_start (args, format);
2157 sys_msg (r, MW, format, args);
2163 text_get_token (struct text_record *text, struct substring delimiters,
2166 struct substring token;
2169 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2172 end = &ss_data (token)[ss_length (token)];
2173 if (delimiter != NULL)
2176 return ss_data (token);
2179 /* Reads a integer value expressed in decimal, then a space, then a string that
2180 consists of exactly as many bytes as specified by the integer, then a space,
2181 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2182 buffer (so the caller should not free the string). */
2184 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2194 int c = text->buffer.string[text->pos];
2195 if (c < '0' || c > '9')
2197 n = (n * 10) + (c - '0');
2200 if (start == text->pos)
2202 sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
2207 if (!text_match (text, ' '))
2209 sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
2214 if (text->pos + n > text->buffer.length)
2216 sys_warn (r, _("%zu-byte string starting at offset %zu "
2217 "exceeds record length %zu."),
2218 n, text->pos, text->buffer.length);
2222 s = &text->buffer.string[text->pos];
2226 _("Expecting space at offset %zu following %zu-byte string."),
2236 text_match (struct text_record *text, char c)
2238 if (text->buffer.string[text->pos] == c)
2247 /* Returns the current byte offset inside the TEXT's string. */
2249 text_pos (const struct text_record *text)
2256 /* Displays a corruption message. */
2258 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
2263 ds_init_empty (&text);
2264 ds_put_format (&text, "\"%s\" near offset 0x%llx: ",
2265 fh_get_file_name (r->fh), (long long int) ftello (r->file));
2266 ds_put_vformat (&text, format, args);
2268 m.category = msg_class_to_category (class);
2269 m.severity = msg_class_to_severity (class);
2270 m.where.file_name = NULL;
2271 m.where.line_number = 0;
2272 m.text = ds_cstr (&text);
2277 /* Displays a warning for the current file position. */
2279 sys_warn (struct sfm_reader *r, const char *format, ...)
2283 va_start (args, format);
2284 sys_msg (r, MW, format, args);
2288 /* Displays an error for the current file position,
2289 marks it as in an error state,
2290 and aborts reading it using longjmp. */
2292 sys_error (struct sfm_reader *r, const char *format, ...)
2296 va_start (args, format);
2297 sys_msg (r, ME, format, args);
2301 longjmp (r->bail_out, 1);
2304 /* Reads BYTE_CNT bytes into BUF.
2305 Returns true if exactly BYTE_CNT bytes are successfully read.
2306 Aborts if an I/O error or a partial read occurs.
2307 If EOF_IS_OK, then an immediate end-of-file causes false to be
2308 returned; otherwise, immediate end-of-file causes an abort
2311 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2312 void *buf, size_t byte_cnt)
2314 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2315 if (bytes_read == byte_cnt)
2317 else if (ferror (r->file))
2318 sys_error (r, _("System error: %s."), strerror (errno));
2319 else if (!eof_is_ok || bytes_read != 0)
2320 sys_error (r, _("Unexpected end of file."));
2325 /* Reads BYTE_CNT into BUF.
2326 Aborts upon I/O error or if end-of-file is encountered. */
2328 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2330 read_bytes_internal (r, false, buf, byte_cnt);
2333 /* Reads BYTE_CNT bytes into BUF.
2334 Returns true if exactly BYTE_CNT bytes are successfully read.
2335 Returns false if an immediate end-of-file is encountered.
2336 Aborts if an I/O error or a partial read occurs. */
2338 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2340 return read_bytes_internal (r, true, buf, byte_cnt);
2343 /* Reads a 32-bit signed integer from R and returns its value in
2346 read_int (struct sfm_reader *r)
2349 read_bytes (r, integer, sizeof integer);
2350 return integer_get (r->integer_format, integer, sizeof integer);
2353 /* Reads a 64-bit floating-point number from R and returns its
2354 value in host format. */
2356 read_float (struct sfm_reader *r)
2359 read_bytes (r, number, sizeof number);
2360 return float_get_double (r->float_format, number);
2363 /* Reads exactly SIZE - 1 bytes into BUFFER
2364 and stores a null byte into BUFFER[SIZE - 1]. */
2366 read_string (struct sfm_reader *r, char *buffer, size_t size)
2369 read_bytes (r, buffer, size - 1);
2370 buffer[size - 1] = '\0';
2373 /* Skips BYTES bytes forward in R. */
2375 skip_bytes (struct sfm_reader *r, size_t bytes)
2380 size_t chunk = MIN (sizeof buffer, bytes);
2381 read_bytes (r, buffer, chunk);
2386 static const struct casereader_class sys_file_casereader_class =
2388 sys_file_casereader_read,
2389 sys_file_casereader_destroy,