1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/hash.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/minmax.h"
56 #include "gl/unlocked-io.h"
57 #include "gl/xalloc.h"
61 #define _(msgid) gettext (msgid)
62 #define N_(msgid) (msgid)
64 /* System file reader. */
67 /* Resource tracking. */
68 struct pool *pool; /* All system file state. */
69 jmp_buf bail_out; /* longjmp() target for error handling. */
72 struct file_handle *fh; /* File handle. */
73 struct fh_lock *lock; /* Mutual exclusion for file handle. */
74 FILE *file; /* File stream. */
75 bool error; /* I/O or corruption error? */
76 struct caseproto *proto; /* Format of output cases. */
79 enum integer_format integer_format; /* On-disk integer format. */
80 enum float_format float_format; /* On-disk floating point format. */
81 int oct_cnt; /* Number of 8-byte units per case. */
82 struct sfm_var *sfm_vars; /* Variables. */
83 size_t sfm_var_cnt; /* Number of variables. */
84 casenumber case_cnt; /* Number of cases */
85 bool has_long_var_names; /* File has a long variable name map */
88 bool compressed; /* File is compressed? */
89 double bias; /* Compression bias, usually 100.0. */
90 uint8_t opcodes[8]; /* Current block of opcodes. */
91 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
92 bool corruption_warning; /* Warned about possible corruption? */
95 static const struct casereader_class sys_file_casereader_class;
97 static bool close_reader (struct sfm_reader *);
99 static struct variable **make_var_by_value_idx (struct sfm_reader *,
100 struct dictionary *);
101 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
104 static struct variable *lookup_var_by_short_name (struct dictionary *,
105 const char *short_name);
107 static void sys_msg (struct sfm_reader *r, int class,
108 const char *format, va_list args)
109 PRINTF_FORMAT (3, 0);
110 static void sys_warn (struct sfm_reader *, const char *, ...)
111 PRINTF_FORMAT (2, 3);
112 static void sys_error (struct sfm_reader *, const char *, ...)
116 static void read_bytes (struct sfm_reader *, void *, size_t);
117 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
118 static int read_int (struct sfm_reader *);
119 static double read_float (struct sfm_reader *);
120 static void read_string (struct sfm_reader *, char *, size_t);
121 static void skip_bytes (struct sfm_reader *, size_t);
123 static struct text_record *open_text_record (struct sfm_reader *, size_t size);
124 static void close_text_record (struct sfm_reader *r,
125 struct text_record *);
126 static bool read_variable_to_value_pair (struct sfm_reader *,
128 struct text_record *,
129 struct variable **var, char **value);
130 static void text_warn (struct sfm_reader *r, struct text_record *text,
131 const char *format, ...)
132 PRINTF_FORMAT (3, 4);
133 static char *text_get_token (struct text_record *,
134 struct substring delimiters, char *delimiter);
135 static bool text_match (struct text_record *, char c);
136 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
137 struct text_record *,
138 struct substring delimiters,
140 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
141 struct text_record *,
142 struct substring delimiters,
144 static const char *text_parse_counted_string (struct sfm_reader *,
145 struct text_record *);
146 static size_t text_pos (const struct text_record *);
148 static bool close_reader (struct sfm_reader *r);
150 /* Dictionary reader. */
158 static void read_header (struct sfm_reader *, struct dictionary *,
159 int *weight_idx, int *claimed_oct_cnt,
160 struct sfm_read_info *);
161 static void read_variable_record (struct sfm_reader *, struct dictionary *,
162 int *format_warning_cnt);
163 static void parse_format_spec (struct sfm_reader *, unsigned int,
164 enum which_format, struct variable *,
165 int *format_warning_cnt);
166 static void setup_weight (struct sfm_reader *, int weight_idx,
167 struct variable **var_by_value_idx,
168 struct dictionary *);
169 static void read_documents (struct sfm_reader *, struct dictionary *);
170 static void read_value_labels (struct sfm_reader *, struct dictionary *,
171 struct variable **var_by_value_idx);
173 static void read_extension_record (struct sfm_reader *, struct dictionary *,
174 struct sfm_read_info *);
175 static void read_machine_integer_info (struct sfm_reader *,
176 size_t size, size_t count,
177 struct sfm_read_info *,
180 static void read_machine_float_info (struct sfm_reader *,
181 size_t size, size_t count);
182 static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
183 struct dictionary *);
184 static void read_display_parameters (struct sfm_reader *,
185 size_t size, size_t count,
186 struct dictionary *);
187 static void read_long_var_name_map (struct sfm_reader *,
188 size_t size, size_t count,
189 struct dictionary *);
190 static void read_long_string_map (struct sfm_reader *,
191 size_t size, size_t count,
192 struct dictionary *);
193 static void read_data_file_attributes (struct sfm_reader *,
194 size_t size, size_t count,
195 struct dictionary *);
196 static void read_variable_attributes (struct sfm_reader *,
197 size_t size, size_t count,
198 struct dictionary *);
199 static void read_long_string_value_labels (struct sfm_reader *,
200 size_t size, size_t count,
201 struct dictionary *);
203 /* Convert all the strings in DICT from the dict encoding to UTF8 */
205 recode_strings (struct dictionary *dict)
209 const char *enc = dict_get_encoding (dict);
212 enc = get_default_encoding ();
214 for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
216 /* Convert the long variable name */
217 struct variable *var = dict_get_var (dict, i);
218 const char *native_name = var_get_name (var);
219 char *utf8_name = recode_string (UTF8, enc, native_name, -1);
220 if ( 0 != strcmp (utf8_name, native_name))
222 if ( NULL == dict_lookup_var (dict, utf8_name))
223 dict_rename_var (dict, var, utf8_name);
226 _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
231 /* Convert the variable label */
232 if (var_has_label (var))
234 char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
235 var_set_label (var, utf8_label);
239 if (var_has_value_labels (var))
241 const struct val_lab *vl = NULL;
242 const struct val_labs *vlabs = var_get_value_labels (var);
244 for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
246 const union value *val = val_lab_get_value (vl);
247 const char *label = val_lab_get_label (vl);
248 char *new_label = NULL;
250 new_label = recode_string (UTF8, enc, label, -1);
252 var_replace_value_label (var, val, new_label);
259 /* Opens the system file designated by file handle FH for
260 reading. Reads the system file's dictionary into *DICT.
261 If INFO is non-null, then it receives additional info about the
264 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
265 struct sfm_read_info *volatile info)
267 struct sfm_reader *volatile r = NULL;
268 struct variable **var_by_value_idx;
269 struct sfm_read_info local_info;
270 int format_warning_cnt = 0;
275 *dict = dict_create ();
277 /* Create and initialize reader. */
278 r = pool_create_container (struct sfm_reader, pool);
284 r->has_long_var_names = false;
285 r->opcode_idx = sizeof r->opcodes;
286 r->corruption_warning = false;
288 /* TRANSLATORS: this fragment will be interpolated into
289 messages in fh_lock() that identify types of files. */
290 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
294 r->file = fn_open (fh_get_file_name (fh), "rb");
297 msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
298 fh_get_file_name (r->fh), strerror (errno));
302 /* Initialize info. */
305 memset (info, 0, sizeof *info);
307 if (setjmp (r->bail_out))
312 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
314 /* Read all the variable definition records. */
315 rec_type = read_int (r);
316 while (rec_type == 2)
318 read_variable_record (r, *dict, &format_warning_cnt);
319 rec_type = read_int (r);
322 /* Figure out the case format. */
323 var_by_value_idx = make_var_by_value_idx (r, *dict);
324 setup_weight (r, weight_idx, var_by_value_idx, *dict);
326 /* Read all the rest of the dictionary records. */
327 while (rec_type != 999)
332 read_value_labels (r, *dict, var_by_value_idx);
336 sys_error (r, _("Misplaced type 4 record."));
339 read_documents (r, *dict);
343 read_extension_record (r, *dict, info);
347 sys_error (r, _("Unrecognized record type %d."), rec_type);
349 rec_type = read_int (r);
353 if ( ! r->has_long_var_names )
356 for (i = 0; i < dict_get_var_cnt (*dict); i++)
358 struct variable *var = dict_get_var (*dict, i);
359 char short_name[SHORT_NAME_LEN + 1];
360 char long_name[SHORT_NAME_LEN + 1];
362 strcpy (short_name, var_get_name (var));
364 strcpy (long_name, short_name);
365 str_lowercase (long_name);
367 /* Set long name. Renaming a variable may clear the short
368 name, but we want to retain it, so re-set it
370 dict_rename_var (*dict, var, long_name);
371 var_set_short_name (var, 0, short_name);
374 r->has_long_var_names = true;
377 recode_strings (*dict);
379 /* Read record 999 data, which is just filler. */
382 /* Warn if the actual amount of data per case differs from the
383 amount that the header claims. SPSS version 13 gets this
384 wrong when very long strings are involved, so don't warn in
386 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
387 && info->version_major != 13)
388 sys_warn (r, _("File header claims %d variable positions but "
389 "%d were read from file."),
390 claimed_oct_cnt, r->oct_cnt);
392 /* Create an index of dictionary variable widths for
393 sfm_read_case to use. We cannot use the `struct variable's
394 from the dictionary we created, because the caller owns the
395 dictionary and may destroy or modify its variables. */
396 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
397 pool_register (r->pool, free, r->sfm_vars);
398 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
400 pool_free (r->pool, var_by_value_idx);
401 return casereader_create_sequential
403 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
404 &sys_file_casereader_class, r);
408 dict_destroy (*dict);
413 /* Closes a system file after we're done with it.
414 Returns true if an I/O error has occurred on READER, false
417 close_reader (struct sfm_reader *r)
426 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
428 msg (ME, _("Error closing system file \"%s\": %s."),
429 fh_get_file_name (r->fh), strerror (errno));
439 pool_destroy (r->pool);
444 /* Destroys READER. */
446 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
448 struct sfm_reader *r = r_;
452 /* Returns true if FILE is an SPSS system file,
455 sfm_detect (FILE *file)
459 if (fread (rec_type, 4, 1, file) != 1)
463 return !strcmp ("$FL2", rec_type);
466 /* Reads the global header of the system file.
467 Sets DICT's file label to the system file's label.
468 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
469 or to the value index of the weight variable otherwise.
470 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
471 per case that the file claims to have (although it is not
473 Initializes INFO with header information. */
475 read_header (struct sfm_reader *r, struct dictionary *dict,
476 int *weight_idx, int *claimed_oct_cnt,
477 struct sfm_read_info *info)
480 char eye_catcher[61];
481 uint8_t raw_layout_code[4];
483 char creation_date[10];
484 char creation_time[9];
486 struct substring file_label_ss;
487 struct substring product;
489 read_string (r, rec_type, sizeof rec_type);
490 read_string (r, eye_catcher, sizeof eye_catcher);
492 if (strcmp ("$FL2", rec_type) != 0)
493 sys_error (r, _("This is not an SPSS system file."));
495 /* Identify integer format. */
496 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
497 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
499 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
501 || (r->integer_format != INTEGER_MSB_FIRST
502 && r->integer_format != INTEGER_LSB_FIRST))
503 sys_error (r, _("This is not an SPSS system file."));
505 *claimed_oct_cnt = read_int (r);
506 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
507 *claimed_oct_cnt = -1;
509 r->compressed = read_int (r) != 0;
511 *weight_idx = read_int (r);
513 r->case_cnt = read_int (r);
514 if ( r->case_cnt > INT_MAX / 2)
518 /* Identify floating-point format and obtain compression bias. */
519 read_bytes (r, raw_bias, sizeof raw_bias);
520 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
522 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
524 if (memcmp (raw_bias, zero_bias, 8))
525 sys_warn (r, _("Compression bias is not the usual "
526 "value of 100, or system file uses unrecognized "
527 "floating-point format."));
530 /* Some software is known to write all-zeros to this
531 field. Such software also writes floating-point
532 numbers in the format that we expect by default
533 (it seems that all software most likely does, in
534 reality), so don't warn in this case. */
537 if (r->integer_format == INTEGER_MSB_FIRST)
538 r->float_format = FLOAT_IEEE_DOUBLE_BE;
540 r->float_format = FLOAT_IEEE_DOUBLE_LE;
542 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
544 read_string (r, creation_date, sizeof creation_date);
545 read_string (r, creation_time, sizeof creation_time);
546 read_string (r, file_label, sizeof file_label);
549 file_label_ss = ss_cstr (file_label);
550 ss_trim (&file_label_ss, ss_cstr (" "));
551 if (!ss_is_empty (file_label_ss))
553 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
554 dict_set_label (dict, ss_data (file_label_ss));
557 strcpy (info->creation_date, creation_date);
558 strcpy (info->creation_time, creation_time);
559 info->integer_format = r->integer_format;
560 info->float_format = r->float_format;
561 info->compressed = r->compressed;
562 info->case_cnt = r->case_cnt;
564 product = ss_cstr (eye_catcher);
565 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
566 ss_trim (&product, ss_cstr (" "));
567 str_copy_buf_trunc (info->product, sizeof info->product,
568 ss_data (product), ss_length (product));
571 /* Reads a variable (type 2) record from R and adds the
572 corresponding variable to DICT.
573 Also skips past additional variable records for long string
576 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
577 int *format_warning_cnt)
580 int has_variable_label;
581 int missing_value_code;
586 struct variable *var;
589 width = read_int (r);
590 has_variable_label = read_int (r);
591 missing_value_code = read_int (r);
592 print_format = read_int (r);
593 write_format = read_int (r);
594 read_string (r, name, sizeof name);
595 name[strcspn (name, " ")] = '\0';
597 /* Check variable name. */
598 if (name[0] == '$' || name[0] == '#')
599 sys_error (r, "Variable name begins with invalid character `%c'.",
601 if (!var_is_plausible_name (name, false))
602 sys_error (r, _("Invalid variable name `%s'."), name);
604 /* Create variable. */
605 if (width < 0 || width > 255)
606 sys_error (r, _("Bad width %d for variable %s."), width, name);
607 var = dict_create_var (dict, name, width);
610 _("Duplicate variable name `%s' within system file."),
613 /* Set the short name the same as the long name. */
614 var_set_short_name (var, 0, var_get_name (var));
616 /* Get variable label, if any. */
617 if (has_variable_label != 0 && has_variable_label != 1)
618 sys_error (r, _("Variable label indicator field is not 0 or 1."));
619 if (has_variable_label == 1)
621 size_t len, read_len;
626 /* Read up to 255 bytes of label. */
627 read_len = MIN (sizeof label - 1, len);
628 read_string (r, label, read_len + 1);
629 var_set_label (var, label);
631 /* Skip unread label bytes. */
632 skip_bytes (r, len - read_len);
634 /* Skip label padding up to multiple of 4 bytes. */
635 skip_bytes (r, ROUND_UP (len, 4) - len);
638 /* Set missing values. */
639 if (missing_value_code != 0)
641 struct missing_values mv;
644 mv_init_pool (r->pool, &mv, var_get_width (var));
645 if (var_is_numeric (var))
647 if (missing_value_code < -3 || missing_value_code > 3
648 || missing_value_code == -1)
649 sys_error (r, _("Numeric missing value indicator field is not "
650 "-3, -2, 0, 1, 2, or 3."));
651 if (missing_value_code < 0)
653 double low = read_float (r);
654 double high = read_float (r);
655 mv_add_range (&mv, low, high);
656 missing_value_code = -missing_value_code - 2;
658 for (i = 0; i < missing_value_code; i++)
659 mv_add_num (&mv, read_float (r));
663 int mv_width = MAX (width, 8);
666 if (missing_value_code < 1 || missing_value_code > 3)
667 sys_error (r, _("String missing value indicator field is not "
670 value_init (&value, mv_width);
671 value_set_missing (&value, mv_width);
672 for (i = 0; i < missing_value_code; i++)
674 uint8_t *s = value_str_rw (&value, mv_width);
675 read_bytes (r, s, 8);
678 value_destroy (&value, mv_width);
680 var_set_missing_values (var, &mv);
684 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
685 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
687 /* Account for values.
688 Skip long string continuation records, if any. */
689 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
695 for (i = 1; i < nv; i++)
697 /* Check for record type 2 and width -1. */
698 if (read_int (r) != 2 || read_int (r) != -1)
699 sys_error (r, _("Missing string continuation record."));
701 /* Skip and ignore remaining continuation data. */
702 has_variable_label = read_int (r);
703 missing_value_code = read_int (r);
704 print_format = read_int (r);
705 write_format = read_int (r);
706 read_string (r, name, sizeof name);
708 /* Variable label fields on continuation records have
709 been spotted in system files created by "SPSS Power
710 Macintosh Release 6.1". */
711 if (has_variable_label)
712 skip_bytes (r, ROUND_UP (read_int (r), 4));
717 /* Translates the format spec from sysfile format to internal
720 parse_format_spec (struct sfm_reader *r, unsigned int s,
721 enum which_format which, struct variable *v,
722 int *format_warning_cnt)
724 const int max_format_warnings = 8;
726 uint8_t raw_type = s >> 16;
732 if (!fmt_from_io (raw_type, &f.type))
733 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
738 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
743 if (which == PRINT_FORMAT)
744 var_set_print_format (v, &f);
746 var_set_write_format (v, &f);
748 else if (*++format_warning_cnt <= max_format_warnings)
750 char fmt_string[FMT_STRING_LEN_MAX + 1];
751 sys_warn (r, _("%s variable %s has invalid %s format %s."),
752 var_is_numeric (v) ? _("Numeric") : _("String"),
754 which == PRINT_FORMAT ? _("print") : _("write"),
755 fmt_to_string (&f, fmt_string));
757 if (*format_warning_cnt == max_format_warnings)
758 sys_warn (r, _("Suppressing further invalid format warnings."));
762 /* Sets the weighting variable in DICT to the variable
763 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
766 setup_weight (struct sfm_reader *r, int weight_idx,
767 struct variable **var_by_value_idx, struct dictionary *dict)
771 struct variable *weight_var
772 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
773 if (var_is_numeric (weight_var))
774 dict_set_weight (dict, weight_var);
776 sys_error (r, _("Weighting variable must be numeric."));
780 /* Reads a document record, type 6, from system file R, and sets up
781 the documents and n_documents fields in the associated
784 read_documents (struct sfm_reader *r, struct dictionary *dict)
789 if (dict_get_documents (dict) != NULL)
790 sys_error (r, _("Multiple type 6 (document) records."));
792 line_cnt = read_int (r);
794 sys_error (r, _("Number of document lines (%d) "
795 "must be greater than 0."), line_cnt);
797 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
798 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
799 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
800 dict_set_documents (dict, documents);
802 sys_error (r, _("Document line contains null byte."));
803 pool_free (r->pool, documents);
806 /* Read a type 7 extension record. */
808 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
809 struct sfm_read_info *info)
811 int subtype = read_int (r);
812 size_t size = read_int (r);
813 size_t count = read_int (r);
814 size_t bytes = size * count;
816 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
817 allows an extra byte for a null terminator, used by some
818 extension processing routines. */
819 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
820 sys_error (r, "Record type 7 subtype %d too large.", subtype);
825 read_machine_integer_info (r, size, count, info, dict);
829 read_machine_float_info (r, size, count);
833 /* Variable sets information. We don't use these yet.
834 They only apply to GUIs; see VARSETS on the APPLY
835 DICTIONARY command in SPSS documentation. */
839 /* DATE variable information. We don't use it yet, but we
845 read_mrsets (r, size, count, dict);
849 /* Used by the SPSS Data Entry software. */
853 read_display_parameters (r, size, count, dict);
857 read_long_var_name_map (r, size, count, dict);
861 read_long_string_map (r, size, count, dict);
865 /* Extended number of cases. Not important. */
869 read_data_file_attributes (r, size, count, dict);
873 read_variable_attributes (r, size, count, dict);
877 /* New in SPSS 16. Contains a single string that describes
878 the character encoding, e.g. "windows-1252". */
880 char *encoding = pool_calloc (r->pool, size, count + 1);
881 read_string (r, encoding, count + 1);
882 dict_set_encoding (dict, encoding);
887 /* New in SPSS 16. Encodes value labels for long string
889 read_long_string_value_labels (r, size, count, dict);
893 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"),
894 subtype, PACKAGE_BUGREPORT);
898 skip_bytes (r, bytes);
901 /* Read record type 7, subtype 3. */
903 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
904 struct sfm_read_info *info,
905 struct dictionary *dict)
907 int version_major = read_int (r);
908 int version_minor = read_int (r);
909 int version_revision = read_int (r);
910 int machine_code UNUSED = read_int (r);
911 int float_representation = read_int (r);
912 int compression_code UNUSED = read_int (r);
913 int integer_representation = read_int (r);
914 int character_code = read_int (r);
916 int expected_float_format;
917 int expected_integer_format;
919 if (size != 4 || count != 8)
920 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
924 /* Save version info. */
925 info->version_major = version_major;
926 info->version_minor = version_minor;
927 info->version_revision = version_revision;
929 /* Check floating point format. */
930 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
931 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
932 expected_float_format = 1;
933 else if (r->float_format == FLOAT_Z_LONG)
934 expected_float_format = 2;
935 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
936 expected_float_format = 3;
939 if (float_representation != expected_float_format)
940 sys_error (r, _("Floating-point representation indicated by "
941 "system file (%d) differs from expected (%d)."),
942 r->float_format, expected_float_format);
944 /* Check integer format. */
945 if (r->integer_format == INTEGER_MSB_FIRST)
946 expected_integer_format = 1;
947 else if (r->integer_format == INTEGER_LSB_FIRST)
948 expected_integer_format = 2;
951 if (integer_representation != expected_integer_format)
953 static const char *const endian[] = {N_("Little Endian"), N_("Big Endian")};
954 sys_warn (r, _("Integer format indicated by system file (%s) "
955 "differs from expected (%s)."),
956 gettext (endian[integer_representation == 1]),
957 gettext (endian[expected_integer_format == 1]));
962 Record 7 (20) provides a much more reliable way of
963 setting the encoding.
964 The character_code is used as a fallback only.
966 if ( NULL == dict_get_encoding (dict))
968 switch (character_code)
971 dict_set_encoding (dict, "EBCDIC-US");
975 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
976 respectively. However, there are known to be many files
977 in the wild with character code 2, yet have data which are
979 Therefore we ignore these values.
983 dict_set_encoding (dict, "MS_KANJI");
986 dict_set_encoding (dict, "UTF-7");
989 dict_set_encoding (dict, "UTF-8");
994 snprintf (enc, 100, "CP%d", character_code);
995 dict_set_encoding (dict, enc);
1002 /* Read record type 7, subtype 4. */
1004 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
1006 double sysmis = read_float (r);
1007 double highest = read_float (r);
1008 double lowest = read_float (r);
1010 if (size != 8 || count != 3)
1011 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
1014 if (sysmis != SYSMIS)
1015 sys_warn (r, _("File specifies unexpected value %g as %s."),
1018 if (highest != HIGHEST)
1019 sys_warn (r, _("File specifies unexpected value %g as %s."),
1020 highest, "HIGHEST");
1022 if (lowest != LOWEST)
1023 sys_warn (r, _("File specifies unexpected value %g as %s."),
1027 /* Read record type 7, subtype 7 or 19. */
1029 read_mrsets (struct sfm_reader *r, size_t size, size_t count,
1030 struct dictionary *dict)
1032 struct text_record *text;
1033 struct mrset *mrset;
1035 text = open_text_record (r, size * count);
1038 const char *name, *label, *counted;
1039 struct stringi_set var_names;
1040 size_t allocated_vars;
1044 mrset = xzalloc (sizeof *mrset);
1046 name = text_get_token (text, ss_cstr ("="), NULL);
1049 mrset->name = xstrdup (name);
1051 if (text_match (text, 'C'))
1053 mrset->type = MRSET_MC;
1054 if (!text_match (text, ' '))
1056 sys_warn (r, _("Missing space following 'C' at offset %zu "
1057 "in MRSETS record"), text_pos (text));
1061 else if (text_match (text, 'D'))
1063 mrset->type = MRSET_MD;
1064 mrset->cat_source = MRSET_VARLABELS;
1066 else if (text_match (text, 'E'))
1070 mrset->type = MRSET_MD;
1071 mrset->cat_source = MRSET_COUNTEDVALUES;
1072 if (!text_match (text, ' '))
1074 sys_warn (r, _("Missing space following 'E' at offset %zu "
1075 "in MRSETS record"), text_pos (text));
1079 number = text_get_token (text, ss_cstr (" "), NULL);
1080 if (!strcmp (number, "11"))
1081 mrset->label_from_var_label = true;
1082 else if (strcmp (number, "1"))
1083 sys_warn (r, _("Unexpected label source value \"%s\" "
1084 "following 'E' at offset %zu in MRSETS record"),
1085 number, text_pos (text));
1089 sys_warn (r, _("Missing 'C', 'D', or 'E' at offset %zu "
1090 "in MRSETS record."),
1095 if (mrset->type == MRSET_MD)
1097 counted = text_parse_counted_string (r, text);
1098 if (counted == NULL)
1102 label = text_parse_counted_string (r, text);
1105 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
1107 stringi_set_init (&var_names);
1112 struct variable *var;
1113 const char *var_name;
1115 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1116 if (var_name == NULL)
1118 sys_warn (r, _("Missing new-line parsing variable names "
1119 "at offset %zu in MRSETS record."),
1124 var = lookup_var_by_short_name (dict, var_name);
1127 if (!stringi_set_insert (&var_names, var_name))
1129 sys_warn (r, _("Duplicate variable name %s "
1130 "at offset %zu in MRSETS record."),
1131 var_name, text_pos (text));
1135 if (mrset->label == NULL && mrset->label_from_var_label
1136 && var_has_label (var))
1137 mrset->label = xstrdup (var_get_label (var));
1140 && var_get_type (var) != var_get_type (mrset->vars[0]))
1142 sys_warn (r, _("MRSET %s contains both string and "
1143 "numeric variables."), name);
1146 width = MIN (width, var_get_width (var));
1148 if (mrset->n_vars >= allocated_vars)
1149 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1150 sizeof *mrset->vars);
1151 mrset->vars[mrset->n_vars++] = var;
1153 while (delimiter != '\n');
1155 if (mrset->n_vars < 2)
1157 sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
1159 mrset_destroy (mrset);
1163 if (mrset->type == MRSET_MD)
1165 mrset->width = width;
1166 value_init (&mrset->counted, width);
1168 mrset->counted.f = strtod (counted, NULL);
1170 value_copy_str_rpad (&mrset->counted, width,
1171 (const uint8_t *) counted, ' ');
1174 dict_add_mrset (dict, mrset);
1177 mrset_destroy (mrset);
1178 close_text_record (r, text);
1181 /* Read record type 7, subtype 11, which specifies how variables
1182 should be displayed in GUI environments. */
1184 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
1185 struct dictionary *dict)
1188 bool includes_width;
1189 bool warned = false;
1194 sys_warn (r, _("Bad size %zu on extension 11."), size);
1195 skip_bytes (r, size * count);
1199 n_vars = dict_get_var_cnt (dict);
1200 if (count == 3 * n_vars)
1201 includes_width = true;
1202 else if (count == 2 * n_vars)
1203 includes_width = false;
1206 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
1208 skip_bytes (r, size * count);
1212 for (i = 0; i < n_vars; ++i)
1214 struct variable *v = dict_get_var (dict, i);
1215 int measure = read_int (r);
1216 int width = includes_width ? read_int (r) : 0;
1217 int align = read_int (r);
1219 /* SPSS 14 sometimes seems to set string variables' measure
1221 if (0 == measure && var_is_alpha (v))
1224 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1227 sys_warn (r, _("Invalid variable display parameters "
1228 "for variable %zu (%s). "
1229 "Default parameters substituted."),
1230 i, var_get_name (v));
1235 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1236 : measure == 2 ? MEASURE_ORDINAL
1238 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1239 : align == 1 ? ALIGN_RIGHT
1242 /* Older versions (SPSS 9.0) sometimes set the display
1243 width to zero. This causes confusion in the GUI, so
1244 only set the width if it is nonzero. */
1246 var_set_display_width (v, width);
1250 /* Reads record type 7, subtype 13, which gives the long name
1251 that corresponds to each short name. Modifies variable names
1252 in DICT accordingly. */
1254 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
1255 struct dictionary *dict)
1257 struct text_record *text;
1258 struct variable *var;
1261 text = open_text_record (r, size * count);
1262 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1265 size_t short_name_cnt;
1268 /* Validate long name. */
1269 if (!var_is_valid_name (long_name, false))
1271 sys_warn (r, _("Long variable mapping from %s to invalid "
1272 "variable name `%s'."),
1273 var_get_name (var), long_name);
1277 /* Identify any duplicates. */
1278 if (strcasecmp (var_get_short_name (var, 0), long_name)
1279 && dict_lookup_var (dict, long_name) != NULL)
1281 sys_warn (r, _("Duplicate long variable name `%s' "
1282 "within system file."), long_name);
1286 /* Renaming a variable may clear its short names, but we
1287 want to retain them, so we save them and re-set them
1289 short_name_cnt = var_get_short_name_cnt (var);
1290 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
1291 for (i = 0; i < short_name_cnt; i++)
1293 const char *s = var_get_short_name (var, i);
1294 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1297 /* Set long name. */
1298 dict_rename_var (dict, var, long_name);
1300 /* Restore short names. */
1301 for (i = 0; i < short_name_cnt; i++)
1303 var_set_short_name (var, i, short_names[i]);
1304 free (short_names[i]);
1308 close_text_record (r, text);
1309 r->has_long_var_names = true;
1312 /* Reads record type 7, subtype 14, which gives the real length
1313 of each very long string. Rearranges DICT accordingly. */
1315 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1316 struct dictionary *dict)
1318 struct text_record *text;
1319 struct variable *var;
1322 text = open_text_record (r, size * count);
1323 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1325 size_t idx = var_get_dict_index (var);
1331 length = strtol (length_s, NULL, 10);
1332 if (length < 1 || length > MAX_STRING)
1334 sys_warn (r, _("%s listed as string of invalid length %s "
1335 "in very length string record."),
1336 var_get_name (var), length_s);
1340 /* Check segments. */
1341 segment_cnt = sfm_width_to_segments (length);
1342 if (segment_cnt == 1)
1344 sys_warn (r, _("%s listed in very long string record with width %s, "
1345 "which requires only one segment."),
1346 var_get_name (var), length_s);
1349 if (idx + segment_cnt > dict_get_var_cnt (dict))
1350 sys_error (r, _("Very long string %s overflows dictionary."),
1351 var_get_name (var));
1353 /* Get the short names from the segments and check their
1355 for (i = 0; i < segment_cnt; i++)
1357 struct variable *seg = dict_get_var (dict, idx + i);
1358 int alloc_width = sfm_segment_alloc_width (length, i);
1359 int width = var_get_width (seg);
1362 var_set_short_name (var, i, var_get_short_name (seg, 0));
1363 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1364 sys_error (r, _("Very long string with width %ld has segment %d "
1365 "of width %d (expected %d)"),
1366 length, i, width, alloc_width);
1368 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1369 var_set_width (var, length);
1371 close_text_record (r, text);
1372 dict_compact_values (dict);
1375 /* Reads value labels from sysfile H and inserts them into the
1376 associated dictionary. */
1378 read_value_labels (struct sfm_reader *r,
1379 struct dictionary *dict, struct variable **var_by_value_idx)
1381 struct pool *subpool;
1385 uint8_t raw_value[8]; /* Value as uninterpreted bytes. */
1386 union value value; /* Value. */
1387 char *label; /* Null-terminated label string. */
1390 struct label *labels = NULL;
1391 int label_cnt; /* Number of labels. */
1393 struct variable **var = NULL; /* Associated variables. */
1394 int var_cnt; /* Number of associated variables. */
1395 int max_width; /* Maximum width of string variables. */
1399 subpool = pool_create_subpool (r->pool);
1401 /* Read the type 3 record and record its contents. We can't do
1402 much with the data yet because we don't know whether it is
1403 of numeric or string type. */
1405 /* Read number of labels. */
1406 label_cnt = read_int (r);
1408 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1410 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1415 /* Read each value/label tuple into labels[]. */
1416 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1417 for (i = 0; i < label_cnt; i++)
1419 struct label *label = labels + i;
1420 unsigned char label_len;
1424 read_bytes (r, label->raw_value, sizeof label->raw_value);
1426 /* Read label length. */
1427 read_bytes (r, &label_len, sizeof label_len);
1428 padded_len = ROUND_UP (label_len + 1, 8);
1430 /* Read label, padding. */
1431 label->label = pool_alloc (subpool, padded_len + 1);
1432 read_bytes (r, label->label, padded_len - 1);
1433 label->label[label_len] = 0;
1436 /* Now, read the type 4 record that has the list of variables
1437 to which the value labels are to be applied. */
1439 /* Read record type of type 4 record. */
1440 if (read_int (r) != 4)
1441 sys_error (r, _("Variable index record (type 4) does not immediately "
1442 "follow value label record (type 3) as it should."));
1444 /* Read number of variables associated with value label from type 4
1446 var_cnt = read_int (r);
1447 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1448 sys_error (r, _("Number of variables associated with a value label (%d) "
1449 "is not between 1 and the number of variables (%zu)."),
1450 var_cnt, dict_get_var_cnt (dict));
1452 /* Read the list of variables. */
1453 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1455 for (i = 0; i < var_cnt; i++)
1457 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1458 if (var_get_width (var[i]) > 8)
1459 sys_error (r, _("Value labels may not be added to long string "
1460 "variables (e.g. %s) using records types 3 and 4."),
1461 var_get_name (var[i]));
1462 max_width = MAX (max_width, var_get_width (var[i]));
1465 /* Type check the variables. */
1466 for (i = 1; i < var_cnt; i++)
1467 if (var_get_type (var[i]) != var_get_type (var[0]))
1468 sys_error (r, _("Variables associated with value label are not all of "
1469 "identical type. Variable %s is %s, but variable "
1471 var_get_name (var[0]),
1472 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1473 var_get_name (var[i]),
1474 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1476 /* Fill in labels[].value, now that we know the desired type. */
1477 for (i = 0; i < label_cnt; i++)
1479 struct label *label = labels + i;
1481 value_init_pool (subpool, &label->value, max_width);
1482 if (var_is_alpha (var[0]))
1483 u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
1484 label->raw_value, sizeof label->raw_value, ' ');
1486 label->value.f = float_get_double (r->float_format, label->raw_value);
1489 /* Assign the `value_label's to each variable. */
1490 for (i = 0; i < var_cnt; i++)
1492 struct variable *v = var[i];
1495 /* Add each label to the variable. */
1496 for (j = 0; j < label_cnt; j++)
1498 struct label *label = &labels[j];
1499 if (!var_add_value_label (v, &label->value, label->label))
1501 if (var_is_numeric (var[0]))
1502 sys_warn (r, _("Duplicate value label for %g on %s."),
1503 label->value.f, var_get_name (v));
1505 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1506 max_width, value_str (&label->value, max_width),
1512 pool_destroy (subpool);
1515 /* Reads a set of custom attributes from TEXT into ATTRS.
1516 ATTRS may be a null pointer, in which case the attributes are
1517 read but discarded. */
1519 read_attributes (struct sfm_reader *r, struct text_record *text,
1520 struct attrset *attrs)
1524 struct attribute *attr;
1528 /* Parse the key. */
1529 key = text_get_token (text, ss_cstr ("("), NULL);
1533 attr = attribute_create (key);
1534 for (index = 1; ; index++)
1536 /* Parse the value. */
1540 value = text_get_token (text, ss_cstr ("\n"), NULL);
1543 text_warn (r, text, _("Error parsing attribute value %s[%d]"),
1548 length = strlen (value);
1549 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1551 value[length - 1] = '\0';
1552 attribute_add_value (attr, value + 1);
1557 _("Attribute value %s[%d] is not quoted: %s"),
1559 attribute_add_value (attr, value);
1562 /* Was this the last value for this attribute? */
1563 if (text_match (text, ')'))
1567 attrset_add (attrs, attr);
1569 attribute_destroy (attr);
1571 while (!text_match (text, '/'));
1574 /* Reads record type 7, subtype 17, which lists custom
1575 attributes on the data file. */
1577 read_data_file_attributes (struct sfm_reader *r,
1578 size_t size, size_t count,
1579 struct dictionary *dict)
1581 struct text_record *text = open_text_record (r, size * count);
1582 read_attributes (r, text, dict_get_attributes (dict));
1583 close_text_record (r, text);
1587 skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
1591 for (i = 0; i < n_labels; i++)
1593 size_t value_length, label_length;
1595 value_length = read_int (r);
1596 skip_bytes (r, value_length);
1597 label_length = read_int (r);
1598 skip_bytes (r, label_length);
1603 read_long_string_value_labels (struct sfm_reader *r,
1604 size_t size, size_t count,
1605 struct dictionary *d)
1607 const off_t start = ftello (r->file);
1608 while (ftello (r->file) - start < size * count)
1610 char var_name[VAR_NAME_LEN + 1];
1618 var_name_len = read_int (r);
1619 if (var_name_len > VAR_NAME_LEN)
1620 sys_error (r, _("Variable name length in long string value label "
1621 "record (%d) exceeds %d-byte limit."),
1622 var_name_len, VAR_NAME_LEN);
1623 read_string (r, var_name, var_name_len + 1);
1624 width = read_int (r);
1625 n_labels = read_int (r);
1627 v = dict_lookup_var (d, var_name);
1630 sys_warn (r, _("Ignoring long string value record for "
1631 "unknown variable %s."), var_name);
1632 skip_long_string_value_labels (r, n_labels);
1635 if (var_is_numeric (v))
1637 sys_warn (r, _("Ignoring long string value record for "
1638 "numeric variable %s."), var_name);
1639 skip_long_string_value_labels (r, n_labels);
1642 if (width != var_get_width (v))
1644 sys_warn (r, _("Ignoring long string value record for variable %s "
1645 "because the record's width (%d) does not match the "
1646 "variable's width (%d)"),
1647 var_name, width, var_get_width (v));
1648 skip_long_string_value_labels (r, n_labels);
1653 value_init_pool (r->pool, &value, width);
1654 for (i = 0; i < n_labels; i++)
1656 size_t value_length, label_length;
1661 value_length = read_int (r);
1662 if (value_length == width)
1663 read_bytes (r, value_str_rw (&value, width), width);
1666 sys_warn (r, _("Ignoring long string value %zu for variable %s, "
1667 "with width %d, that has bad value width %zu."),
1668 i, var_get_name (v), width, value_length);
1669 skip_bytes (r, value_length);
1674 label_length = read_int (r);
1675 read_string (r, label, MIN (sizeof label, label_length + 1));
1676 if (label_length >= sizeof label)
1678 /* Skip and silently ignore label text after the
1679 first 255 bytes. The maximum documented length
1680 of a label is 120 bytes so this is more than
1682 skip_bytes (r, (label_length + 1) - sizeof label);
1685 if (!skip && !var_add_value_label (v, &value, label))
1686 sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
1687 width, value_str (&value, width), var_get_name (v));
1693 /* Reads record type 7, subtype 18, which lists custom
1694 attributes on individual variables. */
1696 read_variable_attributes (struct sfm_reader *r,
1697 size_t size, size_t count,
1698 struct dictionary *dict)
1700 struct text_record *text = open_text_record (r, size * count);
1703 struct variable *var;
1704 if (!text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1706 read_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1708 close_text_record (r, text);
1714 static void partial_record (struct sfm_reader *r)
1717 static void read_error (struct casereader *, const struct sfm_reader *);
1719 static bool read_case_number (struct sfm_reader *, double *);
1720 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1721 static int read_opcode (struct sfm_reader *);
1722 static bool read_compressed_number (struct sfm_reader *, double *);
1723 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1724 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1725 static bool skip_whole_strings (struct sfm_reader *, size_t);
1727 /* Reads and returns one case from READER's file. Returns a null
1728 pointer if not successful. */
1729 static struct ccase *
1730 sys_file_casereader_read (struct casereader *reader, void *r_)
1732 struct sfm_reader *r = r_;
1733 struct ccase *volatile c;
1739 c = case_create (r->proto);
1740 if (setjmp (r->bail_out))
1742 casereader_force_error (reader);
1747 for (i = 0; i < r->sfm_var_cnt; i++)
1749 struct sfm_var *sv = &r->sfm_vars[i];
1750 union value *v = case_data_rw_idx (c, sv->case_index);
1752 if (sv->var_width == 0)
1754 if (!read_case_number (r, &v->f))
1759 uint8_t *s = value_str_rw (v, sv->var_width);
1760 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1762 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1772 if (r->case_cnt != -1)
1773 read_error (reader, r);
1777 /* Issues an error that R ends in a partial record. */
1779 partial_record (struct sfm_reader *r)
1781 sys_error (r, _("File ends in partial case."));
1784 /* Issues an error that an unspecified error occurred SFM, and
1787 read_error (struct casereader *r, const struct sfm_reader *sfm)
1789 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1790 casereader_force_error (r);
1793 /* Reads a number from R and stores its value in *D.
1794 If R is compressed, reads a compressed number;
1795 otherwise, reads a number in the regular way.
1796 Returns true if successful, false if end of file is
1797 reached immediately. */
1799 read_case_number (struct sfm_reader *r, double *d)
1804 if (!try_read_bytes (r, number, sizeof number))
1806 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1810 return read_compressed_number (r, d);
1813 /* Reads LENGTH string bytes from R into S.
1814 Always reads a multiple of 8 bytes; if LENGTH is not a
1815 multiple of 8, then extra bytes are read and discarded without
1817 Reads compressed strings if S is compressed.
1818 Returns true if successful, false if end of file is
1819 reached immediately. */
1821 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
1823 size_t whole = ROUND_DOWN (length, 8);
1824 size_t partial = length % 8;
1828 if (!read_whole_strings (r, s, whole))
1835 if (!read_whole_strings (r, bounce, sizeof bounce))
1841 memcpy (s + whole, bounce, partial);
1847 /* Reads and returns the next compression opcode from R. */
1849 read_opcode (struct sfm_reader *r)
1851 assert (r->compressed);
1855 if (r->opcode_idx >= sizeof r->opcodes)
1857 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1861 opcode = r->opcodes[r->opcode_idx++];
1868 /* Reads a compressed number from R and stores its value in D.
1869 Returns true if successful, false if end of file is
1870 reached immediately. */
1872 read_compressed_number (struct sfm_reader *r, double *d)
1874 int opcode = read_opcode (r);
1882 *d = read_float (r);
1886 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1887 if (!r->corruption_warning)
1889 r->corruption_warning = true;
1890 sys_warn (r, _("Possible compressed data corruption: "
1891 "compressed spaces appear in numeric field."));
1900 *d = opcode - r->bias;
1907 /* Reads a compressed 8-byte string segment from R and stores it
1909 Returns true if successful, false if end of file is
1910 reached immediately. */
1912 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
1914 int opcode = read_opcode (r);
1922 read_bytes (r, dst, 8);
1926 memset (dst, ' ', 8);
1931 double value = opcode - r->bias;
1932 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1935 /* This has actually been seen "in the wild". The submitter of the
1936 file that showed that the contents decoded as spaces, but they
1937 were at the end of the field so it's possible that the null
1938 bytes just acted as null terminators. */
1940 else if (!r->corruption_warning)
1942 r->corruption_warning = true;
1943 sys_warn (r, _("Possible compressed data corruption: "
1944 "string contains compressed integer (opcode %d)"),
1954 /* Reads LENGTH string bytes from R into S.
1955 LENGTH must be a multiple of 8.
1956 Reads compressed strings if S is compressed.
1957 Returns true if successful, false if end of file is
1958 reached immediately. */
1960 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
1962 assert (length % 8 == 0);
1964 return try_read_bytes (r, s, length);
1968 for (ofs = 0; ofs < length; ofs += 8)
1969 if (!read_compressed_string (r, s + ofs))
1979 /* Skips LENGTH string bytes from R.
1980 LENGTH must be a multiple of 8.
1981 (LENGTH is also limited to 1024, but that's only because the
1982 current caller never needs more than that many bytes.)
1983 Returns true if successful, false if end of file is
1984 reached immediately. */
1986 skip_whole_strings (struct sfm_reader *r, size_t length)
1988 uint8_t buffer[1024];
1989 assert (length < sizeof buffer);
1990 return read_whole_strings (r, buffer, length);
1993 /* Creates and returns a table that can be used for translating a value
1994 index into a case to a "struct variable *" for DICT. Multiple
1995 system file fields reference variables this way.
1997 This table must be created before processing the very long
1998 string extension record, because that record causes some
1999 values to be deleted from the case and the dictionary to be
2001 static struct variable **
2002 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
2004 struct variable **var_by_value_idx;
2008 var_by_value_idx = pool_nmalloc (r->pool,
2009 r->oct_cnt, sizeof *var_by_value_idx);
2010 for (i = 0; i < dict_get_var_cnt (dict); i++)
2012 struct variable *v = dict_get_var (dict, i);
2013 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
2016 var_by_value_idx[value_idx++] = v;
2017 for (j = 1; j < nv; j++)
2018 var_by_value_idx[value_idx++] = NULL;
2020 assert (value_idx == r->oct_cnt);
2022 return var_by_value_idx;
2025 /* Returns the "struct variable" corresponding to the given
2026 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
2028 static struct variable *
2029 lookup_var_by_value_idx (struct sfm_reader *r,
2030 struct variable **var_by_value_idx, int value_idx)
2032 struct variable *var;
2034 if (value_idx < 1 || value_idx > r->oct_cnt)
2035 sys_error (r, _("Variable index %d not in valid range 1...%d."),
2036 value_idx, r->oct_cnt);
2038 var = var_by_value_idx[value_idx - 1];
2040 sys_error (r, _("Variable index %d refers to long string "
2047 /* Returns the variable in D with the given SHORT_NAME,
2048 or a null pointer if there is none. */
2049 static struct variable *
2050 lookup_var_by_short_name (struct dictionary *d, const char *short_name)
2052 struct variable *var;
2056 /* First try looking up by full name. This often succeeds. */
2057 var = dict_lookup_var (d, short_name);
2058 if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
2061 /* Iterate through the whole dictionary as a fallback. */
2062 var_cnt = dict_get_var_cnt (d);
2063 for (i = 0; i < var_cnt; i++)
2065 var = dict_get_var (d, i);
2066 if (!strcasecmp (var_get_short_name (var, 0), short_name))
2073 /* Helpers for reading records that contain structured text
2076 /* Maximum number of warnings to issue for a single text
2078 #define MAX_TEXT_WARNINGS 5
2083 struct substring buffer; /* Record contents. */
2084 size_t pos; /* Current position in buffer. */
2085 int n_warnings; /* Number of warnings issued or suppressed. */
2088 /* Reads SIZE bytes into a text record for R,
2089 and returns the new text record. */
2090 static struct text_record *
2091 open_text_record (struct sfm_reader *r, size_t size)
2093 struct text_record *text = pool_alloc (r->pool, sizeof *text);
2094 char *buffer = pool_malloc (r->pool, size + 1);
2095 read_bytes (r, buffer, size);
2096 text->buffer = ss_buffer (buffer, size);
2098 text->n_warnings = 0;
2102 /* Closes TEXT, frees its storage, and issues a final warning
2103 about suppressed warnings if necesary. */
2105 close_text_record (struct sfm_reader *r, struct text_record *text)
2107 if (text->n_warnings > MAX_TEXT_WARNINGS)
2108 sys_warn (r, _("Suppressed %d additional related warnings."),
2109 text->n_warnings - MAX_TEXT_WARNINGS);
2110 pool_free (r->pool, ss_data (text->buffer));
2113 /* Reads a variable=value pair from TEXT.
2114 Looks up the variable in DICT and stores it into *VAR.
2115 Stores a null-terminated value into *VALUE. */
2117 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2118 struct text_record *text,
2119 struct variable **var, char **value)
2123 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2126 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2130 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2131 ss_buffer ("\t\0", 2));
2139 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2140 struct text_record *text, struct substring delimiters,
2141 struct variable **var)
2145 name = text_get_token (text, delimiters, NULL);
2149 *var = dict_lookup_var (dict, name);
2153 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2160 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2161 struct text_record *text, struct substring delimiters,
2162 struct variable **var)
2164 char *short_name = text_get_token (text, delimiters, NULL);
2165 if (short_name == NULL)
2168 *var = lookup_var_by_short_name (dict, short_name);
2170 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2175 /* Displays a warning for the current file position, limiting the
2176 number to MAX_TEXT_WARNINGS for TEXT. */
2178 text_warn (struct sfm_reader *r, struct text_record *text,
2179 const char *format, ...)
2181 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2185 va_start (args, format);
2186 sys_msg (r, MW, format, args);
2192 text_get_token (struct text_record *text, struct substring delimiters,
2195 struct substring token;
2198 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2201 end = &ss_data (token)[ss_length (token)];
2202 if (delimiter != NULL)
2205 return ss_data (token);
2208 /* Reads a integer value expressed in decimal, then a space, then a string that
2209 consists of exactly as many bytes as specified by the integer, then a space,
2210 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2211 buffer (so the caller should not free the string). */
2213 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2223 int c = text->buffer.string[text->pos];
2224 if (c < '0' || c > '9')
2226 n = (n * 10) + (c - '0');
2229 if (start == text->pos)
2231 sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
2236 if (!text_match (text, ' '))
2238 sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
2243 if (text->pos + n > text->buffer.length)
2245 sys_warn (r, _("%zu-byte string starting at offset %zu "
2246 "exceeds record length %zu."),
2247 n, text->pos, text->buffer.length);
2251 s = &text->buffer.string[text->pos];
2255 _("Expecting space at offset %zu following %zu-byte string."),
2265 text_match (struct text_record *text, char c)
2267 if (text->buffer.string[text->pos] == c)
2276 /* Returns the current byte offset inside the TEXT's string. */
2278 text_pos (const struct text_record *text)
2285 /* Displays a corruption message. */
2287 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
2292 ds_init_empty (&text);
2293 ds_put_format (&text, "\"%s\" near offset 0x%llx: ",
2294 fh_get_file_name (r->fh), (long long int) ftello (r->file));
2295 ds_put_vformat (&text, format, args);
2297 m.category = msg_class_to_category (class);
2298 m.severity = msg_class_to_severity (class);
2299 m.where.file_name = NULL;
2300 m.where.line_number = 0;
2301 m.text = ds_cstr (&text);
2306 /* Displays a warning for the current file position. */
2308 sys_warn (struct sfm_reader *r, const char *format, ...)
2312 va_start (args, format);
2313 sys_msg (r, MW, format, args);
2317 /* Displays an error for the current file position,
2318 marks it as in an error state,
2319 and aborts reading it using longjmp. */
2321 sys_error (struct sfm_reader *r, const char *format, ...)
2325 va_start (args, format);
2326 sys_msg (r, ME, format, args);
2330 longjmp (r->bail_out, 1);
2333 /* Reads BYTE_CNT bytes into BUF.
2334 Returns true if exactly BYTE_CNT bytes are successfully read.
2335 Aborts if an I/O error or a partial read occurs.
2336 If EOF_IS_OK, then an immediate end-of-file causes false to be
2337 returned; otherwise, immediate end-of-file causes an abort
2340 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2341 void *buf, size_t byte_cnt)
2343 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2344 if (bytes_read == byte_cnt)
2346 else if (ferror (r->file))
2347 sys_error (r, _("System error: %s."), strerror (errno));
2348 else if (!eof_is_ok || bytes_read != 0)
2349 sys_error (r, _("Unexpected end of file."));
2354 /* Reads BYTE_CNT into BUF.
2355 Aborts upon I/O error or if end-of-file is encountered. */
2357 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2359 read_bytes_internal (r, false, buf, byte_cnt);
2362 /* Reads BYTE_CNT bytes into BUF.
2363 Returns true if exactly BYTE_CNT bytes are successfully read.
2364 Returns false if an immediate end-of-file is encountered.
2365 Aborts if an I/O error or a partial read occurs. */
2367 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2369 return read_bytes_internal (r, true, buf, byte_cnt);
2372 /* Reads a 32-bit signed integer from R and returns its value in
2375 read_int (struct sfm_reader *r)
2378 read_bytes (r, integer, sizeof integer);
2379 return integer_get (r->integer_format, integer, sizeof integer);
2382 /* Reads a 64-bit floating-point number from R and returns its
2383 value in host format. */
2385 read_float (struct sfm_reader *r)
2388 read_bytes (r, number, sizeof number);
2389 return float_get_double (r->float_format, number);
2392 /* Reads exactly SIZE - 1 bytes into BUFFER
2393 and stores a null byte into BUFFER[SIZE - 1]. */
2395 read_string (struct sfm_reader *r, char *buffer, size_t size)
2398 read_bytes (r, buffer, size - 1);
2399 buffer[size - 1] = '\0';
2402 /* Skips BYTES bytes forward in R. */
2404 skip_bytes (struct sfm_reader *r, size_t bytes)
2409 size_t chunk = MIN (sizeof buffer, bytes);
2410 read_bytes (r, buffer, chunk);
2415 static const struct casereader_class sys_file_casereader_class =
2417 sys_file_casereader_read,
2418 sys_file_casereader_destroy,