1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/i18n.h"
46 #include "libpspp/message.h"
47 #include "libpspp/misc.h"
48 #include "libpspp/pool.h"
49 #include "libpspp/str.h"
50 #include "libpspp/stringi-set.h"
52 #include "gl/c-ctype.h"
53 #include "gl/inttostr.h"
54 #include "gl/minmax.h"
55 #include "gl/unlocked-io.h"
56 #include "gl/xalloc.h"
// Message helpers: _ wraps its argument in a gettext call, while N_
// leaves the message id unchanged.
60 #define _(msgid) gettext (msgid)
61 #define N_(msgid) (msgid)
// Fields of the system file reader state.  They cover resource tracking,
// the open file and its lock, the on-disk format, and decompression state.
// NOTE(review): the struct opening and closing lines are not visible in
// this excerpt.
63 /* System file reader. */
66 /* Resource tracking. */
67 struct pool *pool; /* All system file state. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
// File handle, its lock, the stream, and the sticky error flag.
71 struct file_handle *fh; /* File handle. */
72 struct fh_lock *lock; /* Mutual exclusion for file handle. */
73 FILE *file; /* File stream. */
74 bool error; /* I/O or corruption error? */
75 struct caseproto *proto; /* Format of output cases. */
// On-disk format details filled in while the dictionary is read.
78 enum integer_format integer_format; /* On-disk integer format. */
79 enum float_format float_format; /* On-disk floating point format. */
80 int oct_cnt; /* Number of 8-byte units per case. */
81 struct sfm_var *sfm_vars; /* Variables. */
82 size_t sfm_var_cnt; /* Number of variables. */
// A case_cnt of -1 is mapped to CASENUMBER_MAX when the casereader is
// created in sfm_open_reader.
83 casenumber case_cnt; /* Number of cases */
84 bool has_long_var_names; /* File has a long variable name map */
// Decompression state; opcode_idx equal to the size of opcodes means
// that no opcodes are left in the current block.
87 bool compressed; /* File is compressed? */
88 double bias; /* Compression bias, usually 100.0. */
89 uint8_t opcodes[8]; /* Current block of opcodes. */
90 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
91 bool corruption_warning; /* Warned about possible corruption? */
// Forward declarations for the casereader class and the low-level helpers
// used by the dictionary reader below.
// NOTE(review): several declarations are cut short in this excerpt; their
// remaining parameters are not visible here.
94 static const struct casereader_class sys_file_casereader_class;
96 static bool close_reader (struct sfm_reader *);
// Lookup helpers that map value indexes and short names to variables.
98 static struct variable **make_var_by_value_idx (struct sfm_reader *,
100 static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
103 static struct variable *lookup_var_by_short_name (struct dictionary *,
104 const char *short_name);
// Message reporting helpers; all take printf-style format arguments.
106 static void sys_msg (struct sfm_reader *r, int class,
107 const char *format, va_list args)
108 PRINTF_FORMAT (3, 0);
109 static void sys_warn (struct sfm_reader *, const char *, ...)
110 PRINTF_FORMAT (2, 3);
111 static void sys_error (struct sfm_reader *, const char *, ...)
// Primitive readers for raw bytes, integers, floats, and strings.
115 static void read_bytes (struct sfm_reader *, void *, size_t);
116 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
117 static int read_int (struct sfm_reader *);
118 static double read_float (struct sfm_reader *);
119 static void read_string (struct sfm_reader *, char *, size_t);
120 static void skip_bytes (struct sfm_reader *, size_t);
// Helpers for extension records whose payload is parsed as text.
122 static struct text_record *open_text_record (struct sfm_reader *, size_t size);
123 static void close_text_record (struct sfm_reader *r,
124 struct text_record *);
125 static bool read_variable_to_value_pair (struct sfm_reader *,
127 struct text_record *,
128 struct variable **var, char **value);
129 static void text_warn (struct sfm_reader *r, struct text_record *text,
130 const char *format, ...)
131 PRINTF_FORMAT (3, 4);
132 static char *text_get_token (struct text_record *,
133 struct substring delimiters, char *delimiter);
134 static bool text_match (struct text_record *, char c);
135 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
136 struct text_record *,
137 struct substring delimiters,
139 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
140 struct text_record *,
141 struct substring delimiters,
143 static const char *text_parse_counted_string (struct sfm_reader *,
144 struct text_record *);
145 static size_t text_pos (const struct text_record *);
// NOTE(review): close_reader is declared a second time here; the first
// declaration appears near the top of this group.
147 static bool close_reader (struct sfm_reader *r);
// Prototypes for the functions that read the file header, the variable
// records, and each kind of dictionary and extension record.
// NOTE(review): one declaration below is cut short in this excerpt.
149 /* Dictionary reader. */
157 static void read_header (struct sfm_reader *, struct dictionary *,
158 int *weight_idx, int *claimed_oct_cnt,
159 struct sfm_read_info *);
160 static void read_variable_record (struct sfm_reader *, struct dictionary *,
161 int *format_warning_cnt);
162 static void parse_format_spec (struct sfm_reader *, unsigned int,
163 enum which_format, struct variable *,
164 int *format_warning_cnt);
165 static void setup_weight (struct sfm_reader *, int weight_idx,
166 struct variable **var_by_value_idx,
167 struct dictionary *);
168 static void read_documents (struct sfm_reader *, struct dictionary *);
169 static void read_value_labels (struct sfm_reader *, struct dictionary *,
170 struct variable **var_by_value_idx);
// Type 7 extension record and the per-subtype readers it dispatches to.
// Each subtype reader receives the element size and count from the
// extension record header.
172 static void read_extension_record (struct sfm_reader *, struct dictionary *,
173 struct sfm_read_info *);
174 static void read_machine_integer_info (struct sfm_reader *,
175 size_t size, size_t count,
176 struct sfm_read_info *,
179 static void read_machine_float_info (struct sfm_reader *,
180 size_t size, size_t count);
181 static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
182 struct dictionary *);
183 static void read_display_parameters (struct sfm_reader *,
184 size_t size, size_t count,
185 struct dictionary *);
186 static void read_long_var_name_map (struct sfm_reader *,
187 size_t size, size_t count,
188 struct dictionary *);
189 static void read_long_string_map (struct sfm_reader *,
190 size_t size, size_t count,
191 struct dictionary *);
192 static void read_data_file_attributes (struct sfm_reader *,
193 size_t size, size_t count,
194 struct dictionary *);
195 static void read_variable_attributes (struct sfm_reader *,
196 size_t size, size_t count,
197 struct dictionary *);
198 static void read_long_string_value_labels (struct sfm_reader *,
199 size_t size, size_t count,
200 struct dictionary *);
// Recodes the textual parts of every variable in DICT to UTF8: the
// variable name, the variable label, and each value label.
// NOTE(review): several lines of this function are not visible in this
// excerpt, including the condition that selects the default encoding and
// any release of the strings returned by recode_string.
202 /* Convert all the strings in DICT from the dict encoding to UTF8 */
204 recode_strings (struct dictionary *dict)
208 const char *enc = dict_get_encoding (dict);
// Fallback source encoding; presumably used only when the dictionary has
// no encoding of its own (TODO confirm against the hidden condition).
211 enc = get_default_encoding ();
213 for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
215 /* Convert the long variable name */
216 struct variable *var = dict_get_var (dict, i);
217 const char *native_name = var_get_name (var);
218 char *utf8_name = recode_string (UTF8, enc, native_name, -1);
// Only touch the dictionary when recoding actually changed the name.
219 if ( 0 != strcmp (utf8_name, native_name))
// Rename when the recoded name is still free in DICT; the message below
// belongs to the branch taken when it collides with an existing name.
221 if ( NULL == dict_lookup_var (dict, utf8_name))
222 dict_rename_var (dict, var, utf8_name);
225 _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
230 /* Convert the variable label */
231 if (var_has_label (var))
233 char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
234 var_set_label (var, utf8_label);
// Recode every value label in place, keeping each label attached to the
// same value.
238 if (var_has_value_labels (var))
240 const struct val_lab *vl = NULL;
241 const struct val_labs *vlabs = var_get_value_labels (var);
243 for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
245 const union value *val = val_lab_get_value (vl);
246 const char *label = val_lab_get_label (vl);
247 char *new_label = NULL;
249 new_label = recode_string (UTF8, enc, label, -1);
251 var_replace_value_label (var, val, new_label);
// Entry point for reading a system file.  Creates the reader state, reads
// the header and every dictionary record into *DICT, and finally returns
// a sequential casereader over the data.
// NOTE(review): the return type line, several local declarations, and a
// number of statements are not visible in this excerpt.
258 /* Opens the system file designated by file handle FH for
259 reading. Reads the system file's dictionary into *DICT.
260 If INFO is non-null, then it receives additional info about the
263 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
264 struct sfm_read_info *volatile info)
266 struct sfm_reader *volatile r = NULL;
267 struct variable **var_by_value_idx;
268 struct sfm_read_info local_info;
269 int format_warning_cnt = 0;
274 *dict = dict_create ();
276 /* Create and initialize reader. */
277 r = pool_create_container (struct sfm_reader, pool);
283 r->has_long_var_names = false;
// Start with the opcode block marked as exhausted.
284 r->opcode_idx = sizeof r->opcodes;
285 r->corruption_warning = false;
287 /* TRANSLATORS: this fragment will be interpolated into
288 messages in fh_lock() that identify types of files. */
289 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
293 r->file = fn_open (fh_get_file_name (fh), "rb");
296 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
297 fh_get_file_name (r->fh), strerror (errno));
301 /* Initialize info. */
304 memset (info, 0, sizeof *info);
// bail_out is described in the reader state as the longjmp target for
// error handling, so a nonzero result here means a read error was raised.
306 if (setjmp (r->bail_out))
311 read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
313 /* Read all the variable definition records. */
314 rec_type = read_int (r);
315 while (rec_type == 2)
317 read_variable_record (r, *dict, &format_warning_cnt);
318 rec_type = read_int (r);
321 /* Figure out the case format. */
322 var_by_value_idx = make_var_by_value_idx (r, *dict);
323 setup_weight (r, weight_idx, var_by_value_idx, *dict);
325 /* Read all the rest of the dictionary records. */
// Records are dispatched on their type until type 999 is seen.
326 while (rec_type != 999)
331 read_value_labels (r, *dict, var_by_value_idx);
335 sys_error (r, _("Misplaced type 4 record."));
338 read_documents (r, *dict);
342 read_extension_record (r, *dict, info);
346 sys_error (r, _("Unrecognized record type %d."), rec_type);
348 rec_type = read_int (r);
// No long variable name map was read: derive a lowercase long name from
// each variable name and keep the original spelling as the short name.
352 if ( ! r->has_long_var_names )
355 for (i = 0; i < dict_get_var_cnt (*dict); i++)
357 struct variable *var = dict_get_var (*dict, i);
358 char short_name[SHORT_NAME_LEN + 1];
359 char long_name[SHORT_NAME_LEN + 1];
// NOTE(review): unbounded copy into a buffer of SHORT_NAME_LEN + 1 bytes;
// assumes names read from variable records never exceed that length.
361 strcpy (short_name, var_get_name (var));
363 strcpy (long_name, short_name);
364 str_lowercase (long_name);
366 /* Set long name. Renaming a variable may clear the short
367 name, but we want to retain it, so re-set it
369 dict_rename_var (*dict, var, long_name);
370 var_set_short_name (var, 0, short_name);
373 r->has_long_var_names = true;
376 recode_strings (*dict);
378 /* Read record 999 data, which is just filler. */
381 /* Warn if the actual amount of data per case differs from the
382 amount that the header claims. SPSS version 13 gets this
383 wrong when very long strings are involved, so don't warn in
// A claimed count of -1 means read_header rejected the header value, so
// no comparison is made in that case.
385 if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
386 && info->version_major != 13)
387 sys_warn (r, _("File header claims %d variable positions but "
388 "%d were read from file."),
389 claimed_oct_cnt, r->oct_cnt);
391 /* Create an index of dictionary variable widths for
392 sfm_read_case to use. We cannot use the `struct variable's
393 from the dictionary we created, because the caller owns the
394 dictionary and may destroy or modify its variables. */
395 sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
// The variable index is registered with the pool so that free is called
// on it by the pool.
396 pool_register (r->pool, free, r->sfm_vars);
397 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
399 pool_free (r->pool, var_by_value_idx);
400 return casereader_create_sequential
402 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
403 &sys_file_casereader_class, r);
// Presumably the error path reached after a failed read; the partially
// built dictionary is destroyed here (TODO confirm, label not visible).
407 dict_destroy (*dict);
// Closes the file behind reader R, reports a close failure, and destroys
// the pool that holds the reader state.
// NOTE(review): several lines of this function, including its return
// statement and any guards around the close, are not visible here.
412 /* Closes a system file after we're done with it.
413 Returns true if an I/O error has occurred on READER, false
416 close_reader (struct sfm_reader *r)
// fn_close returning EOF is treated as a failure and reported with the
// file name and the errno text.
425 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
427 msg (ME, _("Error closing system file `%s': %s."),
428 fh_get_file_name (r->fh), strerror (errno));
// The reader is created as a pool container in sfm_open_reader, so
// destroying the pool presumably releases R itself as well (TODO confirm).
438 pool_destroy (r->pool);
// Casereader destroy callback.  READER itself is unused; the reader state
// arrives through the untyped R_ argument.
// NOTE(review): the rest of the body is not visible in this excerpt.
443 /* Destroys READER. */
445 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
447 struct sfm_reader *r = r_;
/* Reports whether FILE looks like an SPSS system file.

   Reads the first four bytes from FILE and returns true only when
   they are exactly the signature $FL2.  Returns false when four bytes
   cannot be read or when they differ from the signature. */
bool
sfm_detect (FILE *file)
{
  char magic[4];

  if (fread (magic, sizeof magic, 1, file) != 1)
    return false;

  return memcmp (magic, "$FL2", sizeof magic) == 0;
}
// Reads the fixed-layout file header: record type, product string, layout
// code, claimed case size, compression flag, weight index, case count,
// compression bias, creation date and time, and file label.
// NOTE(review): some local declarations and statements of this function
// are not visible in this excerpt.
465 /* Reads the global header of the system file.
466 Sets DICT's file label to the system file's label.
467 Sets *WEIGHT_IDX to 0 if the system file is unweighted,
468 or to the value index of the weight variable otherwise.
469 Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
470 per case that the file claims to have (although it is not
472 Initializes INFO with header information. */
474 read_header (struct sfm_reader *r, struct dictionary *dict,
475 int *weight_idx, int *claimed_oct_cnt,
476 struct sfm_read_info *info)
479 char eye_catcher[61];
480 uint8_t raw_layout_code[4];
482 char creation_date[10];
483 char creation_time[9];
485 struct substring file_label_ss;
486 struct substring product;
488 read_string (r, rec_type, sizeof rec_type);
489 read_string (r, eye_catcher, sizeof eye_catcher);
491 if (strcmp ("$FL2", rec_type) != 0)
492 sys_error (r, _("This is not an SPSS system file."));
494 /* Identify integer format. */
// The layout code is tried as the value 2 and then as the value 3; only
// the two byte orders named below are accepted.
495 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
496 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
498 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
500 || (r->integer_format != INTEGER_MSB_FIRST
501 && r->integer_format != INTEGER_LSB_FIRST))
502 sys_error (r, _("This is not an SPSS system file."));
// A negative or very large claimed count is recorded as -1, which makes
// sfm_open_reader skip its comparison against the count actually read.
504 *claimed_oct_cnt = read_int (r);
505 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
506 *claimed_oct_cnt = -1;
508 r->compressed = read_int (r) != 0;
510 *weight_idx = read_int (r);
512 r->case_cnt = read_int (r);
513 if ( r->case_cnt > INT_MAX / 2)
517 /* Identify floating-point format and obtain compression bias. */
// float_identify is asked to recognise the value 100.0 in the raw bias
// bytes; a zero result selects the fallback handling below.
518 read_bytes (r, raw_bias, sizeof raw_bias);
519 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
521 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
523 if (memcmp (raw_bias, zero_bias, 8))
524 sys_warn (r, _("Compression bias is not the usual "
525 "value of 100, or system file uses unrecognized "
526 "floating-point format."));
529 /* Some software is known to write all-zeros to this
530 field. Such software also writes floating-point
531 numbers in the format that we expect by default
532 (it seems that all software most likely does, in
533 reality), so don't warn in this case. */
// Fallback: assume IEEE doubles in the same byte order as the integers.
536 if (r->integer_format == INTEGER_MSB_FIRST)
537 r->float_format = FLOAT_IEEE_DOUBLE_BE;
539 r->float_format = FLOAT_IEEE_DOUBLE_LE;
541 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
543 read_string (r, creation_date, sizeof creation_date);
544 read_string (r, creation_time, sizeof creation_time);
545 read_string (r, file_label, sizeof file_label);
// Trim spaces from the label and store it only when something remains.
548 file_label_ss = ss_cstr (file_label);
549 ss_trim (&file_label_ss, ss_cstr (" "));
550 if (!ss_is_empty (file_label_ss))
552 ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
553 dict_set_label (dict, ss_data (file_label_ss));
// NOTE(review): unbounded copies; these assume the destination fields in
// INFO are at least as large as the local buffers (TODO confirm).
556 strcpy (info->creation_date, creation_date);
557 strcpy (info->creation_time, creation_time);
558 info->integer_format = r->integer_format;
559 info->float_format = r->float_format;
560 info->compressed = r->compressed;
561 info->case_cnt = r->case_cnt;
// The product name is the eye catcher with the standard prefix and the
// surrounding spaces removed, truncated to fit the destination field.
563 product = ss_cstr (eye_catcher);
564 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
565 ss_trim (&product, ss_cstr (" "));
566 str_copy_buf_trunc (info->product, sizeof info->product,
567 ss_data (product), ss_length (product));
// Reads one variable record: width, label flag, missing value code, print
// and write formats, and name.  Creates the variable in DICT, attaches
// its label, missing values, and formats, then consumes the continuation
// records that follow a string wider than 8 bytes.
// NOTE(review): some declarations and statements of this function are
// not visible in this excerpt.
570 /* Reads a variable (type 2) record from R and adds the
571 corresponding variable to DICT.
572 Also skips past additional variable records for long string
575 read_variable_record (struct sfm_reader *r, struct dictionary *dict,
576 int *format_warning_cnt)
579 int has_variable_label;
580 int missing_value_code;
585 struct variable *var;
588 width = read_int (r);
589 has_variable_label = read_int (r);
590 missing_value_code = read_int (r);
591 print_format = read_int (r);
592 write_format = read_int (r);
593 read_string (r, name, sizeof name);
// Cut the name at its first space.
594 name[strcspn (name, " ")] = '\0';
596 /* Check variable name. */
597 if (name[0] == '$' || name[0] == '#')
598 sys_error (r, _("Variable name begins with invalid character `%c'."),
600 if (!var_is_plausible_name (name, false))
601 sys_error (r, _("Invalid variable name `%s'."), name);
603 /* Create variable. */
604 if (width < 0 || width > 255)
605 sys_error (r, _("Bad width %d for variable %s."), width, name);
606 var = dict_create_var (dict, name, width);
609 _("Duplicate variable name `%s' within system file."),
612 /* Set the short name the same as the long name. */
613 var_set_short_name (var, 0, var_get_name (var));
615 /* Get variable label, if any. */
616 if (has_variable_label != 0 && has_variable_label != 1)
617 sys_error (r, _("Variable label indicator field is not 0 or 1."));
618 if (has_variable_label == 1)
620 size_t len, read_len;
625 /* Read up to 255 bytes of label. */
626 read_len = MIN (sizeof label - 1, len);
627 read_string (r, label, read_len + 1);
628 var_set_label (var, label);
630 /* Skip unread label bytes. */
631 skip_bytes (r, len - read_len);
633 /* Skip label padding up to multiple of 4 bytes. */
634 skip_bytes (r, ROUND_UP (len, 4) - len);
637 /* Set missing values. */
638 if (missing_value_code != 0)
640 struct missing_values mv;
643 mv_init_pool (r->pool, &mv, var_get_width (var));
644 if (var_is_numeric (var))
646 if (missing_value_code < -3 || missing_value_code > 3
647 || missing_value_code == -1)
648 sys_error (r, _("Numeric missing value indicator field is not "
649 "-3, -2, 0, 1, 2, or 3."));
// A negative code means a range comes first.  After the range is added
// the code is turned into the number of discrete values still to read:
// -2 becomes 0 and -3 becomes 1.
650 if (missing_value_code < 0)
652 double low = read_float (r);
653 double high = read_float (r);
654 mv_add_range (&mv, low, high);
655 missing_value_code = -missing_value_code - 2;
657 for (i = 0; i < missing_value_code; i++)
658 mv_add_num (&mv, read_float (r));
// String case: the value buffer is at least 8 bytes wide because each
// missing value is read as 8 raw bytes.
662 int mv_width = MAX (width, 8);
665 if (missing_value_code < 1 || missing_value_code > 3)
666 sys_error (r, _("String missing value indicator field is not "
669 value_init (&value, mv_width);
670 value_set_missing (&value, mv_width);
671 for (i = 0; i < missing_value_code; i++)
673 uint8_t *s = value_str_rw (&value, mv_width);
674 read_bytes (r, s, 8);
677 value_destroy (&value, mv_width);
679 var_set_missing_values (var, &mv);
683 parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
684 parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
686 /* Account for values.
687 Skip long string continuation records, if any. */
// Width 0 counts as one value; otherwise one value per 8 bytes of width,
// rounded up.
688 nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
// Every value after the first must be backed by a continuation record.
694 for (i = 1; i < nv; i++)
696 /* Check for record type 2 and width -1. */
697 if (read_int (r) != 2 || read_int (r) != -1)
698 sys_error (r, _("Missing string continuation record."));
700 /* Skip and ignore remaining continuation data. */
701 has_variable_label = read_int (r);
702 missing_value_code = read_int (r);
703 print_format = read_int (r);
704 write_format = read_int (r);
705 read_string (r, name, sizeof name);
707 /* Variable label fields on continuation records have
708 been spotted in system files created by "SPSS Power
709 Macintosh Release 6.1". */
// The label length is read and rounded up to a multiple of 4 before the
// label bytes are skipped.
710 if (has_variable_label)
711 skip_bytes (r, ROUND_UP (read_int (r), 4));
// Decodes the packed format word S, validates it against the width of V,
// and installs it as the print or write format of V depending on WHICH.
// Invalid formats are reported as warnings, counted through
// FORMAT_WARNING_CNT and capped at max_format_warnings.
// NOTE(review): the lines that decode the remaining fields of S are not
// visible in this excerpt.
716 /* Translates the format spec from sysfile format to internal
719 parse_format_spec (struct sfm_reader *r, unsigned int s,
720 enum which_format which, struct variable *v,
721 int *format_warning_cnt)
723 const int max_format_warnings = 8;
// The raw format type is S shifted right by 16 bits, truncated to 8 bits.
725 uint8_t raw_type = s >> 16;
731 if (!fmt_from_io (raw_type, &f.type))
732 sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
// The format must be valid for output and compatible with the width of V.
737 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
742 if (which == PRINT_FORMAT)
743 var_set_print_format (v, &f);
745 var_set_write_format (v, &f);
// The counter is bumped for every invalid format, but a warning is only
// printed while the counter is within the cap.
747 else if (++*format_warning_cnt <= max_format_warnings)
749 char fmt_string[FMT_STRING_LEN_MAX + 1];
750 sys_warn (r, _("%s variable %s has invalid %s format %s."),
751 var_is_numeric (v) ? _("Numeric") : _("String"),
753 which == PRINT_FORMAT ? _("print") : _("write"),
754 fmt_to_string (&f, fmt_string));
// The warning that reaches the cap is followed by a suppression notice.
756 if (*format_warning_cnt == max_format_warnings)
757 sys_warn (r, _("Suppressing further invalid format warnings."));
// Looks up the variable at WEIGHT_IDX through VAR_BY_VALUE_IDX and makes
// it the weight of DICT when it is numeric; otherwise an error is raised.
// NOTE(review): the guard on WEIGHT_IDX is not visible in this excerpt.
761 /* Sets the weighting variable in DICT to the variable
762 corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
765 setup_weight (struct sfm_reader *r, int weight_idx,
766 struct variable **var_by_value_idx, struct dictionary *dict)
770 struct variable *weight_var
771 = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
772 if (var_is_numeric (weight_var))
773 dict_set_weight (dict, weight_var);
775 sys_error (r, _("Weighting variable must be numeric."));
// Reads the document lines of a type 6 record into a pool buffer and
// stores them in DICT.  A second document record, a bad line count, or a
// null byte inside the text is reported with sys_error.
// NOTE(review): the line count check and some declarations are not
// visible in this excerpt.
779 /* Reads a document record, type 6, from system file R, and sets up
780 the documents and n_documents fields in the associated
783 read_documents (struct sfm_reader *r, struct dictionary *dict)
788 if (dict_get_documents (dict) != NULL)
789 sys_error (r, _("Multiple type 6 (document) records."));
791 line_cnt = read_int (r);
793 sys_error (r, _("Number of document lines (%d) "
794 "must be greater than 0."), line_cnt);
// Room for one extra line is allocated; read_string is given one byte
// more than the text length, presumably for a terminator (TODO confirm).
796 documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
797 read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
// A shorter string length means the text contained a null byte.
798 if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
799 dict_set_documents (dict, documents);
801 sys_error (r, _("Document line contains null byte."));
802 pool_free (r->pool, documents);
// Reads the subtype, element size, and element count of a type 7 record
// and hands the payload to the reader for that subtype.  The final
// skip_bytes call discards a payload of size times count bytes.
// NOTE(review): the switch statement, its case labels, and the control
// flow between the branches are not visible in this excerpt.
805 /* Read a type 7 extension record. */
807 read_extension_record (struct sfm_reader *r, struct dictionary *dict,
808 struct sfm_read_info *info)
810 int subtype = read_int (r);
811 size_t size = read_int (r);
812 size_t count = read_int (r);
813 size_t bytes = size * count;
815 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
816 allows an extra byte for a null terminator, used by some
817 extension processing routines. */
// NOTE(review): this message is not wrapped in the _ translation macro,
// unlike the other messages in this function.
818 if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
819 sys_error (r, "Record type 7 subtype %d too large.", subtype);
824 read_machine_integer_info (r, size, count, info, dict);
828 read_machine_float_info (r, size, count);
832 /* Variable sets information. We don't use these yet.
833 They only apply to GUIs; see VARSETS on the APPLY
834 DICTIONARY command in SPSS documentation. */
838 /* DATE variable information. We don't use it yet, but we
844 read_mrsets (r, size, count, dict);
848 /* Used by the SPSS Data Entry software. */
852 read_display_parameters (r, size, count, dict);
856 read_long_var_name_map (r, size, count, dict);
860 read_long_string_map (r, size, count, dict);
864 /* Extended number of cases. Not important. */
868 read_data_file_attributes (r, size, count, dict);
872 read_variable_attributes (r, size, count, dict);
876 /* New in SPSS 16. Contains a single string that describes
877 the character encoding, e.g. "windows-1252". */
// NOTE(review): the allocation multiplies by size but read_string is given
// only count + 1, which matches the payload only when size is 1 (confirm).
879 char *encoding = pool_calloc (r->pool, size, count + 1);
880 read_string (r, encoding, count + 1);
881 dict_set_encoding (dict, encoding);
886 /* New in SPSS 16. Encodes value labels for long string
888 read_long_string_value_labels (r, size, count, dict);
// Unknown subtypes are reported with a request to send in the file.
892 sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send "
893 "a copy of this file, and the syntax which created it "
895 subtype, PACKAGE_BUGREPORT);
899 skip_bytes (r, bytes);
902 /* Read record type 7, subtype 3. */
904 read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
905 struct sfm_read_info *info,
906 struct dictionary *dict)
908 int version_major = read_int (r);
909 int version_minor = read_int (r);
910 int version_revision = read_int (r);
911 int machine_code UNUSED = read_int (r);
912 int float_representation = read_int (r);
913 int compression_code UNUSED = read_int (r);
914 int integer_representation = read_int (r);
915 int character_code = read_int (r);
917 int expected_float_format;
918 int expected_integer_format;
920 if (size != 4 || count != 8)
921 sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
925 /* Save version info. */
926 info->version_major = version_major;
927 info->version_minor = version_minor;
928 info->version_revision = version_revision;
930 /* Check floating point format. */
931 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
932 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
933 expected_float_format = 1;
934 else if (r->float_format == FLOAT_Z_LONG)
935 expected_float_format = 2;
936 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
937 expected_float_format = 3;
940 if (float_representation != expected_float_format)
941 sys_error (r, _("Floating-point representation indicated by "
942 "system file (%d) differs from expected (%d)."),
943 r->float_format, expected_float_format);
945 /* Check integer format. */
946 if (r->integer_format == INTEGER_MSB_FIRST)
947 expected_integer_format = 1;
948 else if (r->integer_format == INTEGER_LSB_FIRST)
949 expected_integer_format = 2;
952 if (integer_representation != expected_integer_format)
954 static const char *const endian[] = {N_("Little Endian"), N_("Big Endian")};
955 sys_warn (r, _("Integer format indicated by system file (%s) "
956 "differs from expected (%s)."),
957 gettext (endian[integer_representation == 1]),
958 gettext (endian[expected_integer_format == 1]));
963 Record 7 (20) provides a much more reliable way of
964 setting the encoding.
965 The character_code is used as a fallback only.
967 if ( NULL == dict_get_encoding (dict))
969 switch (character_code)
972 dict_set_encoding (dict, "EBCDIC-US");
976 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
977 respectively. However, there are known to be many files
978 in the wild with character code 2, yet have data which are
980 Therefore we ignore these values.
984 dict_set_encoding (dict, "MS_KANJI");
987 dict_set_encoding (dict, "UTF-7");
990 dict_set_encoding (dict, "UTF-8");
995 snprintf (enc, 100, "CP%d", character_code);
996 dict_set_encoding (dict, enc);
// Reads the three floating-point values of extension subtype 4 and warns
// when any of them differs from the SYSMIS, HIGHEST, or LOWEST constant
// it is compared with.  A size other than 8 or a count other than 3 is
// reported with sys_error.
// NOTE(review): the three values are read before size and count are
// checked.  Some argument lines are not visible in this excerpt.
1003 /* Read record type 7, subtype 4. */
1005 read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
1007 double sysmis = read_float (r);
1008 double highest = read_float (r);
1009 double lowest = read_float (r);
1011 if (size != 8 || count != 3)
1012 sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
1015 if (sysmis != SYSMIS)
1016 sys_warn (r, _("File specifies unexpected value %g as %s."),
1019 if (highest != HIGHEST)
1020 sys_warn (r, _("File specifies unexpected value %g as %s."),
1021 highest, "HIGHEST");
1023 if (lowest != LOWEST)
1024 sys_warn (r, _("File specifies unexpected value %g as %s."),
// Parses a multiple response set record as text.  Each set has a name, a
// type letter, an optional counted value, a label, and a list of variable
// names ended by a new-line.  Sets that parse cleanly are added to DICT.
// NOTE(review): the loop structure, the break statements after warnings,
// and several declarations are not visible in this excerpt.
1028 /* Read record type 7, subtype 7 or 19. */
1030 read_mrsets (struct sfm_reader *r, size_t size, size_t count,
1031 struct dictionary *dict)
1033 struct text_record *text;
1034 struct mrset *mrset;
1036 text = open_text_record (r, size * count);
1039 const char *name, *label, *counted;
1040 struct stringi_set var_names;
1041 size_t allocated_vars;
1045 mrset = xzalloc (sizeof *mrset);
// The set name runs up to the equals sign and has to begin with a dollar.
1047 name = text_get_token (text, ss_cstr ("="), NULL);
1050 mrset->name = xstrdup (name);
1052 if (mrset->name[0] != '$')
1054 sys_warn (r, _("`%s' does not begin with `$' at offset %zu "
1055 "in MRSETS record."), mrset->name, text_pos (text));
// Type letter: C selects MRSET_MC; D selects MRSET_MD with categories
// from variable labels; E selects MRSET_MD with categories from counted
// values and is followed by a label source number.
1059 if (text_match (text, 'C'))
1061 mrset->type = MRSET_MC;
1062 if (!text_match (text, ' '))
1064 sys_warn (r, _("Missing space following `%c' at offset %zu "
1065 "in MRSETS record."), 'C', text_pos (text));
1069 else if (text_match (text, 'D'))
1071 mrset->type = MRSET_MD;
1072 mrset->cat_source = MRSET_VARLABELS;
1074 else if (text_match (text, 'E'))
1078 mrset->type = MRSET_MD;
1079 mrset->cat_source = MRSET_COUNTEDVALUES;
1080 if (!text_match (text, ' '))
1082 sys_warn (r, _("Missing space following `%c' at offset %zu "
1083 "in MRSETS record."), 'E', text_pos (text));
// The number 11 turns on label_from_var_label; any number other than 1
// or 11 is reported with a warning.
1087 number = text_get_token (text, ss_cstr (" "), NULL);
1088 if (!strcmp (number, "11"))
1089 mrset->label_from_var_label = true;
1090 else if (strcmp (number, "1"))
1091 sys_warn (r, _("Unexpected label source value `%s' "
1092 "following `E' at offset %zu in MRSETS record."),
1093 number, text_pos (text));
1097 sys_warn (r, _("Missing `C', `D', or `E' at offset %zu "
1098 "in MRSETS record."),
// Only MRSET_MD sets carry a counted value ahead of the label.
1103 if (mrset->type == MRSET_MD)
1105 counted = text_parse_counted_string (r, text);
1106 if (counted == NULL)
1110 label = text_parse_counted_string (r, text);
1113 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
// var_names is used to detect a variable that is listed twice.
1115 stringi_set_init (&var_names);
1120 struct variable *var;
1121 const char *var_name;
1123 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1124 if (var_name == NULL)
1126 sys_warn (r, _("Missing new-line parsing variable names "
1127 "at offset %zu in MRSETS record."),
1132 var = lookup_var_by_short_name (dict, var_name);
1135 if (!stringi_set_insert (&var_names, var_name))
1137 sys_warn (r, _("Duplicate variable name %s "
1138 "at offset %zu in MRSETS record."),
1139 var_name, text_pos (text));
// When requested, the set label is taken from the first labelled
// variable, but only while the set has no label of its own.
1143 if (mrset->label == NULL && mrset->label_from_var_label
1144 && var_has_label (var))
1145 mrset->label = xstrdup (var_get_label (var));
1148 && var_get_type (var) != var_get_type (mrset->vars[0]))
1150 sys_warn (r, _("MRSET %s contains both string and "
1151 "numeric variables."), name);
// Track the narrowest member width seen so far.
1154 width = MIN (width, var_get_width (var));
1156 if (mrset->n_vars >= allocated_vars)
1157 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1158 sizeof *mrset->vars);
1159 mrset->vars[mrset->n_vars++] = var;
// The variable list ends at the first new-line delimiter.
1161 while (delimiter != '\n');
1163 if (mrset->n_vars < 2)
1165 sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
1167 mrset_destroy (mrset);
// For MRSET_MD the counted value is stored at the narrowest member width,
// either parsed as a number or copied as a string padded with spaces.
1171 if (mrset->type == MRSET_MD)
1173 mrset->width = width;
1174 value_init (&mrset->counted, width);
1176 mrset->counted.f = strtod (counted, NULL);
1178 value_copy_str_rpad (&mrset->counted, width,
1179 (const uint8_t *) counted, ' ');
1182 dict_add_mrset (dict, mrset);
1184 stringi_set_destroy (&var_names);
1186 mrset_destroy (mrset);
1187 close_text_record (r, text);
// Applies measurement level, display width, and alignment to each
// variable in DICT.  The record holds either two or three integers per
// variable; the three-integer form includes the display width.
// NOTE(review): the size check, some declarations, and the substitution
// of default values are not visible in this excerpt.
1190 /* Read record type 7, subtype 11, which specifies how variables
1191 should be displayed in GUI environments. */
1193 read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
1194 struct dictionary *dict)
1197 bool includes_width;
1198 bool warned = false;
// A record with a bad size is skipped after a warning.
1203 sys_warn (r, _("Bad size %zu on extension 11."), size);
1204 skip_bytes (r, size * count);
// The count has to be exactly two or three times the variable count.
1208 n_vars = dict_get_var_cnt (dict);
1209 if (count == 3 * n_vars)
1210 includes_width = true;
1211 else if (count == 2 * n_vars)
1212 includes_width = false;
1215 sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
1217 skip_bytes (r, size * count);
1221 for (i = 0; i < n_vars; ++i)
1223 struct variable *v = dict_get_var (dict, i);
1224 int measure = read_int (r);
1225 int width = includes_width ? read_int (r) : 0;
1226 int align = read_int (r);
1228 /* SPSS 14 sometimes seems to set string variables' measure
1230 if (0 == measure && var_is_alpha (v))
// Measure must be 1 to 3 and alignment 0 to 2; anything else is reported.
1233 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1236 sys_warn (r, _("Invalid variable display parameters "
1237 "for variable %zu (%s). "
1238 "Default parameters substituted."),
1239 i, var_get_name (v));
1244 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1245 : measure == 2 ? MEASURE_ORDINAL
1247 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1248 : align == 1 ? ALIGN_RIGHT
1251 /* Older versions (SPSS 9.0) sometimes set the display
1252 width to zero. This causes confusion in the GUI, so
1253 only set the width if it is nonzero. */
1255 var_set_display_width (v, width);
1259 /* Reads record type 7, subtype 13, which gives the long name
1260 that corresponds to each short name. Modifies variable names
1261 in DICT accordingly. */
1263 read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
1264 struct dictionary *dict)
1266 struct text_record *text;
1267 struct variable *var;
1270 text = open_text_record (r, size * count);
1271 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1274 size_t short_name_cnt;
1277 /* Validate long name. */
1278 if (!var_is_valid_name (long_name, false))
1280 sys_warn (r, _("Long variable mapping from %s to invalid "
1281 "variable name `%s'."),
1282 var_get_name (var), long_name);
1286 /* Identify any duplicates. */
1287 if (strcasecmp (var_get_short_name (var, 0), long_name)
1288 && dict_lookup_var (dict, long_name) != NULL)
1290 sys_warn (r, _("Duplicate long variable name `%s' "
1291 "within system file."), long_name);
1295 /* Renaming a variable may clear its short names, but we
1296 want to retain them, so we save them and re-set them
1298 short_name_cnt = var_get_short_name_cnt (var);
1299 short_names = xnmalloc (short_name_cnt, sizeof *short_names);
1300 for (i = 0; i < short_name_cnt; i++)
1302 const char *s = var_get_short_name (var, i);
1303 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1306 /* Set long name. */
1307 dict_rename_var (dict, var, long_name);
1309 /* Restore short names. */
1310 for (i = 0; i < short_name_cnt; i++)
1312 var_set_short_name (var, i, short_names[i]);
1313 free (short_names[i]);
1317 close_text_record (r, text);
1318 r->has_long_var_names = true;
1321 /* Reads record type 7, subtype 14, which gives the real length
1322 of each very long string. Rearranges DICT accordingly. */
1324 read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
1325 struct dictionary *dict)
1327 struct text_record *text;
1328 struct variable *var;
1331 text = open_text_record (r, size * count);
1332 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1334 size_t idx = var_get_dict_index (var);
1340 length = strtol (length_s, NULL, 10);
1341 if (length < 1 || length > MAX_STRING)
1343 sys_warn (r, _("%s listed as string of invalid length %s "
1344 "in very long string record."),
1345 var_get_name (var), length_s);
1349 /* Check segments. */
1350 segment_cnt = sfm_width_to_segments (length);
1351 if (segment_cnt == 1)
1353 sys_warn (r, _("%s listed in very long string record with width %s, "
1354 "which requires only one segment."),
1355 var_get_name (var), length_s);
1358 if (idx + segment_cnt > dict_get_var_cnt (dict))
1359 sys_error (r, _("Very long string %s overflows dictionary."),
1360 var_get_name (var));
1362 /* Get the short names from the segments and check their
1364 for (i = 0; i < segment_cnt; i++)
1366 struct variable *seg = dict_get_var (dict, idx + i);
1367 int alloc_width = sfm_segment_alloc_width (length, i);
1368 int width = var_get_width (seg);
1371 var_set_short_name (var, i, var_get_short_name (seg, 0));
1372 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1373 sys_error (r, _("Very long string with width %ld has segment %d "
1374 "of width %d (expected %d)."),
1375 length, i, width, alloc_width);
1377 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1378 var_set_width (var, length);
1380 close_text_record (r, text);
1381 dict_compact_values (dict);
1384 /* Reads value labels from sysfile H and inserts them into the
1385 associated dictionary. */
1387 read_value_labels (struct sfm_reader *r,
1388 struct dictionary *dict, struct variable **var_by_value_idx)
1390 struct pool *subpool;
1394 uint8_t raw_value[8]; /* Value as uninterpreted bytes. */
1395 union value value; /* Value. */
1396 char *label; /* Null-terminated label string. */
1399 struct label *labels = NULL;
1400 int label_cnt; /* Number of labels. */
1402 struct variable **var = NULL; /* Associated variables. */
1403 int var_cnt; /* Number of associated variables. */
1404 int max_width; /* Maximum width of string variables. */
1408 subpool = pool_create_subpool (r->pool);
1410 /* Read the type 3 record and record its contents. We can't do
1411 much with the data yet because we don't know whether it is
1412 of numeric or string type. */
1414 /* Read number of labels. */
1415 label_cnt = read_int (r);
1417 if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
1419 sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
1424 /* Read each value/label tuple into labels[]. */
1425 labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
1426 for (i = 0; i < label_cnt; i++)
1428 struct label *label = labels + i;
1429 unsigned char label_len;
1433 read_bytes (r, label->raw_value, sizeof label->raw_value);
1435 /* Read label length. */
1436 read_bytes (r, &label_len, sizeof label_len);
1437 padded_len = ROUND_UP (label_len + 1, 8);
1439 /* Read label, padding. */
1440 label->label = pool_alloc (subpool, padded_len + 1);
1441 read_bytes (r, label->label, padded_len - 1);
1442 label->label[label_len] = 0;
1445 /* Now, read the type 4 record that has the list of variables
1446 to which the value labels are to be applied. */
1448 /* Read record type of type 4 record. */
1449 if (read_int (r) != 4)
1450 sys_error (r, _("Variable index record (type 4) does not immediately "
1451 "follow value label record (type 3) as it should."));
1453 /* Read number of variables associated with value label from type 4
1455 var_cnt = read_int (r);
1456 if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
1457 sys_error (r, _("Number of variables associated with a value label (%d) "
1458 "is not between 1 and the number of variables (%zu)."),
1459 var_cnt, dict_get_var_cnt (dict));
1461 /* Read the list of variables. */
1462 var = pool_nalloc (subpool, var_cnt, sizeof *var);
1464 for (i = 0; i < var_cnt; i++)
1466 var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
1467 if (var_get_width (var[i]) > 8)
1468 sys_error (r, _("Value labels may not be added to long string "
1469 "variables (e.g. %s) using records types 3 and 4."),
1470 var_get_name (var[i]));
1471 max_width = MAX (max_width, var_get_width (var[i]));
1474 /* Type check the variables. */
1475 for (i = 1; i < var_cnt; i++)
1476 if (var_get_type (var[i]) != var_get_type (var[0]))
1477 sys_error (r, _("Variables associated with value label are not all of "
1478 "identical type. Variable %s is %s, but variable "
1480 var_get_name (var[0]),
1481 var_is_numeric (var[0]) ? _("numeric") : _("string"),
1482 var_get_name (var[i]),
1483 var_is_numeric (var[i]) ? _("numeric") : _("string"));
1485 /* Fill in labels[].value, now that we know the desired type. */
1486 for (i = 0; i < label_cnt; i++)
1488 struct label *label = labels + i;
1490 value_init_pool (subpool, &label->value, max_width);
1491 if (var_is_alpha (var[0]))
1492 u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
1493 label->raw_value, sizeof label->raw_value, ' ');
1495 label->value.f = float_get_double (r->float_format, label->raw_value);
1498 /* Assign the `value_label's to each variable. */
1499 for (i = 0; i < var_cnt; i++)
1501 struct variable *v = var[i];
1504 /* Add each label to the variable. */
1505 for (j = 0; j < label_cnt; j++)
1507 struct label *label = &labels[j];
1508 if (!var_add_value_label (v, &label->value, label->label))
1510 if (var_is_numeric (var[0]))
1511 sys_warn (r, _("Duplicate value label for %g on %s."),
1512 label->value.f, var_get_name (v));
1514 sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
1515 max_width, value_str (&label->value, max_width),
1521 pool_destroy (subpool);
/* Reads a set of custom attributes from TEXT into ATTRS.
   ATTRS may be a null pointer, in which case the attributes are
   read but discarded.

   Each attribute is a key followed by "(", then one value per
   line, then ")".  Parsing stops at a "/" after an attribute or
   when no further key can be read. */
static void
read_attributes (struct sfm_reader *r, struct text_record *text,
                 struct attrset *attrs)
{
  do
    {
      struct attribute *attr;
      char *key;
      int index = 0;

      /* Parse the key. */
      key = text_get_token (text, ss_cstr ("("), NULL);
      if (key == NULL)
        return;

      attr = attribute_create (key);
      do
        {
          char *value;
          size_t len;

          /* Values are numbered from 1 in warnings. */
          index++;

          /* Parse the value. */
          value = text_get_token (text, ss_cstr ("\n"), NULL);
          if (value == NULL)
            {
              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
                         key, index);
              break;
            }

          len = strlen (value);
          if (len >= 2 && value[0] == '\'' && value[len - 1] == '\'')
            {
              /* Strip the surrounding single quotes. */
              value[len - 1] = '\0';
              attribute_add_value (attr, value + 1);
            }
          else
            {
              text_warn (r, text,
                         _("Attribute value %s[%d] is not quoted: %s."),
                         key, index, value);
              attribute_add_value (attr, value);
            }
        }
      while (!text_match (text, ')')); /* ")" ends this attribute. */

      if (attrs != NULL)
        attrset_add (attrs, attr);
      else
        attribute_destroy (attr);
    }
  while (!text_match (text, '/'));
}
/* Reads record type 7, subtype 17, which lists custom
   attributes on the data file, and stores them in DICT's
   attribute set. */
static void
read_data_file_attributes (struct sfm_reader *r,
                           size_t size, size_t count,
                           struct dictionary *dict)
{
  struct text_record *text;
  struct attrset *file_attrs;

  text = open_text_record (r, size * count);
  file_attrs = dict_get_attributes (dict);
  read_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
/* Skips past N_LABELS value/label pairs in R.  Each value and
   each label is stored as a 32-bit length followed by that many
   bytes. */
static void
skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
{
  size_t remaining;

  for (remaining = n_labels; remaining > 0; remaining--)
    {
      size_t value_len;
      size_t label_len;

      value_len = read_int (r);
      skip_bytes (r, value_len);

      label_len = read_int (r);
      skip_bytes (r, label_len);
    }
}
1612 read_long_string_value_labels (struct sfm_reader *r,
1613 size_t size, size_t count,
1614 struct dictionary *d)
1616 const off_t start = ftello (r->file);
1617 while (ftello (r->file) - start < size * count)
1619 char var_name[VAR_NAME_LEN + 1];
1627 var_name_len = read_int (r);
1628 if (var_name_len > VAR_NAME_LEN)
1629 sys_error (r, _("Variable name length in long string value label "
1630 "record (%d) exceeds %d-byte limit."),
1631 var_name_len, VAR_NAME_LEN);
1632 read_string (r, var_name, var_name_len + 1);
1633 width = read_int (r);
1634 n_labels = read_int (r);
1636 v = dict_lookup_var (d, var_name);
1639 sys_warn (r, _("Ignoring long string value record for "
1640 "unknown variable %s."), var_name);
1641 skip_long_string_value_labels (r, n_labels);
1644 if (var_is_numeric (v))
1646 sys_warn (r, _("Ignoring long string value record for "
1647 "numeric variable %s."), var_name);
1648 skip_long_string_value_labels (r, n_labels);
1651 if (width != var_get_width (v))
1653 sys_warn (r, _("Ignoring long string value record for variable %s "
1654 "because the record's width (%d) does not match the "
1655 "variable's width (%d)."),
1656 var_name, width, var_get_width (v));
1657 skip_long_string_value_labels (r, n_labels);
1662 value_init_pool (r->pool, &value, width);
1663 for (i = 0; i < n_labels; i++)
1665 size_t value_length, label_length;
1670 value_length = read_int (r);
1671 if (value_length == width)
1672 read_bytes (r, value_str_rw (&value, width), width);
1675 sys_warn (r, _("Ignoring long string value %zu for variable %s, "
1676 "with width %d, that has bad value width %zu."),
1677 i, var_get_name (v), width, value_length);
1678 skip_bytes (r, value_length);
1683 label_length = read_int (r);
1684 read_string (r, label, MIN (sizeof label, label_length + 1));
1685 if (label_length >= sizeof label)
1687 /* Skip and silently ignore label text after the
1688 first 255 bytes. The maximum documented length
1689 of a label is 120 bytes so this is more than
1691 skip_bytes (r, (label_length + 1) - sizeof label);
1694 if (!skip && !var_add_value_label (v, &value, label))
1695 sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
1696 width, value_str (&value, width), var_get_name (v));
/* Reads record type 7, subtype 18, which lists custom
   attributes on individual variables.

   The record is a series of variable names, each followed by ":"
   and that variable's attributes.  Reading stops as soon as
   text_read_variable_name() reports failure. */
static void
read_variable_attributes (struct sfm_reader *r,
                          size_t size, size_t count,
                          struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, size * count);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      struct attrset *attrs = var != NULL ? var_get_attributes (var) : NULL;
      read_attributes (r, text, attrs);
    }

  close_text_record (r, text);
}
1723 static void partial_record (struct sfm_reader *r)
1726 static void read_error (struct casereader *, const struct sfm_reader *);
1728 static bool read_case_number (struct sfm_reader *, double *);
1729 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1730 static int read_opcode (struct sfm_reader *);
1731 static bool read_compressed_number (struct sfm_reader *, double *);
1732 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1733 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1734 static bool skip_whole_strings (struct sfm_reader *, size_t);
1736 /* Reads and returns one case from READER's file. Returns a null
1737 pointer if not successful. */
1738 static struct ccase *
1739 sys_file_casereader_read (struct casereader *reader, void *r_)
1741 struct sfm_reader *r = r_;
1742 struct ccase *volatile c;
1748 c = case_create (r->proto);
1749 if (setjmp (r->bail_out))
1751 casereader_force_error (reader);
1756 for (i = 0; i < r->sfm_var_cnt; i++)
1758 struct sfm_var *sv = &r->sfm_vars[i];
1759 union value *v = case_data_rw_idx (c, sv->case_index);
1761 if (sv->var_width == 0)
1763 if (!read_case_number (r, &v->f))
1768 uint8_t *s = value_str_rw (v, sv->var_width);
1769 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1771 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
1780 if (r->case_cnt != -1)
1781 read_error (reader, r);
/* Reports, through sys_error(), that R's file ends in the middle
   of a case. */
static void
partial_record (struct sfm_reader *r)
{
  sys_error (r, _("File ends in partial case."));
}
1793 /* Issues an error that an unspecified error occurred SFM, and
1796 read_error (struct casereader *r, const struct sfm_reader *sfm)
1798 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
1799 casereader_force_error (r);
1802 /* Reads a number from R and stores its value in *D.
1803 If R is compressed, reads a compressed number;
1804 otherwise, reads a number in the regular way.
1805 Returns true if successful, false if end of file is
1806 reached immediately. */
1808 read_case_number (struct sfm_reader *r, double *d)
1813 if (!try_read_bytes (r, number, sizeof number))
1815 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
1819 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.
   Always reads a multiple of 8 bytes; if LENGTH is not a
   multiple of 8, then the final 8-byte unit goes through a
   bounce buffer and only the bytes that belong to S are copied.
   Reads compressed strings if R is compressed.
   Returns true if successful, false if end of file is
   reached immediately. */
static bool
read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
{
  const size_t tail = length % 8;
  const size_t whole = ROUND_DOWN (length, 8);

  if (whole != 0 && !read_whole_strings (r, s, whole))
    return false;

  if (tail != 0)
    {
      uint8_t bounce[8];

      if (!read_whole_strings (r, bounce, sizeof bounce))
        {
          /* End of file after part of the string was already
             read means the case is truncated. */
          if (whole != 0)
            partial_record (r);
          return false;
        }
      memcpy (s + whole, bounce, tail);
    }

  return true;
}
1856 /* Reads and returns the next compression opcode from R. */
1858 read_opcode (struct sfm_reader *r)
1860 assert (r->compressed);
1864 if (r->opcode_idx >= sizeof r->opcodes)
1866 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
1870 opcode = r->opcodes[r->opcode_idx++];
1877 /* Reads a compressed number from R and stores its value in D.
1878 Returns true if successful, false if end of file is
1879 reached immediately. */
1881 read_compressed_number (struct sfm_reader *r, double *d)
1883 int opcode = read_opcode (r);
1891 *d = read_float (r);
1895 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
1896 if (!r->corruption_warning)
1898 r->corruption_warning = true;
1899 sys_warn (r, _("Possible compressed data corruption: "
1900 "compressed spaces appear in numeric field."));
1909 *d = opcode - r->bias;
1916 /* Reads a compressed 8-byte string segment from R and stores it
1918 Returns true if successful, false if end of file is
1919 reached immediately. */
1921 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
1923 int opcode = read_opcode (r);
1931 read_bytes (r, dst, 8);
1935 memset (dst, ' ', 8);
1940 double value = opcode - r->bias;
1941 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
1944 /* This has actually been seen "in the wild". The submitter of the
1945 file that showed that the contents decoded as spaces, but they
1946 were at the end of the field so it's possible that the null
1947 bytes just acted as null terminators. */
1949 else if (!r->corruption_warning)
1951 r->corruption_warning = true;
1952 sys_warn (r, _("Possible compressed data corruption: "
1953 "string contains compressed integer (opcode %d)."),
1963 /* Reads LENGTH string bytes from R into S.
1964 LENGTH must be a multiple of 8.
1965 Reads compressed strings if S is compressed.
1966 Returns true if successful, false if end of file is
1967 reached immediately. */
1969 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
1971 assert (length % 8 == 0);
1973 return try_read_bytes (r, s, length);
1977 for (ofs = 0; ofs < length; ofs += 8)
1978 if (!read_compressed_string (r, s + ofs))
/* Skips LENGTH string bytes from R by reading them into a
   scratch buffer.
   LENGTH must be a multiple of 8 and smaller than the 1024-byte
   scratch buffer (the current caller never needs more).
   Returns true if successful, false if end of file is
   reached immediately. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t scratch[1024];

  assert (length < sizeof scratch);
  return read_whole_strings (r, scratch, length);
}
2002 /* Creates and returns a table that can be used for translating a value
2003 index into a case to a "struct variable *" for DICT. Multiple
2004 system file fields reference variables this way.
2006 This table must be created before processing the very long
2007 string extension record, because that record causes some
2008 values to be deleted from the case and the dictionary to be
2010 static struct variable **
2011 make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
2013 struct variable **var_by_value_idx;
2017 var_by_value_idx = pool_nmalloc (r->pool,
2018 r->oct_cnt, sizeof *var_by_value_idx);
2019 for (i = 0; i < dict_get_var_cnt (dict); i++)
2021 struct variable *v = dict_get_var (dict, i);
2022 int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
2025 var_by_value_idx[value_idx++] = v;
2026 for (j = 1; j < nv; j++)
2027 var_by_value_idx[value_idx++] = NULL;
2029 assert (value_idx == r->oct_cnt);
2031 return var_by_value_idx;
2034 /* Returns the "struct variable" corresponding to the given
2035 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
2037 static struct variable *
2038 lookup_var_by_value_idx (struct sfm_reader *r,
2039 struct variable **var_by_value_idx, int value_idx)
2041 struct variable *var;
2043 if (value_idx < 1 || value_idx > r->oct_cnt)
2044 sys_error (r, _("Variable index %d not in valid range 1...%d."),
2045 value_idx, r->oct_cnt);
2047 var = var_by_value_idx[value_idx - 1];
2049 sys_error (r, _("Variable index %d refers to long string "
/* Returns the variable in D whose first short name equals
   SHORT_NAME, ignoring case, or a null pointer if there is
   none. */
static struct variable *
lookup_var_by_short_name (struct dictionary *d, const char *short_name)
{
  struct variable *candidate;
  size_t n_vars;
  size_t idx;

  /* First try looking up by full name.  This often succeeds. */
  candidate = dict_lookup_var (d, short_name);
  if (candidate != NULL
      && strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
    return candidate;

  /* Iterate through the whole dictionary as a fallback. */
  n_vars = dict_get_var_cnt (d);
  for (idx = 0; idx < n_vars; idx++)
    {
      candidate = dict_get_var (d, idx);
      if (strcasecmp (var_get_short_name (candidate, 0), short_name) == 0)
        return candidate;
    }

  return NULL;
}
2082 /* Helpers for reading records that contain structured text
2085 /* Maximum number of warnings to issue for a single text
2087 #define MAX_TEXT_WARNINGS 5
2092 struct substring buffer; /* Record contents. */
2093 size_t pos; /* Current position in buffer. */
2094 int n_warnings; /* Number of warnings issued or suppressed. */
2097 /* Reads SIZE bytes into a text record for R,
2098 and returns the new text record. */
2099 static struct text_record *
2100 open_text_record (struct sfm_reader *r, size_t size)
2102 struct text_record *text = pool_alloc (r->pool, sizeof *text);
2103 char *buffer = pool_malloc (r->pool, size + 1);
2104 read_bytes (r, buffer, size);
2105 text->buffer = ss_buffer (buffer, size);
2107 text->n_warnings = 0;
2111 /* Closes TEXT, frees its storage, and issues a final warning
2112 about suppressed warnings if necesary. */
2114 close_text_record (struct sfm_reader *r, struct text_record *text)
2116 if (text->n_warnings > MAX_TEXT_WARNINGS)
2117 sys_warn (r, _("Suppressed %d additional related warnings."),
2118 text->n_warnings - MAX_TEXT_WARNINGS);
2119 pool_free (r->pool, ss_data (text->buffer));
2122 /* Reads a variable=value pair from TEXT.
2123 Looks up the variable in DICT and stores it into *VAR.
2124 Stores a null-terminated value into *VALUE. */
2126 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2127 struct text_record *text,
2128 struct variable **var, char **value)
2132 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2135 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2139 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2140 ss_buffer ("\t\0", 2));
2148 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2149 struct text_record *text, struct substring delimiters,
2150 struct variable **var)
2154 name = text_get_token (text, delimiters, NULL);
2158 *var = dict_lookup_var (dict, name);
2162 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2169 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2170 struct text_record *text, struct substring delimiters,
2171 struct variable **var)
2173 char *short_name = text_get_token (text, delimiters, NULL);
2174 if (short_name == NULL)
2177 *var = lookup_var_by_short_name (dict, short_name);
2179 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2184 /* Displays a warning for the current file position, limiting the
2185 number to MAX_TEXT_WARNINGS for TEXT. */
2187 text_warn (struct sfm_reader *r, struct text_record *text,
2188 const char *format, ...)
2190 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2194 va_start (args, format);
2195 sys_msg (r, MW, format, args);
2201 text_get_token (struct text_record *text, struct substring delimiters,
2204 struct substring token;
2207 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2210 end = &ss_data (token)[ss_length (token)];
2211 if (delimiter != NULL)
2214 return ss_data (token);
2217 /* Reads a integer value expressed in decimal, then a space, then a string that
2218 consists of exactly as many bytes as specified by the integer, then a space,
2219 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2220 buffer (so the caller should not free the string). */
2222 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2232 int c = text->buffer.string[text->pos];
2233 if (c < '0' || c > '9')
2235 n = (n * 10) + (c - '0');
2238 if (start == text->pos)
2240 sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
2245 if (!text_match (text, ' '))
2247 sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
2252 if (text->pos + n > text->buffer.length)
2254 sys_warn (r, _("%zu-byte string starting at offset %zu "
2255 "exceeds record length %zu."),
2256 n, text->pos, text->buffer.length);
2260 s = &text->buffer.string[text->pos];
2264 _("Expecting space at offset %zu following %zu-byte string."),
2274 text_match (struct text_record *text, char c)
2276 if (text->buffer.string[text->pos] == c)
2285 /* Returns the current byte offset inside the TEXT's string. */
2287 text_pos (const struct text_record *text)
2294 /* Displays a corruption message. */
2296 sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
2301 ds_init_empty (&text);
2302 ds_put_format (&text, "`%s' near offset 0x%llx: ",
2303 fh_get_file_name (r->fh), (long long int) ftello (r->file));
2304 ds_put_vformat (&text, format, args);
2306 m.category = msg_class_to_category (class);
2307 m.severity = msg_class_to_severity (class);
2308 m.where.file_name = NULL;
2309 m.where.line_number = 0;
2310 m.where.first_column = 0;
2311 m.where.last_column = 0;
2312 m.text = ds_cstr (&text);
2317 /* Displays a warning for the current file position. */
2319 sys_warn (struct sfm_reader *r, const char *format, ...)
2323 va_start (args, format);
2324 sys_msg (r, MW, format, args);
2328 /* Displays an error for the current file position,
2329 marks it as in an error state,
2330 and aborts reading it using longjmp. */
2332 sys_error (struct sfm_reader *r, const char *format, ...)
2336 va_start (args, format);
2337 sys_msg (r, ME, format, args);
2341 longjmp (r->bail_out, 1);
2344 /* Reads BYTE_CNT bytes into BUF.
2345 Returns true if exactly BYTE_CNT bytes are successfully read.
2346 Aborts if an I/O error or a partial read occurs.
2347 If EOF_IS_OK, then an immediate end-of-file causes false to be
2348 returned; otherwise, immediate end-of-file causes an abort
2351 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2352 void *buf, size_t byte_cnt)
2354 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2355 if (bytes_read == byte_cnt)
2357 else if (ferror (r->file))
2358 sys_error (r, _("System error: %s."), strerror (errno));
2359 else if (!eof_is_ok || bytes_read != 0)
2360 sys_error (r, _("Unexpected end of file."));
/* Reads BYTE_CNT bytes into BUF.
   Aborts upon I/O error or if end-of-file is encountered, so
   the result of the underlying read need not be checked. */
static void
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = false;

  read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
/* Reads BYTE_CNT bytes into BUF.
   Returns true if exactly BYTE_CNT bytes are successfully read.
   Returns false if an immediate end-of-file is encountered.
   Aborts if an I/O error or a partial read occurs. */
static bool
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;

  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
2383 /* Reads a 32-bit signed integer from R and returns its value in
2386 read_int (struct sfm_reader *r)
2389 read_bytes (r, integer, sizeof integer);
2390 return integer_get (r->integer_format, integer, sizeof integer);
2393 /* Reads a 64-bit floating-point number from R and returns its
2394 value in host format. */
2396 read_float (struct sfm_reader *r)
2399 read_bytes (r, number, sizeof number);
2400 return float_get_double (r->float_format, number);
/* Reads exactly SIZE - 1 bytes into BUFFER
   and stores a null byte into BUFFER[SIZE - 1].
   SIZE must be positive. */
static void
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  size_t n_bytes;

  assert (size > 0);
  n_bytes = size - 1;
  read_bytes (r, buffer, n_bytes);
  buffer[n_bytes] = '\0';
}
/* Skips BYTES bytes forward in R by reading them, at most 1024
   at a time, into a scratch buffer. */
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t left;
  size_t chunk;

  for (left = bytes; left > 0; left -= chunk)
    {
      chunk = MIN (sizeof scratch, left);
      read_bytes (r, scratch, chunk);
    }
}
2426 static const struct casereader_class sys_file_casereader_class =
2428 sys_file_casereader_read,
2429 sys_file_casereader_destroy,