1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
95 int missing_value_code;
101 struct sfm_value_label
107 struct sfm_value_label_record
110 struct sfm_value_label *labels;
117 struct sfm_document_record
124 struct sfm_extension_record
126 off_t pos; /* Starting offset in file. */
127 size_t size; /* Size of data elements. */
128 size_t count; /* Number of data elements. */
129 void *data; /* Contents. */
132 /* System file reader. */
135 /* Resource tracking. */
136 struct pool *pool; /* All system file state. */
137 jmp_buf bail_out; /* longjmp() target for error handling. */
140 struct file_handle *fh; /* File handle. */
141 struct fh_lock *lock; /* Mutual exclusion for file handle. */
142 FILE *file; /* File stream. */
143 off_t pos; /* Position in file. */
144 bool error; /* I/O or corruption error? */
145 struct caseproto *proto; /* Format of output cases. */
148 enum integer_format integer_format; /* On-disk integer format. */
149 enum float_format float_format; /* On-disk floating point format. */
150 struct sfm_var *sfm_vars; /* Variables. */
151 size_t sfm_var_cnt; /* Number of variables. */
152 casenumber case_cnt; /* Number of cases */
153 const char *encoding; /* String encoding. */
156 bool compressed; /* File is compressed? */
157 double bias; /* Compression bias, usually 100.0. */
158 uint8_t opcodes[8]; /* Current block of opcodes. */
159 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
160 bool corruption_warning; /* Warned about possible corruption? */
163 static const struct casereader_class sys_file_casereader_class;
165 static bool close_reader (struct sfm_reader *);
167 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
168 const struct sfm_var_record *,
171 static void sys_msg (struct sfm_reader *r, off_t, int class,
172 const char *format, va_list args)
173 PRINTF_FORMAT (4, 0);
174 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
175 PRINTF_FORMAT (3, 4);
176 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
180 static void read_bytes (struct sfm_reader *, void *, size_t);
181 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
182 static int read_int (struct sfm_reader *);
183 static double read_float (struct sfm_reader *);
184 static void read_string (struct sfm_reader *, char *, size_t);
185 static void skip_bytes (struct sfm_reader *, size_t);
187 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
188 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
190 static void read_variable_record (struct sfm_reader *,
191 struct sfm_var_record *);
192 static void read_value_label_record (struct sfm_reader *,
193 struct sfm_value_label_record *,
195 static struct sfm_document_record *read_document_record (struct sfm_reader *);
196 static struct sfm_extension_record *read_extension_record (
197 struct sfm_reader *, int subtype);
198 static void skip_extension_record (struct sfm_reader *, int subtype);
200 static const char *choose_encoding (
202 const struct sfm_extension_record *ext_integer,
203 const struct sfm_extension_record *ext_encoding);
205 static struct text_record *open_text_record (
206 struct sfm_reader *, const struct sfm_extension_record *);
207 static void close_text_record (struct sfm_reader *,
208 struct text_record *);
209 static bool read_variable_to_value_pair (struct sfm_reader *,
211 struct text_record *,
212 struct variable **var, char **value);
213 static void text_warn (struct sfm_reader *r, struct text_record *text,
214 const char *format, ...)
215 PRINTF_FORMAT (3, 4);
216 static char *text_get_token (struct text_record *,
217 struct substring delimiters, char *delimiter);
218 static bool text_match (struct text_record *, char c);
219 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
220 struct text_record *,
221 struct substring delimiters,
223 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
224 struct text_record *,
225 struct substring delimiters,
227 static const char *text_parse_counted_string (struct sfm_reader *,
228 struct text_record *);
229 static size_t text_pos (const struct text_record *);
231 static bool close_reader (struct sfm_reader *r);
233 /* Dictionary reader. */
241 static void read_header (struct sfm_reader *, int *weight_idx,
242 int *claimed_oct_cnt, struct sfm_read_info *,
244 static void parse_file_label (struct sfm_reader *, const char *file_label,
245 struct dictionary *);
246 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
247 struct sfm_var_record *, size_t n);
248 static void parse_format_spec (struct sfm_reader *, off_t pos,
249 unsigned int format, enum which_format,
250 struct variable *, int *format_warning_cnt);
251 static void parse_document (struct dictionary *, struct sfm_document_record *);
252 static void parse_display_parameters (struct sfm_reader *,
253 const struct sfm_extension_record *,
254 struct dictionary *);
255 static void parse_machine_integer_info (struct sfm_reader *,
256 const struct sfm_extension_record *,
257 struct sfm_read_info *);
258 static void parse_machine_float_info (struct sfm_reader *,
259 const struct sfm_extension_record *);
260 static void parse_mrsets (struct sfm_reader *,
261 const struct sfm_extension_record *,
262 struct dictionary *);
263 static void parse_long_var_name_map (struct sfm_reader *,
264 const struct sfm_extension_record *,
265 struct dictionary *);
266 static void parse_long_string_map (struct sfm_reader *,
267 const struct sfm_extension_record *,
268 struct dictionary *);
269 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
270 const struct sfm_var_record *,
272 const struct sfm_value_label_record *);
273 static void parse_data_file_attributes (struct sfm_reader *,
274 const struct sfm_extension_record *,
275 struct dictionary *);
276 static void parse_variable_attributes (struct sfm_reader *,
277 const struct sfm_extension_record *,
278 struct dictionary *);
279 static void parse_long_string_value_labels (struct sfm_reader *,
280 const struct sfm_extension_record *,
281 struct dictionary *);
283 /* Opens the system file designated by file handle FH for
284 reading. Reads the system file's dictionary into *DICT.
285 If INFO is non-null, then it receives additional info about the
288 sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
289 struct sfm_read_info *volatile info)
291 struct sfm_reader *volatile r = NULL;
292 struct sfm_read_info local_info;
294 struct sfm_var_record *vars;
295 size_t n_vars, allocated_vars;
297 struct sfm_value_label_record *labels;
298 size_t n_labels, allocated_labels;
300 struct sfm_document_record *document;
302 struct sfm_extension_record *extensions[32];
308 struct dictionary *dict = NULL;
311 /* Create and initialize reader. */
312 r = pool_create_container (struct sfm_reader, pool);
318 r->opcode_idx = sizeof r->opcodes;
319 r->corruption_warning = false;
321 /* TRANSLATORS: this fragment will be interpolated into
322 messages in fh_lock() that identify types of files. */
323 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
327 r->file = fn_open (fh_get_file_name (fh), "rb");
330 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
331 fh_get_file_name (r->fh), strerror (errno));
335 /* Initialize info. */
338 memset (info, 0, sizeof *info);
340 if (setjmp (r->bail_out))
344 read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
347 n_vars = allocated_vars = 0;
350 n_labels = allocated_labels = 0;
354 memset (extensions, 0, sizeof extensions);
364 read_int (r); /* Skip filler. */
371 if (n_vars >= allocated_vars)
372 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
374 read_variable_record (r, &vars[n_vars++]);
378 if (n_labels >= allocated_labels)
379 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
381 read_value_label_record (r, &labels[n_labels++], n_vars);
385 /* A Type 4 record is always immediately after a type 3 record,
386 so the code for type 3 records reads the type 4 record too. */
387 sys_error (r, r->pos, _("Misplaced type 4 record."));
390 if (document != NULL)
391 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
392 document = read_document_record (r);
396 subtype = read_int (r);
397 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
400 _("Unrecognized record type 7, subtype %d. Please "
401 "send a copy of this file, and the syntax which "
402 "created it to %s."),
403 subtype, PACKAGE_BUGREPORT);
404 skip_extension_record (r, subtype);
406 else if (extensions[subtype] != NULL)
409 _("Record type 7, subtype %d found here has the same "
410 "type as the record found near offset 0x%llx. "
411 "Please send a copy of this file, and the syntax "
412 "which created it to %s."),
413 subtype, (long long int) extensions[subtype]->pos,
415 skip_extension_record (r, subtype);
418 extensions[subtype] = read_extension_record (r, subtype);
422 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
427 /* Now actually parse what we read.
429 First, figure out the correct character encoding, because this determines
430 how the rest of the header data is to be interpreted. */
431 dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
432 extensions[EXT_ENCODING]));
434 /* These records don't use variables at all. */
435 if (document != NULL)
436 parse_document (dict, document);
438 if (extensions[EXT_INTEGER] != NULL)
439 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
441 if (extensions[EXT_FLOAT] != NULL)
442 parse_machine_float_info (r, extensions[EXT_FLOAT]);
444 if (extensions[EXT_FILE_ATTRS] != NULL)
445 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
447 parse_file_label (r, file_label, dict);
449 /* Parse the variable records, the basis of almost everything else. */
450 parse_variable_records (r, dict, vars, n_vars);
452 /* Parse value labels and the weight variable immediately after the variable
453 records. These records use indexes into var_recs[], so we must parse them
454 before those indexes become invalidated by very long string variables. */
455 for (i = 0; i < n_labels; i++)
456 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
459 struct variable *weight_var;
461 weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
462 if (var_is_numeric (weight_var))
463 dict_set_weight (dict, weight_var);
465 sys_error (r, -1, _("Weighting variable must be numeric "
466 "(not string variable `%s')."),
467 var_get_name (weight_var));
470 if (extensions[EXT_DISPLAY] != NULL)
471 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
473 /* The following records use short names, so they need to be parsed before
474 parse_long_var_name_map() changes short names to long names. */
475 if (extensions[EXT_MRSETS] != NULL)
476 parse_mrsets (r, extensions[EXT_MRSETS], dict);
478 if (extensions[EXT_MRSETS2] != NULL)
479 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
481 if (extensions[EXT_LONG_STRINGS] != NULL)
482 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
484 /* Now rename variables to their long names. */
485 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
487 /* The following records use long names, so they need to follow renaming. */
488 if (extensions[EXT_VAR_ATTRS] != NULL)
489 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
491 if (extensions[EXT_LONG_LABELS] != NULL)
492 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
494 /* Warn if the actual amount of data per case differs from the
495 amount that the header claims. SPSS version 13 gets this
496 wrong when very long strings are involved, so don't warn in
498 if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
499 && info->version_major != 13)
500 sys_warn (r, -1, _("File header claims %d variable positions but "
501 "%d were read from file."),
502 claimed_oct_cnt, n_vars);
504 /* Create an index of dictionary variable widths for
505 sfm_read_case to use. We cannot use the `struct variable's
506 from the dictionary we created, because the caller owns the
507 dictionary and may destroy or modify its variables. */
508 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
509 pool_register (r->pool, free, r->sfm_vars);
510 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
513 return casereader_create_sequential
515 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
516 &sys_file_casereader_class, r);
525 /* Closes a system file after we're done with it.
526 Returns true if an I/O error has occurred on READER, false
529 close_reader (struct sfm_reader *r)
538 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
540 msg (ME, _("Error closing system file `%s': %s."),
541 fh_get_file_name (r->fh), strerror (errno));
551 pool_destroy (r->pool);
556 /* Destroys READER. */
558 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
560 struct sfm_reader *r = r_;
564 /* Returns true if FILE is an SPSS system file,
567 sfm_detect (FILE *file)
571 if (fread (rec_type, 4, 1, file) != 1)
575 return !strcmp ("$FL2", rec_type);
578 /* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
579 system file is unweighted, or to the value index of the weight variable
580 otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
581 case that the file claims to have (although it is not always correct).
582 Initializes INFO with header information. Stores the file label as a string
583 in dictionary encoding into *FILE_LABELP. */
585 read_header (struct sfm_reader *r, int *weight_idx,
586 int *claimed_oct_cnt, struct sfm_read_info *info,
590 char eye_catcher[61];
591 uint8_t raw_layout_code[4];
593 char creation_date[10];
594 char creation_time[9];
596 struct substring product;
598 read_string (r, rec_type, sizeof rec_type);
599 read_string (r, eye_catcher, sizeof eye_catcher);
601 if (strcmp ("$FL2", rec_type) != 0)
602 sys_error (r, 0, _("This is not an SPSS system file."));
604 /* Identify integer format. */
605 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
606 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
608 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
610 || (r->integer_format != INTEGER_MSB_FIRST
611 && r->integer_format != INTEGER_LSB_FIRST))
612 sys_error (r, 64, _("This is not an SPSS system file."));
614 *claimed_oct_cnt = read_int (r);
615 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
616 *claimed_oct_cnt = -1;
618 r->compressed = read_int (r) != 0;
620 *weight_idx = read_int (r);
622 r->case_cnt = read_int (r);
623 if ( r->case_cnt > INT_MAX / 2)
626 /* Identify floating-point format and obtain compression bias. */
627 read_bytes (r, raw_bias, sizeof raw_bias);
628 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
630 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
632 if (memcmp (raw_bias, zero_bias, 8))
633 sys_warn (r, r->pos - 8,
634 _("Compression bias is not the usual "
635 "value of 100, or system file uses unrecognized "
636 "floating-point format."));
639 /* Some software is known to write all-zeros to this
640 field. Such software also writes floating-point
641 numbers in the format that we expect by default
642 (it seems that all software most likely does, in
643 reality), so don't warn in this case. */
646 if (r->integer_format == INTEGER_MSB_FIRST)
647 r->float_format = FLOAT_IEEE_DOUBLE_BE;
649 r->float_format = FLOAT_IEEE_DOUBLE_LE;
651 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
653 read_string (r, creation_date, sizeof creation_date);
654 read_string (r, creation_time, sizeof creation_time);
655 read_string (r, file_label, sizeof file_label);
658 strcpy (info->creation_date, creation_date);
659 strcpy (info->creation_time, creation_time);
660 info->integer_format = r->integer_format;
661 info->float_format = r->float_format;
662 info->compressed = r->compressed;
663 info->case_cnt = r->case_cnt;
665 product = ss_cstr (eye_catcher);
666 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
667 ss_trim (&product, ss_cstr (" "));
668 str_copy_buf_trunc (info->product, sizeof info->product,
669 ss_data (product), ss_length (product));
671 *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
674 /* Reads a variable (type 2) record from R into RECORD. */
676 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
678 int has_variable_label;
680 memset (record, 0, sizeof *record);
682 record->pos = r->pos;
683 record->width = read_int (r);
684 has_variable_label = read_int (r);
685 record->missing_value_code = read_int (r);
686 record->print_format = read_int (r);
687 record->write_format = read_int (r);
688 read_bytes (r, record->name, sizeof record->name);
690 if (has_variable_label == 1)
692 enum { MAX_LABEL_LEN = 255 };
693 size_t len, read_len;
697 /* Read up to MAX_LABEL_LEN bytes of label. */
698 read_len = MIN (MAX_LABEL_LEN, len);
699 record->label = xmalloc (read_len + 1);
700 read_string (r, record->label, read_len + 1);
702 /* Skip unread label bytes. */
703 skip_bytes (r, len - read_len);
705 /* Skip label padding up to multiple of 4 bytes. */
706 skip_bytes (r, ROUND_UP (len, 4) - len);
708 else if (has_variable_label != 0)
709 sys_error (r, record->pos,
710 _("Variable label indicator field is not 0 or 1."));
712 /* Set missing values. */
713 if (record->missing_value_code != 0)
715 int code = record->missing_value_code;
716 if (record->width == 0)
718 if (code < -3 || code > 3 || code == -1)
719 sys_error (r, record->pos,
720 _("Numeric missing value indicator field is not "
721 "-3, -2, 0, 1, 2, or 3."));
725 if (code < 1 || code > 3)
726 sys_error (r, record->pos,
727 _("String missing value indicator field is not "
731 read_bytes (r, record->missing, 8 * abs (code));
735 /* Reads value labels from R into RECORD. */
737 read_value_label_record (struct sfm_reader *r,
738 struct sfm_value_label_record *record,
743 /* Read type 3 record. */
744 record->pos = r->pos;
745 record->n_labels = read_int (r);
746 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
747 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
749 record->labels = pool_nmalloc (r->pool, record->n_labels,
750 sizeof *record->labels);
751 for (i = 0; i < record->n_labels; i++)
753 struct sfm_value_label *label = &record->labels[i];
754 unsigned char label_len;
757 read_bytes (r, label->value, sizeof label->value);
759 /* Read label length. */
760 read_bytes (r, &label_len, sizeof label_len);
761 padded_len = ROUND_UP (label_len + 1, 8);
763 /* Read label, padding. */
764 label->label = pool_malloc (r->pool, padded_len + 1);
765 read_bytes (r, label->label, padded_len - 1);
766 label->label[label_len] = '\0';
769 /* Read record type of type 4 record. */
770 if (read_int (r) != 4)
771 sys_error (r, r->pos - 4,
772 _("Variable index record (type 4) does not immediately "
773 "follow value label record (type 3) as it should."));
775 /* Read number of variables associated with value label from type 4
777 record->n_vars = read_int (r);
778 if (record->n_vars < 1 || record->n_vars > n_vars)
779 sys_error (r, r->pos - 4,
780 _("Number of variables associated with a value label (%d) "
781 "is not between 1 and the number of variables (%zu)."),
782 record->n_vars, n_vars);
783 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
784 for (i = 0; i < record->n_vars; i++)
785 record->vars[i] = read_int (r);
788 /* Reads a document record from R and returns it. */
789 static struct sfm_document_record *
790 read_document_record (struct sfm_reader *r)
792 struct sfm_document_record *record;
795 record = pool_malloc (r->pool, sizeof *record);
796 record->pos = r->pos;
798 n_lines = read_int (r);
799 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
800 sys_error (r, record->pos,
801 _("Number of document lines (%d) "
802 "must be greater than 0 and less than %d."),
803 n_lines, INT_MAX / DOC_LINE_LENGTH);
805 record->n_lines = n_lines;
806 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
807 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
813 read_extension_record_header (struct sfm_reader *r, int subtype,
814 struct sfm_extension_record *record)
816 record->pos = r->pos;
817 record->size = read_int (r);
818 record->count = read_int (r);
820 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
821 allows an extra byte for a null terminator, used by some
822 extension processing routines. */
823 if (record->size != 0
824 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
825 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
828 /* Reads an extension record from R into RECORD. */
829 static struct sfm_extension_record *
830 read_extension_record (struct sfm_reader *r, int subtype)
832 struct extension_record_type
839 static const struct extension_record_type types[] =
841 /* Implemented record types. */
842 { EXT_INTEGER, 4, 8 },
844 { EXT_MRSETS, 1, 0 },
845 { EXT_DISPLAY, 4, 0 },
846 { EXT_LONG_NAMES, 1, 0 },
847 { EXT_LONG_STRINGS, 1, 0 },
848 { EXT_NCASES, 8, 2 },
849 { EXT_FILE_ATTRS, 1, 0 },
850 { EXT_VAR_ATTRS, 1, 0 },
851 { EXT_MRSETS2, 1, 0 },
852 { EXT_ENCODING, 1, 0 },
853 { EXT_LONG_LABELS, 1, 0 },
855 /* Ignored record types. */
856 { EXT_VAR_SETS, 0, 0 },
858 { EXT_DATA_ENTRY, 0, 0 },
861 const struct extension_record_type *type;
862 struct sfm_extension_record *record;
865 record = pool_malloc (r->pool, sizeof *record);
866 read_extension_record_header (r, subtype, record);
867 n_bytes = record->count * record->size;
869 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
870 if (subtype == type->subtype)
872 if (type->size > 0 && record->size != type->size)
873 sys_warn (r, record->pos,
874 _("Record type 7, subtype %d has bad size %zu "
875 "(expected %d)."), subtype, record->size, type->size);
876 else if (type->count > 0 && record->count != type->count)
877 sys_warn (r, record->pos,
878 _("Record type 7, subtype %d has bad count %zu "
879 "(expected %d)."), subtype, record->count, type->count);
880 else if (type->count == 0 && type->size == 0)
882 /* Ignore this record. */
886 char *data = pool_malloc (r->pool, n_bytes + 1);
887 data[n_bytes] = '\0';
890 read_bytes (r, record->data, n_bytes);
897 sys_warn (r, record->pos,
898 _("Unrecognized record type 7, subtype %d. Please send a "
899 "copy of this file, and the syntax which created it to %s."),
900 subtype, PACKAGE_BUGREPORT);
903 skip_bytes (r, n_bytes);
908 skip_extension_record (struct sfm_reader *r, int subtype)
910 struct sfm_extension_record record;
912 read_extension_record_header (r, subtype, &record);
913 skip_bytes (r, record.count * record.size);
917 parse_file_label (struct sfm_reader *r, const char *file_label,
918 struct dictionary *dict)
920 char *utf8_file_label;
921 size_t file_label_len;
923 utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
924 file_label, -1, r->pool);
925 file_label_len = strlen (utf8_file_label);
926 while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
928 utf8_file_label[file_label_len] = '\0';
929 dict_set_label (dict, utf8_file_label);
932 /* Reads a variable (type 2) record from R and adds the
933 corresponding variable to DICT.
934 Also skips past additional variable records for long string
937 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
938 struct sfm_var_record *var_recs, size_t n_var_recs)
940 const char *dict_encoding = dict_get_encoding (dict);
941 struct sfm_var_record *rec;
944 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
946 struct variable *var;
951 name = recode_string_pool ("UTF-8", dict_encoding,
952 rec->name, 8, r->pool);
953 name[strcspn (name, " ")] = '\0';
955 if (!dict_id_is_valid (dict, name, false)
956 || name[0] == '$' || name[0] == '#')
957 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
959 if (rec->width < 0 || rec->width > 255)
960 sys_error (r, rec->pos,
961 _("Bad width %d for variable %s."), rec->width, name);
963 var = rec->var = dict_create_var (dict, name, rec->width);
965 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
967 /* Set the short name the same as the long name. */
968 var_set_short_name (var, 0, name);
970 /* Get variable label, if any. */
975 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
976 rec->label, -1, r->pool);
977 var_set_label (var, utf8_label, NULL, false);
980 /* Set missing values. */
981 if (rec->missing_value_code != 0)
983 int width = var_get_width (var);
984 struct missing_values mv;
986 mv_init_pool (r->pool, &mv, width);
987 if (var_is_numeric (var))
989 bool has_range = rec->missing_value_code < 0;
990 int n_discrete = (has_range
991 ? rec->missing_value_code == -3
992 : rec->missing_value_code);
997 double low = parse_float (r, rec->missing, 0);
998 double high = parse_float (r, rec->missing, 8);
999 mv_add_range (&mv, low, high);
1003 for (i = 0; i < n_discrete; i++)
1005 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1013 value_init_pool (r->pool, &value, width);
1014 value_set_missing (&value, width);
1015 for (i = 0; i < rec->missing_value_code; i++)
1017 uint8_t *s = value_str_rw (&value, width);
1018 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1019 mv_add_str (&mv, s);
1022 var_set_missing_values (var, &mv);
1026 parse_format_spec (r, rec->pos + 12, rec->print_format,
1027 PRINT_FORMAT, var, &n_warnings);
1028 parse_format_spec (r, rec->pos + 16, rec->write_format,
1029 WRITE_FORMAT, var, &n_warnings);
1031 /* Account for values.
1032 Skip long string continuation records, if any. */
1033 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1034 for (i = 1; i < n_values; i++)
1035 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1036 sys_error (r, rec->pos, _("Missing string continuation record."));
1041 /* Translates the format spec from sysfile format to internal
1044 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1045 enum which_format which, struct variable *v,
1048 const int max_warnings = 8;
1049 uint8_t raw_type = format >> 16;
1050 uint8_t w = format >> 8;
1056 if (!fmt_from_io (raw_type, &f.type))
1057 sys_error (r, pos, _("Unknown variable format %"PRIu8"."), raw_type);
1062 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
1067 if (which == PRINT_FORMAT)
1068 var_set_print_format (v, &f);
1070 var_set_write_format (v, &f);
1072 else if (++*n_warnings <= max_warnings)
1074 char fmt_string[FMT_STRING_LEN_MAX + 1];
1075 sys_warn (r, pos, _("%s variable %s has invalid %s format %s."),
1076 var_is_numeric (v) ? _("Numeric") : _("String"),
1078 which == PRINT_FORMAT ? _("print") : _("write"),
1079 fmt_to_string (&f, fmt_string));
1081 if (*n_warnings == max_warnings)
1082 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1087 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1091 for (p = record->documents;
1092 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1093 p += DOC_LINE_LENGTH)
1095 struct substring line;
1097 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1098 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1099 ss_rtrim (&line, ss_cstr (" "));
1100 line.string[line.length] = '\0';
1102 dict_add_document_line (dict, line.string, false);
1108 /* Parses record type 7, subtype 3. */
1110 parse_machine_integer_info (struct sfm_reader *r,
1111 const struct sfm_extension_record *record,
1112 struct sfm_read_info *info)
1114 int float_representation, expected_float_format;
1115 int integer_representation, expected_integer_format;
1117 /* Save version info. */
1118 info->version_major = parse_int (r, record->data, 0);
1119 info->version_minor = parse_int (r, record->data, 4);
1120 info->version_revision = parse_int (r, record->data, 8);
1122 /* Check floating point format. */
1123 float_representation = parse_int (r, record->data, 16);
1124 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1125 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1126 expected_float_format = 1;
1127 else if (r->float_format == FLOAT_Z_LONG)
1128 expected_float_format = 2;
1129 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1130 expected_float_format = 3;
1133 if (float_representation != expected_float_format)
1134 sys_error (r, record->pos, _("Floating-point representation indicated by "
1135 "system file (%d) differs from expected (%d)."),
1136 float_representation, expected_float_format);
1138 /* Check integer format. */
1139 integer_representation = parse_int (r, record->data, 24);
1140 if (r->integer_format == INTEGER_MSB_FIRST)
1141 expected_integer_format = 1;
1142 else if (r->integer_format == INTEGER_LSB_FIRST)
1143 expected_integer_format = 2;
1146 if (integer_representation != expected_integer_format)
1147 sys_warn (r, record->pos,
1148 _("Integer format indicated by system file (%d) "
1149 "differs from expected (%d)."),
1150 integer_representation, expected_integer_format);
1155 choose_encoding (struct sfm_reader *r,
1156 const struct sfm_extension_record *ext_integer,
1157 const struct sfm_extension_record *ext_encoding)
1159 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1162 return ext_encoding->data;
1164 /* But EXT_INTEGER is better than nothing as a fallback. */
1167 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1176 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1177 respectively. However, there are known to be many files in the wild
1178 with character code 2, yet have data which are clearly not ASCII.
1179 Therefore we ignore these values. */
1192 return pool_asprintf (r->pool, "CP%d", codepage);
1196 return locale_charset ();
1199 /* Parses record type 7, subtype 4. */
1201 parse_machine_float_info (struct sfm_reader *r,
1202 const struct sfm_extension_record *record)
1204 double sysmis = parse_float (r, record->data, 0);
1205 double highest = parse_float (r, record->data, 8);
1206 double lowest = parse_float (r, record->data, 16);
1208 if (sysmis != SYSMIS)
1209 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1212 if (highest != HIGHEST)
1213 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1214 highest, "HIGHEST");
1216 if (lowest != LOWEST)
1217 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1221 /* Parses record type 7, subtype 7 or 19. */
1223 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1224 struct dictionary *dict)
1226 struct text_record *text;
1227 struct mrset *mrset;
1229 text = open_text_record (r, record);
1232 const char *counted = NULL;
1235 struct stringi_set var_names;
1236 size_t allocated_vars;
1240 mrset = xzalloc (sizeof *mrset);
1242 name = text_get_token (text, ss_cstr ("="), NULL);
1245 mrset->name = xstrdup (name);
1247 if (mrset->name[0] != '$')
1249 sys_warn (r, record->pos,
1250 _("`%s' does not begin with `$' at UTF-8 offset %zu "
1251 "in MRSETS record."), mrset->name, text_pos (text));
1255 if (text_match (text, 'C'))
1257 mrset->type = MRSET_MC;
1258 if (!text_match (text, ' '))
1260 sys_warn (r, record->pos,
1261 _("Missing space following `%c' at UTF-8 offset %zu "
1262 "in MRSETS record."), 'C', text_pos (text));
1266 else if (text_match (text, 'D'))
1268 mrset->type = MRSET_MD;
1269 mrset->cat_source = MRSET_VARLABELS;
1271 else if (text_match (text, 'E'))
1275 mrset->type = MRSET_MD;
1276 mrset->cat_source = MRSET_COUNTEDVALUES;
1277 if (!text_match (text, ' '))
1279 sys_warn (r, record->pos,
1280 _("Missing space following `%c' at UTF-8 offset %zu "
1281 "in MRSETS record."), 'E', text_pos (text));
1285 number = text_get_token (text, ss_cstr (" "), NULL);
1286 if (!strcmp (number, "11"))
1287 mrset->label_from_var_label = true;
1288 else if (strcmp (number, "1"))
1289 sys_warn (r, record->pos,
1290 _("Unexpected label source value `%s' following `E' "
1291 "at UTF-8 offset %zu in MRSETS record."),
1292 number, text_pos (text));
1296 sys_warn (r, record->pos,
1297 _("Missing `C', `D', or `E' at UTF-8 offset %zu "
1298 "in MRSETS record."),
1303 if (mrset->type == MRSET_MD)
1305 counted = text_parse_counted_string (r, text);
1306 if (counted == NULL)
1310 label = text_parse_counted_string (r, text);
1313 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
1315 stringi_set_init (&var_names);
1320 struct variable *var;
1321 const char *var_name;
1323 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1324 if (var_name == NULL)
1326 sys_warn (r, record->pos,
1327 _("Missing new-line parsing variable names "
1328 "at UTF-8 offset %zu in MRSETS record."),
1333 var = dict_lookup_var (dict, var_name);
1336 if (!stringi_set_insert (&var_names, var_name))
1338 sys_warn (r, record->pos,
1339 _("Duplicate variable name %s "
1340 "at UTF-8 offset %zu in MRSETS record."),
1341 var_name, text_pos (text));
1345 if (mrset->label == NULL && mrset->label_from_var_label
1346 && var_has_label (var))
1347 mrset->label = xstrdup (var_get_label (var));
1350 && var_get_type (var) != var_get_type (mrset->vars[0]))
1352 sys_warn (r, record->pos,
1353 _("MRSET %s contains both string and "
1354 "numeric variables."), name);
1357 width = MIN (width, var_get_width (var));
1359 if (mrset->n_vars >= allocated_vars)
1360 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1361 sizeof *mrset->vars);
1362 mrset->vars[mrset->n_vars++] = var;
1364 while (delimiter != '\n');
1366 if (mrset->n_vars < 2)
1368 sys_warn (r, record->pos,
1369 _("MRSET %s has only %zu variables."), mrset->name,
1371 mrset_destroy (mrset);
1375 if (mrset->type == MRSET_MD)
1377 mrset->width = width;
1378 value_init (&mrset->counted, width);
1380 mrset->counted.f = strtod (counted, NULL);
1382 value_copy_str_rpad (&mrset->counted, width,
1383 (const uint8_t *) counted, ' ');
1386 dict_add_mrset (dict, mrset);
1388 stringi_set_destroy (&var_names);
1390 mrset_destroy (mrset);
1391 close_text_record (r, text);
1394 /* Read record type 7, subtype 11, which specifies how variables
1395 should be displayed in GUI environments. */
1397 parse_display_parameters (struct sfm_reader *r,
1398 const struct sfm_extension_record *record,
1399 struct dictionary *dict)
1401 bool includes_width;
1402 bool warned = false;
1407 n_vars = dict_get_var_cnt (dict);
1408 if (record->count == 3 * n_vars)
1409 includes_width = true;
1410 else if (record->count == 2 * n_vars)
1411 includes_width = false;
1414 sys_warn (r, record->pos,
1415 _("Extension 11 has bad count %zu (for %zu variables)."),
1416 record->count, n_vars);
1421 for (i = 0; i < n_vars; ++i)
1423 struct variable *v = dict_get_var (dict, i);
1424 int measure, width, align;
1426 measure = parse_int (r, record->data, ofs);
1431 width = parse_int (r, record->data, ofs);
1437 align = parse_int (r, record->data, ofs);
1440 /* SPSS 14 sometimes seems to set string variables' measure
1442 if (0 == measure && var_is_alpha (v))
1445 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1448 sys_warn (r, record->pos,
1449 _("Invalid variable display parameters for variable "
1450 "%zu (%s). Default parameters substituted."),
1451 i, var_get_name (v));
1456 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1457 : measure == 2 ? MEASURE_ORDINAL
1459 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1460 : align == 1 ? ALIGN_RIGHT
1463 /* Older versions (SPSS 9.0) sometimes set the display
1464 width to zero. This causes confusion in the GUI, so
1465 only set the width if it is nonzero. */
1467 var_set_display_width (v, width);
1472 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1473 const char *new_name)
1475 size_t n_short_names;
1479 /* Renaming a variable may clear its short names, but we
1480 want to retain them, so we save them and re-set them
1482 n_short_names = var_get_short_name_cnt (var);
1483 short_names = xnmalloc (n_short_names, sizeof *short_names);
1484 for (i = 0; i < n_short_names; i++)
1486 const char *s = var_get_short_name (var, i);
1487 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1490 /* Set long name. */
1491 dict_rename_var (dict, var, new_name);
1493 /* Restore short names. */
1494 for (i = 0; i < n_short_names; i++)
1496 var_set_short_name (var, i, short_names[i]);
1497 free (short_names[i]);
1502 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1503 to each short name. Modifies variable names in DICT accordingly. */
1505 parse_long_var_name_map (struct sfm_reader *r,
1506 const struct sfm_extension_record *record,
1507 struct dictionary *dict)
1509 struct text_record *text;
1510 struct variable *var;
1515 /* Convert variable names to lowercase. */
1518 for (i = 0; i < dict_get_var_cnt (dict); i++)
1520 struct variable *var = dict_get_var (dict, i);
1523 new_name = xstrdup (var_get_name (var));
1524 str_lowercase (new_name);
1526 rename_var_and_save_short_names (dict, var, new_name);
1534 /* Rename each of the variables, one by one. (In a correctly constructed
1535 system file, this cannot create any intermediate duplicate variable names,
1536 because all of the new variable names are longer than any of the old
1537 variable names and thus there cannot be any overlaps.) */
1538 text = open_text_record (r, record);
1539 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1541 /* Validate long name. */
1542 /* XXX need to reencode name to UTF-8 */
1543 if (!dict_id_is_valid (dict, long_name, false))
1545 sys_warn (r, record->pos,
1546 _("Long variable mapping from %s to invalid "
1547 "variable name `%s'."),
1548 var_get_name (var), long_name);
1552 /* Identify any duplicates. */
1553 if (strcasecmp (var_get_short_name (var, 0), long_name)
1554 && dict_lookup_var (dict, long_name) != NULL)
1556 sys_warn (r, record->pos,
1557 _("Duplicate long variable name `%s'."), long_name);
1561 rename_var_and_save_short_names (dict, var, long_name);
1563 close_text_record (r, text);
1566 /* Reads record type 7, subtype 14, which gives the real length
1567 of each very long string. Rearranges DICT accordingly. */
1569 parse_long_string_map (struct sfm_reader *r,
1570 const struct sfm_extension_record *record,
1571 struct dictionary *dict)
1573 struct text_record *text;
1574 struct variable *var;
1577 text = open_text_record (r, record);
1578 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1580 size_t idx = var_get_dict_index (var);
1586 length = strtol (length_s, NULL, 10);
1587 if (length < 1 || length > MAX_STRING)
1589 sys_warn (r, record->pos,
1590 _("%s listed as string of invalid length %s "
1591 "in very long string record."),
1592 var_get_name (var), length_s);
1596 /* Check segments. */
1597 segment_cnt = sfm_width_to_segments (length);
1598 if (segment_cnt == 1)
1600 sys_warn (r, record->pos,
1601 _("%s listed in very long string record with width %s, "
1602 "which requires only one segment."),
1603 var_get_name (var), length_s);
1606 if (idx + segment_cnt > dict_get_var_cnt (dict))
1607 sys_error (r, record->pos,
1608 _("Very long string %s overflows dictionary."),
1609 var_get_name (var));
1611 /* Get the short names from the segments and check their
1613 for (i = 0; i < segment_cnt; i++)
1615 struct variable *seg = dict_get_var (dict, idx + i);
1616 int alloc_width = sfm_segment_alloc_width (length, i);
1617 int width = var_get_width (seg);
1620 var_set_short_name (var, i, var_get_short_name (seg, 0));
1621 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1622 sys_error (r, record->pos,
1623 _("Very long string with width %ld has segment %d "
1624 "of width %d (expected %d)."),
1625 length, i, width, alloc_width);
1627 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1628 var_set_width (var, length);
1630 close_text_record (r, text);
1631 dict_compact_values (dict);
1635 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1636 const struct sfm_var_record *var_recs, size_t n_var_recs,
1637 const struct sfm_value_label_record *record)
1639 struct variable **vars;
1643 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1644 for (i = 0; i < record->n_labels; i++)
1645 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1646 record->labels[i].label, -1,
1649 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1650 for (i = 0; i < record->n_vars; i++)
1651 vars[i] = lookup_var_by_index (r, record->pos,
1652 var_recs, n_var_recs, record->vars[i]);
1654 for (i = 1; i < record->n_vars; i++)
1655 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1656 sys_error (r, record->pos,
1657 _("Variables associated with value label are not all of "
1658 "identical type. Variable %s is %s, but variable "
1660 var_get_name (vars[0]),
1661 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1662 var_get_name (vars[i]),
1663 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1665 for (i = 0; i < record->n_vars; i++)
1667 struct variable *var = vars[i];
1671 width = var_get_width (var);
1673 sys_error (r, record->pos,
1674 _("Value labels may not be added to long string "
1675 "variables (e.g. %s) using records types 3 and 4."),
1676 var_get_name (var));
1678 for (j = 0; j < record->n_labels; j++)
1680 struct sfm_value_label *label = &record->labels[j];
1683 value_init (&value, width);
1685 value.f = parse_float (r, label->value, 0);
1687 memcpy (value_str_rw (&value, width), label->value, width);
1689 if (!var_add_value_label (var, &value, utf8_labels[j]))
1691 if (var_is_numeric (var))
1692 sys_warn (r, record->pos,
1693 _("Duplicate value label for %g on %s."),
1694 value.f, var_get_name (var));
1696 sys_warn (r, record->pos,
1697 _("Duplicate value label for `%.*s' on %s."),
1698 width, value_str (&value, width),
1699 var_get_name (var));
1702 value_destroy (&value, width);
1706 pool_free (r->pool, vars);
1707 for (i = 0; i < record->n_labels; i++)
1708 pool_free (r->pool, utf8_labels[i]);
1709 pool_free (r->pool, utf8_labels);
1712 static struct variable *
1713 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1714 const struct sfm_var_record *var_recs, size_t n_var_recs,
1717 const struct sfm_var_record *rec;
1719 if (idx < 1 || idx > n_var_recs)
1721 sys_error (r, offset,
1722 _("Variable index %d not in valid range 1...%d."),
1727 rec = &var_recs[idx - 1];
1728 if (rec->var == NULL)
1730 sys_error (r, offset,
1731 _("Variable index %d refers to long string continuation."),
1739 /* Parses a set of custom attributes from TEXT into ATTRS.
1740 ATTRS may be a null pointer, in which case the attributes are
1741 read but discarded. */
1743 parse_attributes (struct sfm_reader *r, struct text_record *text,
1744 struct attrset *attrs)
1748 struct attribute *attr;
1752 /* Parse the key. */
1753 key = text_get_token (text, ss_cstr ("("), NULL);
1757 attr = attribute_create (key);
1758 for (index = 1; ; index++)
1760 /* Parse the value. */
1764 value = text_get_token (text, ss_cstr ("\n"), NULL);
1767 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1772 length = strlen (value);
1773 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1775 value[length - 1] = '\0';
1776 attribute_add_value (attr, value + 1);
1781 _("Attribute value %s[%d] is not quoted: %s."),
1783 attribute_add_value (attr, value);
1786 /* Was this the last value for this attribute? */
1787 if (text_match (text, ')'))
1791 attrset_add (attrs, attr);
1793 attribute_destroy (attr);
1795 while (!text_match (text, '/'));
1798 /* Reads record type 7, subtype 17, which lists custom
1799 attributes on the data file. */
1801 parse_data_file_attributes (struct sfm_reader *r,
1802 const struct sfm_extension_record *record,
1803 struct dictionary *dict)
1805 struct text_record *text = open_text_record (r, record);
1806 parse_attributes (r, text, dict_get_attributes (dict));
1807 close_text_record (r, text);
1810 /* Parses record type 7, subtype 18, which lists custom
1811 attributes on individual variables. */
1813 parse_variable_attributes (struct sfm_reader *r,
1814 const struct sfm_extension_record *record,
1815 struct dictionary *dict)
1817 struct text_record *text;
1818 struct variable *var;
1820 text = open_text_record (r, record);
1821 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1822 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1823 close_text_record (r, text);
1827 check_overflow (struct sfm_reader *r,
1828 const struct sfm_extension_record *record,
1829 size_t ofs, size_t length)
1831 size_t end = record->size * record->count;
1832 if (length >= end || ofs + length > end)
1833 sys_error (r, record->pos + end,
1834 _("Long string value label record ends unexpectedly."));
1838 parse_long_string_value_labels (struct sfm_reader *r,
1839 const struct sfm_extension_record *record,
1840 struct dictionary *dict)
1842 const char *dict_encoding = dict_get_encoding (dict);
1843 size_t end = record->size * record->count;
1850 struct variable *var;
1855 /* Parse variable name length. */
1856 check_overflow (r, record, ofs, 4);
1857 var_name_len = parse_int (r, record->data, ofs);
1860 /* Parse variable name, width, and number of labels. */
1861 check_overflow (r, record, ofs, var_name_len + 8);
1862 var_name = recode_string_pool ("UTF-8", dict_encoding,
1863 (const char *) record->data + ofs,
1864 var_name_len, r->pool);
1865 width = parse_int (r, record->data, ofs + var_name_len);
1866 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1867 ofs += var_name_len + 8;
1869 /* Look up 'var' and validate. */
1870 var = dict_lookup_var (dict, var_name);
1872 sys_warn (r, record->pos + ofs,
1873 _("Ignoring long string value record for "
1874 "unknown variable %s."), var_name);
1875 else if (var_is_numeric (var))
1877 sys_warn (r, record->pos + ofs,
1878 _("Ignoring long string value record for "
1879 "numeric variable %s."), var_name);
1882 else if (width != var_get_width (var))
1884 sys_warn (r, record->pos + ofs,
1885 _("Ignoring long string value record for variable %s "
1886 "because the record's width (%d) does not match the "
1887 "variable's width (%d)."),
1888 var_name, width, var_get_width (var));
1893 value_init_pool (r->pool, &value, width);
1894 for (i = 0; i < n_labels; i++)
1896 size_t value_length, label_length;
1897 bool skip = var == NULL;
1899 /* Parse value length. */
1900 check_overflow (r, record, ofs, 4);
1901 value_length = parse_int (r, record->data, ofs);
1905 check_overflow (r, record, ofs, value_length);
1908 if (value_length == width)
1909 memcpy (value_str_rw (&value, width),
1910 (const uint8_t *) record->data + ofs, width);
1913 sys_warn (r, record->pos + ofs,
1914 _("Ignoring long string value %zu for variable "
1915 "%s, with width %d, that has bad value "
1917 i, var_get_name (var), width, value_length);
1921 ofs += value_length;
1923 /* Parse label length. */
1924 check_overflow (r, record, ofs, 4);
1925 label_length = parse_int (r, record->data, ofs);
1929 check_overflow (r, record, ofs, label_length);
1934 label = recode_string_pool ("UTF-8", dict_encoding,
1935 (const char *) record->data + ofs,
1936 label_length, r->pool);
1937 if (!var_add_value_label (var, &value, label))
1938 sys_warn (r, record->pos + ofs,
1939 _("Duplicate value label for `%.*s' on %s."),
1940 width, value_str (&value, width),
1941 var_get_name (var));
1942 pool_free (r->pool, label);
1944 ofs += label_length;
1951 static void partial_record (struct sfm_reader *r)
1954 static void read_error (struct casereader *, const struct sfm_reader *);
1956 static bool read_case_number (struct sfm_reader *, double *);
1957 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1958 static int read_opcode (struct sfm_reader *);
1959 static bool read_compressed_number (struct sfm_reader *, double *);
1960 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1961 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1962 static bool skip_whole_strings (struct sfm_reader *, size_t);
1964 /* Reads and returns one case from READER's file. Returns a null
1965 pointer if not successful. */
1966 static struct ccase *
1967 sys_file_casereader_read (struct casereader *reader, void *r_)
1969 struct sfm_reader *r = r_;
1970 struct ccase *volatile c;
1976 c = case_create (r->proto);
1977 if (setjmp (r->bail_out))
1979 casereader_force_error (reader);
1984 for (i = 0; i < r->sfm_var_cnt; i++)
1986 struct sfm_var *sv = &r->sfm_vars[i];
1987 union value *v = case_data_rw_idx (c, sv->case_index);
1989 if (sv->var_width == 0)
1991 if (!read_case_number (r, &v->f))
1996 uint8_t *s = value_str_rw (v, sv->var_width);
1997 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1999 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2008 if (r->case_cnt != -1)
2009 read_error (reader, r);
2014 /* Issues an error that R ends in a partial record. */
2016 partial_record (struct sfm_reader *r)
2018 sys_error (r, r->pos, _("File ends in partial case."));
2021 /* Issues an error that an unspecified error occurred SFM, and
2024 read_error (struct casereader *r, const struct sfm_reader *sfm)
2026 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2027 casereader_force_error (r);
2030 /* Reads a number from R and stores its value in *D.
2031 If R is compressed, reads a compressed number;
2032 otherwise, reads a number in the regular way.
2033 Returns true if successful, false if end of file is
2034 reached immediately. */
2036 read_case_number (struct sfm_reader *r, double *d)
2041 if (!try_read_bytes (r, number, sizeof number))
2043 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2047 return read_compressed_number (r, d);
2050 /* Reads LENGTH string bytes from R into S.
2051 Always reads a multiple of 8 bytes; if LENGTH is not a
2052 multiple of 8, then extra bytes are read and discarded without
2054 Reads compressed strings if S is compressed.
2055 Returns true if successful, false if end of file is
2056 reached immediately. */
2058 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2060 size_t whole = ROUND_DOWN (length, 8);
2061 size_t partial = length % 8;
2065 if (!read_whole_strings (r, s, whole))
2072 if (!read_whole_strings (r, bounce, sizeof bounce))
2078 memcpy (s + whole, bounce, partial);
2084 /* Reads and returns the next compression opcode from R. */
2086 read_opcode (struct sfm_reader *r)
2088 assert (r->compressed);
2092 if (r->opcode_idx >= sizeof r->opcodes)
2094 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2098 opcode = r->opcodes[r->opcode_idx++];
2105 /* Reads a compressed number from R and stores its value in D.
2106 Returns true if successful, false if end of file is
2107 reached immediately. */
2109 read_compressed_number (struct sfm_reader *r, double *d)
2111 int opcode = read_opcode (r);
2119 *d = read_float (r);
2123 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2124 if (!r->corruption_warning)
2126 r->corruption_warning = true;
2127 sys_warn (r, r->pos,
2128 _("Possible compressed data corruption: "
2129 "compressed spaces appear in numeric field."));
2138 *d = opcode - r->bias;
2145 /* Reads a compressed 8-byte string segment from R and stores it
2147 Returns true if successful, false if end of file is
2148 reached immediately. */
2150 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2152 int opcode = read_opcode (r);
2160 read_bytes (r, dst, 8);
2164 memset (dst, ' ', 8);
2169 double value = opcode - r->bias;
2170 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2173 /* This has actually been seen "in the wild". The submitter of the
2174 file that showed that the contents decoded as spaces, but they
2175 were at the end of the field so it's possible that the null
2176 bytes just acted as null terminators. */
2178 else if (!r->corruption_warning)
2180 r->corruption_warning = true;
2181 sys_warn (r, r->pos,
2182 _("Possible compressed data corruption: "
2183 "string contains compressed integer (opcode %d)."),
2193 /* Reads LENGTH string bytes from R into S.
2194 LENGTH must be a multiple of 8.
2195 Reads compressed strings if S is compressed.
2196 Returns true if successful, false if end of file is
2197 reached immediately. */
2199 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2201 assert (length % 8 == 0);
2203 return try_read_bytes (r, s, length);
2207 for (ofs = 0; ofs < length; ofs += 8)
2208 if (!read_compressed_string (r, s + ofs))
2218 /* Skips LENGTH string bytes from R.
2219 LENGTH must be a multiple of 8.
2220 (LENGTH is also limited to 1024, but that's only because the
2221 current caller never needs more than that many bytes.)
2222 Returns true if successful, false if end of file is
2223 reached immediately. */
2225 skip_whole_strings (struct sfm_reader *r, size_t length)
2227 uint8_t buffer[1024];
2228 assert (length < sizeof buffer);
2229 return read_whole_strings (r, buffer, length);
2232 /* Helpers for reading records that contain structured text
2235 /* Maximum number of warnings to issue for a single text
2237 #define MAX_TEXT_WARNINGS 5
2242 struct substring buffer; /* Record contents, in UTF-8. */
2243 off_t start; /* Starting offset in file. */
2244 size_t pos; /* Current position in buffer. */
2245 int n_warnings; /* Number of warnings issued or suppressed. */
2248 static struct text_record *
2249 open_text_record (struct sfm_reader *r,
2250 const struct sfm_extension_record *record)
2252 struct text_record *text;
2253 struct substring raw;
2255 text = pool_alloc (r->pool, sizeof *text);
2256 raw = ss_buffer (record->data, record->size * record->count);
2257 text->start = record->pos;
2258 text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool);
2260 text->n_warnings = 0;
2265 /* Closes TEXT, frees its storage, and issues a final warning
2266 about suppressed warnings if necesary. */
2268 close_text_record (struct sfm_reader *r, struct text_record *text)
2270 if (text->n_warnings > MAX_TEXT_WARNINGS)
2271 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2272 text->n_warnings - MAX_TEXT_WARNINGS);
2273 pool_free (r->pool, ss_data (text->buffer));
2276 /* Reads a variable=value pair from TEXT.
2277 Looks up the variable in DICT and stores it into *VAR.
2278 Stores a null-terminated value into *VALUE. */
2280 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2281 struct text_record *text,
2282 struct variable **var, char **value)
2286 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2289 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2293 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2294 ss_buffer ("\t\0", 2));
2302 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2303 struct text_record *text, struct substring delimiters,
2304 struct variable **var)
2308 name = text_get_token (text, delimiters, NULL);
2312 *var = dict_lookup_var (dict, name);
2316 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2323 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2324 struct text_record *text, struct substring delimiters,
2325 struct variable **var)
2327 char *short_name = text_get_token (text, delimiters, NULL);
2328 if (short_name == NULL)
2331 *var = dict_lookup_var (dict, short_name);
2333 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2338 /* Displays a warning for the current file position, limiting the
2339 number to MAX_TEXT_WARNINGS for TEXT. */
2341 text_warn (struct sfm_reader *r, struct text_record *text,
2342 const char *format, ...)
2344 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2348 va_start (args, format);
2349 sys_msg (r, text->start + text->pos, MW, format, args);
2355 text_get_token (struct text_record *text, struct substring delimiters,
2358 struct substring token;
2361 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2364 end = &ss_data (token)[ss_length (token)];
2365 if (delimiter != NULL)
2368 return ss_data (token);
2371 /* Reads a integer value expressed in decimal, then a space, then a string that
2372 consists of exactly as many bytes as specified by the integer, then a space,
2373 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2374 buffer (so the caller should not free the string). */
2376 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2386 int c = text->buffer.string[text->pos];
2387 if (c < '0' || c > '9')
2389 n = (n * 10) + (c - '0');
2392 if (start == text->pos)
2394 sys_warn (r, text->start,
2395 _("Expecting digit at UTF-8 offset %zu in MRSETS record."),
2400 if (!text_match (text, ' '))
2402 sys_warn (r, text->start,
2403 _("Expecting space at UTF-8 offset %zu in MRSETS record."),
2408 if (text->pos + n > text->buffer.length)
2410 sys_warn (r, text->start,
2411 _("%zu-byte string starting at UTF-8 offset %zu "
2412 "exceeds record length %zu."),
2413 n, text->pos, text->buffer.length);
2417 s = &text->buffer.string[text->pos];
2420 sys_warn (r, text->start,
2421 _("Expecting space at UTF-8 offset %zu following %zu-byte "
2432 text_match (struct text_record *text, char c)
2434 if (text->buffer.string[text->pos] == c)
2443 /* Returns the current byte offset (as convertd to UTF-8) inside the TEXT's
2446 text_pos (const struct text_record *text)
2453 /* Displays a corruption message. */
2455 sys_msg (struct sfm_reader *r, off_t offset,
2456 int class, const char *format, va_list args)
2461 ds_init_empty (&text);
2463 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2464 fh_get_file_name (r->fh), (long long int) offset);
2466 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2467 ds_put_vformat (&text, format, args);
2469 m.category = msg_class_to_category (class);
2470 m.severity = msg_class_to_severity (class);
2476 m.text = ds_cstr (&text);
2481 /* Displays a warning for offset OFFSET in the file. */
2483 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2487 va_start (args, format);
2488 sys_msg (r, offset, MW, format, args);
2492 /* Displays an error for the current file position,
2493 marks it as in an error state,
2494 and aborts reading it using longjmp. */
2496 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2500 va_start (args, format);
2501 sys_msg (r, offset, ME, format, args);
2505 longjmp (r->bail_out, 1);
2508 /* Reads BYTE_CNT bytes into BUF.
2509 Returns true if exactly BYTE_CNT bytes are successfully read.
2510 Aborts if an I/O error or a partial read occurs.
2511 If EOF_IS_OK, then an immediate end-of-file causes false to be
2512 returned; otherwise, immediate end-of-file causes an abort
2515 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2516 void *buf, size_t byte_cnt)
2518 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2519 r->pos += bytes_read;
2520 if (bytes_read == byte_cnt)
2522 else if (ferror (r->file))
2523 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2524 else if (!eof_is_ok || bytes_read != 0)
2525 sys_error (r, r->pos, _("Unexpected end of file."));
2530 /* Reads BYTE_CNT into BUF.
2531 Aborts upon I/O error or if end-of-file is encountered. */
2533 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2535 read_bytes_internal (r, false, buf, byte_cnt);
2538 /* Reads BYTE_CNT bytes into BUF.
2539 Returns true if exactly BYTE_CNT bytes are successfully read.
2540 Returns false if an immediate end-of-file is encountered.
2541 Aborts if an I/O error or a partial read occurs. */
2543 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2545 return read_bytes_internal (r, true, buf, byte_cnt);
2548 /* Reads a 32-bit signed integer from R and returns its value in
2551 read_int (struct sfm_reader *r)
2554 read_bytes (r, integer, sizeof integer);
2555 return integer_get (r->integer_format, integer, sizeof integer);
2558 /* Reads a 64-bit floating-point number from R and returns its
2559 value in host format. */
2561 read_float (struct sfm_reader *r)
2564 read_bytes (r, number, sizeof number);
2565 return float_get_double (r->float_format, number);
2569 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2571 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2575 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2577 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2580 /* Reads exactly SIZE - 1 bytes into BUFFER
2581 and stores a null byte into BUFFER[SIZE - 1]. */
2583 read_string (struct sfm_reader *r, char *buffer, size_t size)
2586 read_bytes (r, buffer, size - 1);
2587 buffer[size - 1] = '\0';
2590 /* Skips BYTES bytes forward in R. */
2592 skip_bytes (struct sfm_reader *r, size_t bytes)
2597 size_t chunk = MIN (sizeof buffer, bytes);
2598 read_bytes (r, buffer, chunk);
2603 static const struct casereader_class sys_file_casereader_class =
2605 sys_file_casereader_read,
2606 sys_file_casereader_destroy,