1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
88 /* Fields from the top-level header record. */
89 struct sfm_header_record
91 int weight_idx; /* 0 if unweighted, otherwise a var index. */
92 int nominal_case_size; /* Number of var positions. */
94 /* These correspond to the members of struct sfm_file_info or a dictionary
95 but in the system file's encoding rather than ASCII. */
96 char creation_date[10]; /* "dd mmm yy". */
97 char creation_time[9]; /* "hh:mm:ss". */
98 char eye_catcher[61]; /* Eye-catcher string, then product name. */
99 char file_label[65]; /* File label. */
102 struct sfm_var_record
109 int missing_value_code;
112 struct variable *var;
115 struct sfm_value_label
121 struct sfm_value_label_record
124 struct sfm_value_label *labels;
131 struct sfm_document_record
138 struct sfm_extension_record
140 off_t pos; /* Starting offset in file. */
141 size_t size; /* Size of data elements. */
142 size_t count; /* Number of data elements. */
143 void *data; /* Contents. */
146 /* System file reader. */
149 /* Resource tracking. */
150 struct pool *pool; /* All system file state. */
151 jmp_buf bail_out; /* longjmp() target for error handling. */
154 struct file_handle *fh; /* File handle. */
155 struct fh_lock *lock; /* Mutual exclusion for file handle. */
156 FILE *file; /* File stream. */
157 off_t pos; /* Position in file. */
158 bool error; /* I/O or corruption error? */
159 struct caseproto *proto; /* Format of output cases. */
162 enum integer_format integer_format; /* On-disk integer format. */
163 enum float_format float_format; /* On-disk floating point format. */
164 struct sfm_var *sfm_vars; /* Variables. */
165 size_t sfm_var_cnt; /* Number of variables. */
166 casenumber case_cnt; /* Number of cases */
167 const char *encoding; /* String encoding. */
170 bool compressed; /* File is compressed? */
171 double bias; /* Compression bias, usually 100.0. */
172 uint8_t opcodes[8]; /* Current block of opcodes. */
173 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
174 bool corruption_warning; /* Warned about possible corruption? */
177 static const struct casereader_class sys_file_casereader_class;
179 static bool close_reader (struct sfm_reader *);
181 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
182 const struct sfm_var_record *,
185 static void sys_msg (struct sfm_reader *r, off_t, int class,
186 const char *format, va_list args)
187 PRINTF_FORMAT (4, 0);
188 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
189 PRINTF_FORMAT (3, 4);
190 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
194 static void read_bytes (struct sfm_reader *, void *, size_t);
195 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
196 static int read_int (struct sfm_reader *);
197 static double read_float (struct sfm_reader *);
198 static void read_string (struct sfm_reader *, char *, size_t);
199 static void skip_bytes (struct sfm_reader *, size_t);
201 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
202 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
204 static void read_variable_record (struct sfm_reader *,
205 struct sfm_var_record *);
206 static void read_value_label_record (struct sfm_reader *,
207 struct sfm_value_label_record *,
209 static struct sfm_document_record *read_document_record (struct sfm_reader *);
210 static struct sfm_extension_record *read_extension_record (
211 struct sfm_reader *, int subtype);
212 static void skip_extension_record (struct sfm_reader *, int subtype);
214 static const char *choose_encoding (
216 const struct sfm_extension_record *ext_integer,
217 const struct sfm_extension_record *ext_encoding);
219 static struct text_record *open_text_record (
220 struct sfm_reader *, const struct sfm_extension_record *,
221 bool recode_to_utf8);
222 static void close_text_record (struct sfm_reader *,
223 struct text_record *);
224 static bool read_variable_to_value_pair (struct sfm_reader *,
226 struct text_record *,
227 struct variable **var, char **value);
228 static void text_warn (struct sfm_reader *r, struct text_record *text,
229 const char *format, ...)
230 PRINTF_FORMAT (3, 4);
231 static char *text_get_token (struct text_record *,
232 struct substring delimiters, char *delimiter);
233 static bool text_match (struct text_record *, char c);
234 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
235 struct text_record *,
236 struct substring delimiters,
238 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
239 struct text_record *,
240 struct substring delimiters,
242 static const char *text_parse_counted_string (struct sfm_reader *,
243 struct text_record *);
244 static size_t text_pos (const struct text_record *);
246 static bool close_reader (struct sfm_reader *r);
248 /* Dictionary reader. */
256 static void read_header (struct sfm_reader *, struct sfm_read_info *,
257 struct sfm_header_record *);
258 static void parse_header (struct sfm_reader *,
259 const struct sfm_header_record *,
260 struct sfm_read_info *, struct dictionary *);
261 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
262 struct sfm_var_record *, size_t n);
263 static void parse_format_spec (struct sfm_reader *, off_t pos,
264 unsigned int format, enum which_format,
265 struct variable *, int *format_warning_cnt);
266 static void parse_document (struct dictionary *, struct sfm_document_record *);
267 static void parse_display_parameters (struct sfm_reader *,
268 const struct sfm_extension_record *,
269 struct dictionary *);
270 static void parse_machine_integer_info (struct sfm_reader *,
271 const struct sfm_extension_record *,
272 struct sfm_read_info *);
273 static void parse_machine_float_info (struct sfm_reader *,
274 const struct sfm_extension_record *);
275 static void parse_mrsets (struct sfm_reader *,
276 const struct sfm_extension_record *,
277 struct dictionary *);
278 static void parse_long_var_name_map (struct sfm_reader *,
279 const struct sfm_extension_record *,
280 struct dictionary *);
281 static void parse_long_string_map (struct sfm_reader *,
282 const struct sfm_extension_record *,
283 struct dictionary *);
284 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
285 const struct sfm_var_record *,
287 const struct sfm_value_label_record *);
288 static void parse_data_file_attributes (struct sfm_reader *,
289 const struct sfm_extension_record *,
290 struct dictionary *);
291 static void parse_variable_attributes (struct sfm_reader *,
292 const struct sfm_extension_record *,
293 struct dictionary *);
294 static void parse_long_string_value_labels (struct sfm_reader *,
295 const struct sfm_extension_record *,
296 struct dictionary *);
298 /* Frees the strings inside INFO. */
300 sfm_read_info_destroy (struct sfm_read_info *info)
304 free (info->creation_date);
305 free (info->creation_time);
306 free (info->product);
310 /* Opens the system file designated by file handle FH for reading. Reads the
311 system file's dictionary into *DICT.
313 If INFO is non-null, then it receives additional info about the system file,
314 which the caller must eventually free with sfm_read_info_destroy() when it
315 is no longer needed. */
317 sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
318 struct sfm_read_info *infop)
320 struct sfm_reader *volatile r = NULL;
321 struct sfm_read_info info;
323 struct sfm_header_record header;
325 struct sfm_var_record *vars;
326 size_t n_vars, allocated_vars;
328 struct sfm_value_label_record *labels;
329 size_t n_labels, allocated_labels;
331 struct sfm_document_record *document;
333 struct sfm_extension_record *extensions[32];
335 struct dictionary *dict = NULL;
338 /* Create and initialize reader. */
339 r = pool_create_container (struct sfm_reader, pool);
345 r->opcode_idx = sizeof r->opcodes;
346 r->corruption_warning = false;
348 memset (&info, 0, sizeof info);
350 /* TRANSLATORS: this fragment will be interpolated into
351 messages in fh_lock() that identify types of files. */
352 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
356 r->file = fn_open (fh_get_file_name (fh), "rb");
359 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
360 fh_get_file_name (r->fh), strerror (errno));
364 if (setjmp (r->bail_out))
368 read_header (r, &info, &header);
371 n_vars = allocated_vars = 0;
374 n_labels = allocated_labels = 0;
378 memset (extensions, 0, sizeof extensions);
388 read_int (r); /* Skip filler. */
395 if (n_vars >= allocated_vars)
396 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
398 read_variable_record (r, &vars[n_vars++]);
402 if (n_labels >= allocated_labels)
403 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
405 read_value_label_record (r, &labels[n_labels++], n_vars);
409 /* A Type 4 record is always immediately after a type 3 record,
410 so the code for type 3 records reads the type 4 record too. */
411 sys_error (r, r->pos, _("Misplaced type 4 record."));
414 if (document != NULL)
415 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
416 document = read_document_record (r);
420 subtype = read_int (r);
421 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
424 _("Unrecognized record type 7, subtype %d. Please "
425 "send a copy of this file, and the syntax which "
426 "created it to %s."),
427 subtype, PACKAGE_BUGREPORT);
428 skip_extension_record (r, subtype);
430 else if (extensions[subtype] != NULL)
433 _("Record type 7, subtype %d found here has the same "
434 "type as the record found near offset 0x%llx. "
435 "Please send a copy of this file, and the syntax "
436 "which created it to %s."),
437 subtype, (long long int) extensions[subtype]->pos,
439 skip_extension_record (r, subtype);
442 extensions[subtype] = read_extension_record (r, subtype);
446 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
451 /* Now actually parse what we read.
453 First, figure out the correct character encoding, because this determines
454 how the rest of the header data is to be interpreted. */
455 dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
456 extensions[EXT_ENCODING]));
457 r->encoding = dict_get_encoding (dict);
459 /* These records don't use variables at all. */
460 if (document != NULL)
461 parse_document (dict, document);
463 if (extensions[EXT_INTEGER] != NULL)
464 parse_machine_integer_info (r, extensions[EXT_INTEGER], &info);
466 if (extensions[EXT_FLOAT] != NULL)
467 parse_machine_float_info (r, extensions[EXT_FLOAT]);
469 if (extensions[EXT_FILE_ATTRS] != NULL)
470 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
472 parse_header (r, &header, &info, dict);
474 /* Parse the variable records, the basis of almost everything else. */
475 parse_variable_records (r, dict, vars, n_vars);
477 /* Parse value labels and the weight variable immediately after the variable
478 records. These records use indexes into var_recs[], so we must parse them
479 before those indexes become invalidated by very long string variables. */
480 for (i = 0; i < n_labels; i++)
481 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
482 if (header.weight_idx != 0)
484 struct variable *weight_var;
486 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
488 if (var_is_numeric (weight_var))
489 dict_set_weight (dict, weight_var);
491 sys_error (r, -1, _("Weighting variable must be numeric "
492 "(not string variable `%s')."),
493 var_get_name (weight_var));
496 if (extensions[EXT_DISPLAY] != NULL)
497 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
499 /* The following records use short names, so they need to be parsed before
500 parse_long_var_name_map() changes short names to long names. */
501 if (extensions[EXT_MRSETS] != NULL)
502 parse_mrsets (r, extensions[EXT_MRSETS], dict);
504 if (extensions[EXT_MRSETS2] != NULL)
505 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
507 if (extensions[EXT_LONG_STRINGS] != NULL)
508 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
510 /* Now rename variables to their long names. */
511 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
513 /* The following records use long names, so they need to follow renaming. */
514 if (extensions[EXT_VAR_ATTRS] != NULL)
515 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
517 if (extensions[EXT_LONG_LABELS] != NULL)
518 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
520 /* Warn if the actual amount of data per case differs from the
521 amount that the header claims. SPSS version 13 gets this
522 wrong when very long strings are involved, so don't warn in
524 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
525 && info.version_major != 13)
526 sys_warn (r, -1, _("File header claims %d variable positions but "
527 "%zu were read from file."),
528 header.nominal_case_size, n_vars);
530 /* Create an index of dictionary variable widths for
531 sfm_read_case to use. We cannot use the `struct variable's
532 from the dictionary we created, because the caller owns the
533 dictionary and may destroy or modify its variables. */
534 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
535 pool_register (r->pool, free, r->sfm_vars);
536 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
542 sfm_read_info_destroy (&info);
544 return casereader_create_sequential
546 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
547 &sys_file_casereader_class, r);
550 sfm_read_info_destroy (&info);
557 /* Closes a system file after we're done with it.
558 Returns true if an I/O error has occurred on READER, false
561 close_reader (struct sfm_reader *r)
570 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
572 msg (ME, _("Error closing system file `%s': %s."),
573 fh_get_file_name (r->fh), strerror (errno));
583 pool_destroy (r->pool);
588 /* Destroys READER. */
590 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
592 struct sfm_reader *r = r_;
596 /* Returns true if FILE is an SPSS system file,
599 sfm_detect (FILE *file)
603 if (fread (rec_type, 4, 1, file) != 1)
607 return !strcmp ("$FL2", rec_type);
610 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
611 except for the string fields in *INFO, which parse_header() will initialize
612 later once the file's encoding is known. */
614 read_header (struct sfm_reader *r, struct sfm_read_info *info,
615 struct sfm_header_record *header)
618 uint8_t raw_layout_code[4];
621 read_string (r, rec_type, sizeof rec_type);
622 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
624 if (strcmp ("$FL2", rec_type) != 0)
625 sys_error (r, 0, _("This is not an SPSS system file."));
627 /* Identify integer format. */
628 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
629 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
631 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
633 || (r->integer_format != INTEGER_MSB_FIRST
634 && r->integer_format != INTEGER_LSB_FIRST))
635 sys_error (r, 64, _("This is not an SPSS system file."));
637 header->nominal_case_size = read_int (r);
638 if (header->nominal_case_size < 0
639 || header->nominal_case_size > INT_MAX / 16)
640 header->nominal_case_size = -1;
642 r->compressed = read_int (r) != 0;
644 header->weight_idx = read_int (r);
646 r->case_cnt = read_int (r);
647 if ( r->case_cnt > INT_MAX / 2)
650 /* Identify floating-point format and obtain compression bias. */
651 read_bytes (r, raw_bias, sizeof raw_bias);
652 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
654 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
656 if (memcmp (raw_bias, zero_bias, 8))
657 sys_warn (r, r->pos - 8,
658 _("Compression bias is not the usual "
659 "value of 100, or system file uses unrecognized "
660 "floating-point format."));
663 /* Some software is known to write all-zeros to this
664 field. Such software also writes floating-point
665 numbers in the format that we expect by default
666 (it seems that all software most likely does, in
667 reality), so don't warn in this case. */
670 if (r->integer_format == INTEGER_MSB_FIRST)
671 r->float_format = FLOAT_IEEE_DOUBLE_BE;
673 r->float_format = FLOAT_IEEE_DOUBLE_LE;
675 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
677 read_string (r, header->creation_date, sizeof header->creation_date);
678 read_string (r, header->creation_time, sizeof header->creation_time);
679 read_string (r, header->file_label, sizeof header->file_label);
682 info->integer_format = r->integer_format;
683 info->float_format = r->float_format;
684 info->compressed = r->compressed;
685 info->case_cnt = r->case_cnt;
688 /* Reads a variable (type 2) record from R into RECORD. */
690 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
692 int has_variable_label;
694 memset (record, 0, sizeof *record);
696 record->pos = r->pos;
697 record->width = read_int (r);
698 has_variable_label = read_int (r);
699 record->missing_value_code = read_int (r);
700 record->print_format = read_int (r);
701 record->write_format = read_int (r);
702 read_bytes (r, record->name, sizeof record->name);
704 if (has_variable_label == 1)
706 enum { MAX_LABEL_LEN = 255 };
707 size_t len, read_len;
711 /* Read up to MAX_LABEL_LEN bytes of label. */
712 read_len = MIN (MAX_LABEL_LEN, len);
713 record->label = pool_malloc (r->pool, read_len + 1);
714 read_string (r, record->label, read_len + 1);
716 /* Skip unread label bytes. */
717 skip_bytes (r, len - read_len);
719 /* Skip label padding up to multiple of 4 bytes. */
720 skip_bytes (r, ROUND_UP (len, 4) - len);
722 else if (has_variable_label != 0)
723 sys_error (r, record->pos,
724 _("Variable label indicator field is not 0 or 1."));
726 /* Set missing values. */
727 if (record->missing_value_code != 0)
729 int code = record->missing_value_code;
730 if (record->width == 0)
732 if (code < -3 || code > 3 || code == -1)
733 sys_error (r, record->pos,
734 _("Numeric missing value indicator field is not "
735 "-3, -2, 0, 1, 2, or 3."));
739 if (code < 1 || code > 3)
740 sys_error (r, record->pos,
741 _("String missing value indicator field is not "
745 read_bytes (r, record->missing, 8 * abs (code));
749 /* Reads value labels from R into RECORD. */
751 read_value_label_record (struct sfm_reader *r,
752 struct sfm_value_label_record *record,
757 /* Read type 3 record. */
758 record->pos = r->pos;
759 record->n_labels = read_int (r);
760 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
761 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
763 record->labels = pool_nmalloc (r->pool, record->n_labels,
764 sizeof *record->labels);
765 for (i = 0; i < record->n_labels; i++)
767 struct sfm_value_label *label = &record->labels[i];
768 unsigned char label_len;
771 read_bytes (r, label->value, sizeof label->value);
773 /* Read label length. */
774 read_bytes (r, &label_len, sizeof label_len);
775 padded_len = ROUND_UP (label_len + 1, 8);
777 /* Read label, padding. */
778 label->label = pool_malloc (r->pool, padded_len + 1);
779 read_bytes (r, label->label, padded_len - 1);
780 label->label[label_len] = '\0';
783 /* Read record type of type 4 record. */
784 if (read_int (r) != 4)
785 sys_error (r, r->pos - 4,
786 _("Variable index record (type 4) does not immediately "
787 "follow value label record (type 3) as it should."));
789 /* Read number of variables associated with value label from type 4
791 record->n_vars = read_int (r);
792 if (record->n_vars < 1 || record->n_vars > n_vars)
793 sys_error (r, r->pos - 4,
794 _("Number of variables associated with a value label (%zu) "
795 "is not between 1 and the number of variables (%zu)."),
796 record->n_vars, n_vars);
797 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
798 for (i = 0; i < record->n_vars; i++)
799 record->vars[i] = read_int (r);
802 /* Reads a document record from R and returns it. */
803 static struct sfm_document_record *
804 read_document_record (struct sfm_reader *r)
806 struct sfm_document_record *record;
809 record = pool_malloc (r->pool, sizeof *record);
810 record->pos = r->pos;
812 n_lines = read_int (r);
813 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
814 sys_error (r, record->pos,
815 _("Number of document lines (%d) "
816 "must be greater than 0 and less than %d."),
817 n_lines, INT_MAX / DOC_LINE_LENGTH);
819 record->n_lines = n_lines;
820 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
821 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
827 read_extension_record_header (struct sfm_reader *r, int subtype,
828 struct sfm_extension_record *record)
830 record->pos = r->pos;
831 record->size = read_int (r);
832 record->count = read_int (r);
834 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
835 allows an extra byte for a null terminator, used by some
836 extension processing routines. */
837 if (record->size != 0
838 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
839 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
842 /* Reads an extension record from R into RECORD. */
843 static struct sfm_extension_record *
844 read_extension_record (struct sfm_reader *r, int subtype)
846 struct extension_record_type
853 static const struct extension_record_type types[] =
855 /* Implemented record types. */
856 { EXT_INTEGER, 4, 8 },
858 { EXT_MRSETS, 1, 0 },
859 { EXT_DISPLAY, 4, 0 },
860 { EXT_LONG_NAMES, 1, 0 },
861 { EXT_LONG_STRINGS, 1, 0 },
862 { EXT_NCASES, 8, 2 },
863 { EXT_FILE_ATTRS, 1, 0 },
864 { EXT_VAR_ATTRS, 1, 0 },
865 { EXT_MRSETS2, 1, 0 },
866 { EXT_ENCODING, 1, 0 },
867 { EXT_LONG_LABELS, 1, 0 },
869 /* Ignored record types. */
870 { EXT_VAR_SETS, 0, 0 },
872 { EXT_DATA_ENTRY, 0, 0 },
875 const struct extension_record_type *type;
876 struct sfm_extension_record *record;
879 record = pool_malloc (r->pool, sizeof *record);
880 read_extension_record_header (r, subtype, record);
881 n_bytes = record->count * record->size;
883 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
884 if (subtype == type->subtype)
886 if (type->size > 0 && record->size != type->size)
887 sys_warn (r, record->pos,
888 _("Record type 7, subtype %d has bad size %zu "
889 "(expected %d)."), subtype, record->size, type->size);
890 else if (type->count > 0 && record->count != type->count)
891 sys_warn (r, record->pos,
892 _("Record type 7, subtype %d has bad count %zu "
893 "(expected %d)."), subtype, record->count, type->count);
894 else if (type->count == 0 && type->size == 0)
896 /* Ignore this record. */
900 char *data = pool_malloc (r->pool, n_bytes + 1);
901 data[n_bytes] = '\0';
904 read_bytes (r, record->data, n_bytes);
911 sys_warn (r, record->pos,
912 _("Unrecognized record type 7, subtype %d. Please send a "
913 "copy of this file, and the syntax which created it to %s."),
914 subtype, PACKAGE_BUGREPORT);
917 skip_bytes (r, n_bytes);
922 skip_extension_record (struct sfm_reader *r, int subtype)
924 struct sfm_extension_record record;
926 read_extension_record_header (r, subtype, &record);
927 skip_bytes (r, record.count * record.size);
931 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
932 struct sfm_read_info *info, struct dictionary *dict)
934 const char *dict_encoding = dict_get_encoding (dict);
935 struct substring product;
936 struct substring label;
938 /* Convert file label to UTF-8 and put it into DICT. */
939 label = recode_substring_pool ("UTF-8", dict_encoding,
940 ss_cstr (header->file_label), r->pool);
941 ss_trim (&label, ss_cstr (" "));
942 label.string[label.length] = '\0';
943 dict_set_label (dict, label.string);
945 /* Put creation date and time in UTF-8 into INFO. */
946 info->creation_date = recode_string ("UTF-8", dict_encoding,
947 header->creation_date, -1);
948 info->creation_time = recode_string ("UTF-8", dict_encoding,
949 header->creation_time, -1);
951 /* Put product name into INFO, dropping eye-catcher string if present. */
952 product = recode_substring_pool ("UTF-8", dict_encoding,
953 ss_cstr (header->eye_catcher), r->pool);
954 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
955 ss_trim (&product, ss_cstr (" "));
956 info->product = ss_xstrdup (product);
959 /* Reads a variable (type 2) record from R and adds the
960 corresponding variable to DICT.
961 Also skips past additional variable records for long string
964 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
965 struct sfm_var_record *var_recs, size_t n_var_recs)
967 const char *dict_encoding = dict_get_encoding (dict);
968 struct sfm_var_record *rec;
971 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
973 struct variable *var;
978 name = recode_string_pool ("UTF-8", dict_encoding,
979 rec->name, 8, r->pool);
980 name[strcspn (name, " ")] = '\0';
982 if (!dict_id_is_valid (dict, name, false)
983 || name[0] == '$' || name[0] == '#')
984 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
986 if (rec->width < 0 || rec->width > 255)
987 sys_error (r, rec->pos,
988 _("Bad width %d for variable %s."), rec->width, name);
990 var = rec->var = dict_create_var (dict, name, rec->width);
992 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
994 /* Set the short name the same as the long name. */
995 var_set_short_name (var, 0, name);
997 /* Get variable label, if any. */
1002 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1003 rec->label, -1, r->pool);
1004 var_set_label (var, utf8_label, false);
1007 /* Set missing values. */
1008 if (rec->missing_value_code != 0)
1010 int width = var_get_width (var);
1011 struct missing_values mv;
1013 mv_init_pool (r->pool, &mv, width);
1014 if (var_is_numeric (var))
1016 bool has_range = rec->missing_value_code < 0;
1017 int n_discrete = (has_range
1018 ? rec->missing_value_code == -3
1019 : rec->missing_value_code);
1024 double low = parse_float (r, rec->missing, 0);
1025 double high = parse_float (r, rec->missing, 8);
1026 mv_add_range (&mv, low, high);
1030 for (i = 0; i < n_discrete; i++)
1032 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1040 value_init_pool (r->pool, &value, width);
1041 value_set_missing (&value, width);
1042 for (i = 0; i < rec->missing_value_code; i++)
1044 uint8_t *s = value_str_rw (&value, width);
1045 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1046 mv_add_str (&mv, s);
1049 var_set_missing_values (var, &mv);
1053 parse_format_spec (r, rec->pos + 12, rec->print_format,
1054 PRINT_FORMAT, var, &n_warnings);
1055 parse_format_spec (r, rec->pos + 16, rec->write_format,
1056 WRITE_FORMAT, var, &n_warnings);
1058 /* Account for values.
1059 Skip long string continuation records, if any. */
1060 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1061 for (i = 1; i < n_values; i++)
1062 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1063 sys_error (r, rec->pos, _("Missing string continuation record."));
1068 /* Translates the format spec from sysfile format to internal
1071 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1072 enum which_format which, struct variable *v,
1075 const int max_warnings = 8;
1076 uint8_t raw_type = format >> 16;
1077 uint8_t w = format >> 8;
1086 ok = (fmt_from_io (raw_type, &f.type)
1087 && fmt_check_output (&f)
1088 && fmt_check_width_compat (&f, var_get_width (v)));
1093 if (which == PRINT_FORMAT)
1094 var_set_print_format (v, &f);
1096 var_set_write_format (v, &f);
1098 else if (format == 0)
1100 /* Actually observed in the wild. No point in warning about it. */
1102 else if (++*n_warnings <= max_warnings)
1104 if (which == PRINT_FORMAT)
1105 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1107 var_get_name (v), var_get_width (v), format);
1109 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1111 var_get_name (v), var_get_width (v), format);
1113 if (*n_warnings == max_warnings)
1114 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1119 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1123 for (p = record->documents;
1124 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1125 p += DOC_LINE_LENGTH)
1127 struct substring line;
1129 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1130 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1131 ss_rtrim (&line, ss_cstr (" "));
1132 line.string[line.length] = '\0';
1134 dict_add_document_line (dict, line.string, false);
1140 /* Parses record type 7, subtype 3. */
1142 parse_machine_integer_info (struct sfm_reader *r,
1143 const struct sfm_extension_record *record,
1144 struct sfm_read_info *info)
1146 int float_representation, expected_float_format;
1147 int integer_representation, expected_integer_format;
1149 /* Save version info. */
1150 info->version_major = parse_int (r, record->data, 0);
1151 info->version_minor = parse_int (r, record->data, 4);
1152 info->version_revision = parse_int (r, record->data, 8);
1154 /* Check floating point format. */
1155 float_representation = parse_int (r, record->data, 16);
1156 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1157 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1158 expected_float_format = 1;
1159 else if (r->float_format == FLOAT_Z_LONG)
1160 expected_float_format = 2;
1161 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1162 expected_float_format = 3;
1165 if (float_representation != expected_float_format)
1166 sys_error (r, record->pos, _("Floating-point representation indicated by "
1167 "system file (%d) differs from expected (%d)."),
1168 float_representation, expected_float_format);
1170 /* Check integer format. */
1171 integer_representation = parse_int (r, record->data, 24);
1172 if (r->integer_format == INTEGER_MSB_FIRST)
1173 expected_integer_format = 1;
1174 else if (r->integer_format == INTEGER_LSB_FIRST)
1175 expected_integer_format = 2;
1178 if (integer_representation != expected_integer_format)
1179 sys_warn (r, record->pos,
1180 _("Integer format indicated by system file (%d) "
1181 "differs from expected (%d)."),
1182 integer_representation, expected_integer_format);
1187 choose_encoding (struct sfm_reader *r,
1188 const struct sfm_extension_record *ext_integer,
1189 const struct sfm_extension_record *ext_encoding)
1191 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1194 return ext_encoding->data;
1196 /* But EXT_INTEGER is better than nothing as a fallback. */
1199 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1200 const char *encoding;
1209 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1210 respectively. However, there are known to be many files in the wild
1211 with character code 2, yet have data which are clearly not ASCII.
1212 Therefore we ignore these values. */
1219 encoding = sys_get_encoding_from_codepage (codepage);
1220 if (encoding != NULL)
1226 return locale_charset ();
1229 /* Parses record type 7, subtype 4. */
1231 parse_machine_float_info (struct sfm_reader *r,
1232 const struct sfm_extension_record *record)
1234 double sysmis = parse_float (r, record->data, 0);
1235 double highest = parse_float (r, record->data, 8);
1236 double lowest = parse_float (r, record->data, 16);
1238 if (sysmis != SYSMIS)
1239 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1242 if (highest != HIGHEST)
1243 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1244 highest, "HIGHEST");
1246 if (lowest != LOWEST)
1247 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1251 /* Parses record type 7, subtype 7 or 19. */
1253 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1254 struct dictionary *dict)
1256 struct text_record *text;
1257 struct mrset *mrset;
1259 text = open_text_record (r, record, false);
1262 const char *counted = NULL;
1265 struct stringi_set var_names;
1266 size_t allocated_vars;
1270 mrset = xzalloc (sizeof *mrset);
1272 name = text_get_token (text, ss_cstr ("="), NULL);
1275 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1277 if (mrset->name[0] != '$')
1279 sys_warn (r, record->pos,
1280 _("`%s' does not begin with `$' at offset %zu "
1281 "in MRSETS record."), mrset->name, text_pos (text));
1285 if (text_match (text, 'C'))
1287 mrset->type = MRSET_MC;
1288 if (!text_match (text, ' '))
1290 sys_warn (r, record->pos,
1291 _("Missing space following `%c' at offset %zu "
1292 "in MRSETS record."), 'C', text_pos (text));
1296 else if (text_match (text, 'D'))
1298 mrset->type = MRSET_MD;
1299 mrset->cat_source = MRSET_VARLABELS;
1301 else if (text_match (text, 'E'))
1305 mrset->type = MRSET_MD;
1306 mrset->cat_source = MRSET_COUNTEDVALUES;
1307 if (!text_match (text, ' '))
1309 sys_warn (r, record->pos,
1310 _("Missing space following `%c' at offset %zu "
1311 "in MRSETS record."), 'E', text_pos (text));
1315 number = text_get_token (text, ss_cstr (" "), NULL);
1316 if (!strcmp (number, "11"))
1317 mrset->label_from_var_label = true;
1318 else if (strcmp (number, "1"))
1319 sys_warn (r, record->pos,
1320 _("Unexpected label source value `%s' following `E' "
1321 "at offset %zu in MRSETS record."),
1322 number, text_pos (text));
1326 sys_warn (r, record->pos,
1327 _("Missing `C', `D', or `E' at offset %zu "
1328 "in MRSETS record."),
1333 if (mrset->type == MRSET_MD)
1335 counted = text_parse_counted_string (r, text);
1336 if (counted == NULL)
1340 label = text_parse_counted_string (r, text);
1343 if (label[0] != '\0')
1344 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1346 stringi_set_init (&var_names);
1351 const char *raw_var_name;
1352 struct variable *var;
1355 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1356 if (raw_var_name == NULL)
1358 sys_warn (r, record->pos,
1359 _("Missing new-line parsing variable names "
1360 "at offset %zu in MRSETS record."),
1364 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1366 var = dict_lookup_var (dict, var_name);
1372 if (!stringi_set_insert (&var_names, var_name))
1374 sys_warn (r, record->pos,
1375 _("Duplicate variable name %s "
1376 "at offset %zu in MRSETS record."),
1377 var_name, text_pos (text));
1383 if (mrset->label == NULL && mrset->label_from_var_label
1384 && var_has_label (var))
1385 mrset->label = xstrdup (var_get_label (var));
1388 && var_get_type (var) != var_get_type (mrset->vars[0]))
1390 sys_warn (r, record->pos,
1391 _("MRSET %s contains both string and "
1392 "numeric variables."), name);
1395 width = MIN (width, var_get_width (var));
1397 if (mrset->n_vars >= allocated_vars)
1398 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1399 sizeof *mrset->vars);
1400 mrset->vars[mrset->n_vars++] = var;
1402 while (delimiter != '\n');
1404 if (mrset->n_vars < 2)
1406 sys_warn (r, record->pos,
1407 _("MRSET %s has only %zu variables."), mrset->name,
1409 mrset_destroy (mrset);
1413 if (mrset->type == MRSET_MD)
1415 mrset->width = width;
1416 value_init (&mrset->counted, width);
1418 mrset->counted.f = strtod (counted, NULL);
1420 value_copy_str_rpad (&mrset->counted, width,
1421 (const uint8_t *) counted, ' ');
1424 dict_add_mrset (dict, mrset);
1426 stringi_set_destroy (&var_names);
1428 mrset_destroy (mrset);
1429 close_text_record (r, text);
1432 /* Read record type 7, subtype 11, which specifies how variables
1433 should be displayed in GUI environments. */
1435 parse_display_parameters (struct sfm_reader *r,
1436 const struct sfm_extension_record *record,
1437 struct dictionary *dict)
1439 bool includes_width;
1440 bool warned = false;
1445 n_vars = dict_get_var_cnt (dict);
1446 if (record->count == 3 * n_vars)
1447 includes_width = true;
1448 else if (record->count == 2 * n_vars)
1449 includes_width = false;
1452 sys_warn (r, record->pos,
1453 _("Extension 11 has bad count %zu (for %zu variables)."),
1454 record->count, n_vars);
1459 for (i = 0; i < n_vars; ++i)
1461 struct variable *v = dict_get_var (dict, i);
1462 int measure, width, align;
1464 measure = parse_int (r, record->data, ofs);
1469 width = parse_int (r, record->data, ofs);
1475 align = parse_int (r, record->data, ofs);
1478 /* SPSS 14 sometimes seems to set string variables' measure
1480 if (0 == measure && var_is_alpha (v))
1483 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1486 sys_warn (r, record->pos,
1487 _("Invalid variable display parameters for variable "
1488 "%zu (%s). Default parameters substituted."),
1489 i, var_get_name (v));
1494 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1495 : measure == 2 ? MEASURE_ORDINAL
1497 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1498 : align == 1 ? ALIGN_RIGHT
1501 /* Older versions (SPSS 9.0) sometimes set the display
1502 width to zero. This causes confusion in the GUI, so
1503 only set the width if it is nonzero. */
1505 var_set_display_width (v, width);
1510 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1511 const char *new_name)
1513 size_t n_short_names;
1517 /* Renaming a variable may clear its short names, but we
1518 want to retain them, so we save them and re-set them
1520 n_short_names = var_get_short_name_cnt (var);
1521 short_names = xnmalloc (n_short_names, sizeof *short_names);
1522 for (i = 0; i < n_short_names; i++)
1524 const char *s = var_get_short_name (var, i);
1525 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1528 /* Set long name. */
1529 dict_rename_var (dict, var, new_name);
1531 /* Restore short names. */
1532 for (i = 0; i < n_short_names; i++)
1534 var_set_short_name (var, i, short_names[i]);
1535 free (short_names[i]);
1540 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1541 to each short name. Modifies variable names in DICT accordingly. */
1543 parse_long_var_name_map (struct sfm_reader *r,
1544 const struct sfm_extension_record *record,
1545 struct dictionary *dict)
1547 struct text_record *text;
1548 struct variable *var;
1553 /* Convert variable names to lowercase. */
1556 for (i = 0; i < dict_get_var_cnt (dict); i++)
1558 struct variable *var = dict_get_var (dict, i);
1561 new_name = xstrdup (var_get_name (var));
1562 str_lowercase (new_name);
1564 rename_var_and_save_short_names (dict, var, new_name);
1572 /* Rename each of the variables, one by one. (In a correctly constructed
1573 system file, this cannot create any intermediate duplicate variable names,
1574 because all of the new variable names are longer than any of the old
1575 variable names and thus there cannot be any overlaps.) */
1576 text = open_text_record (r, record, true);
1577 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1579 /* Validate long name. */
1580 /* XXX need to reencode name to UTF-8 */
1581 if (!dict_id_is_valid (dict, long_name, false))
1583 sys_warn (r, record->pos,
1584 _("Long variable mapping from %s to invalid "
1585 "variable name `%s'."),
1586 var_get_name (var), long_name);
1590 /* Identify any duplicates. */
1591 if (strcasecmp (var_get_short_name (var, 0), long_name)
1592 && dict_lookup_var (dict, long_name) != NULL)
1594 sys_warn (r, record->pos,
1595 _("Duplicate long variable name `%s'."), long_name);
1599 rename_var_and_save_short_names (dict, var, long_name);
1601 close_text_record (r, text);
1604 /* Reads record type 7, subtype 14, which gives the real length
1605 of each very long string. Rearranges DICT accordingly. */
1607 parse_long_string_map (struct sfm_reader *r,
1608 const struct sfm_extension_record *record,
1609 struct dictionary *dict)
1611 struct text_record *text;
1612 struct variable *var;
1615 text = open_text_record (r, record, true);
1616 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1618 size_t idx = var_get_dict_index (var);
1624 length = strtol (length_s, NULL, 10);
1625 if (length < 1 || length > MAX_STRING)
1627 sys_warn (r, record->pos,
1628 _("%s listed as string of invalid length %s "
1629 "in very long string record."),
1630 var_get_name (var), length_s);
1634 /* Check segments. */
1635 segment_cnt = sfm_width_to_segments (length);
1636 if (segment_cnt == 1)
1638 sys_warn (r, record->pos,
1639 _("%s listed in very long string record with width %s, "
1640 "which requires only one segment."),
1641 var_get_name (var), length_s);
1644 if (idx + segment_cnt > dict_get_var_cnt (dict))
1645 sys_error (r, record->pos,
1646 _("Very long string %s overflows dictionary."),
1647 var_get_name (var));
1649 /* Get the short names from the segments and check their
1651 for (i = 0; i < segment_cnt; i++)
1653 struct variable *seg = dict_get_var (dict, idx + i);
1654 int alloc_width = sfm_segment_alloc_width (length, i);
1655 int width = var_get_width (seg);
1658 var_set_short_name (var, i, var_get_short_name (seg, 0));
1659 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1660 sys_error (r, record->pos,
1661 _("Very long string with width %ld has segment %d "
1662 "of width %d (expected %d)."),
1663 length, i, width, alloc_width);
1665 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1666 var_set_width (var, length);
1668 close_text_record (r, text);
1669 dict_compact_values (dict);
1673 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1674 const struct sfm_var_record *var_recs, size_t n_var_recs,
1675 const struct sfm_value_label_record *record)
1677 struct variable **vars;
1681 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1682 for (i = 0; i < record->n_labels; i++)
1683 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1684 record->labels[i].label, -1,
1687 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1688 for (i = 0; i < record->n_vars; i++)
1689 vars[i] = lookup_var_by_index (r, record->pos,
1690 var_recs, n_var_recs, record->vars[i]);
1692 for (i = 1; i < record->n_vars; i++)
1693 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1694 sys_error (r, record->pos,
1695 _("Variables associated with value label are not all of "
1696 "identical type. Variable %s is %s, but variable "
1698 var_get_name (vars[0]),
1699 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1700 var_get_name (vars[i]),
1701 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1703 for (i = 0; i < record->n_vars; i++)
1705 struct variable *var = vars[i];
1709 width = var_get_width (var);
1711 sys_error (r, record->pos,
1712 _("Value labels may not be added to long string "
1713 "variables (e.g. %s) using records types 3 and 4."),
1714 var_get_name (var));
1716 for (j = 0; j < record->n_labels; j++)
1718 struct sfm_value_label *label = &record->labels[j];
1721 value_init (&value, width);
1723 value.f = parse_float (r, label->value, 0);
1725 memcpy (value_str_rw (&value, width), label->value, width);
1727 if (!var_add_value_label (var, &value, utf8_labels[j]))
1729 if (var_is_numeric (var))
1730 sys_warn (r, record->pos,
1731 _("Duplicate value label for %g on %s."),
1732 value.f, var_get_name (var));
1734 sys_warn (r, record->pos,
1735 _("Duplicate value label for `%.*s' on %s."),
1736 width, value_str (&value, width),
1737 var_get_name (var));
1740 value_destroy (&value, width);
1744 pool_free (r->pool, vars);
1745 for (i = 0; i < record->n_labels; i++)
1746 pool_free (r->pool, utf8_labels[i]);
1747 pool_free (r->pool, utf8_labels);
1750 static struct variable *
1751 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1752 const struct sfm_var_record *var_recs, size_t n_var_recs,
1755 const struct sfm_var_record *rec;
1757 if (idx < 1 || idx > n_var_recs)
1759 sys_error (r, offset,
1760 _("Variable index %d not in valid range 1...%zu."),
1765 rec = &var_recs[idx - 1];
1766 if (rec->var == NULL)
1768 sys_error (r, offset,
1769 _("Variable index %d refers to long string continuation."),
1777 /* Parses a set of custom attributes from TEXT into ATTRS.
1778 ATTRS may be a null pointer, in which case the attributes are
1779 read but discarded. */
1781 parse_attributes (struct sfm_reader *r, struct text_record *text,
1782 struct attrset *attrs)
1786 struct attribute *attr;
1790 /* Parse the key. */
1791 key = text_get_token (text, ss_cstr ("("), NULL);
1795 attr = attribute_create (key);
1796 for (index = 1; ; index++)
1798 /* Parse the value. */
1802 value = text_get_token (text, ss_cstr ("\n"), NULL);
1805 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1810 length = strlen (value);
1811 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1813 value[length - 1] = '\0';
1814 attribute_add_value (attr, value + 1);
1819 _("Attribute value %s[%d] is not quoted: %s."),
1821 attribute_add_value (attr, value);
1824 /* Was this the last value for this attribute? */
1825 if (text_match (text, ')'))
1829 attrset_add (attrs, attr);
1831 attribute_destroy (attr);
1833 while (!text_match (text, '/'));
1836 /* Reads record type 7, subtype 17, which lists custom
1837 attributes on the data file. */
1839 parse_data_file_attributes (struct sfm_reader *r,
1840 const struct sfm_extension_record *record,
1841 struct dictionary *dict)
1843 struct text_record *text = open_text_record (r, record, true);
1844 parse_attributes (r, text, dict_get_attributes (dict));
1845 close_text_record (r, text);
1848 /* Parses record type 7, subtype 18, which lists custom
1849 attributes on individual variables. */
1851 parse_variable_attributes (struct sfm_reader *r,
1852 const struct sfm_extension_record *record,
1853 struct dictionary *dict)
1855 struct text_record *text;
1856 struct variable *var;
1858 text = open_text_record (r, record, true);
1859 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1860 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1861 close_text_record (r, text);
1865 check_overflow (struct sfm_reader *r,
1866 const struct sfm_extension_record *record,
1867 size_t ofs, size_t length)
1869 size_t end = record->size * record->count;
1870 if (length >= end || ofs + length > end)
1871 sys_error (r, record->pos + end,
1872 _("Long string value label record ends unexpectedly."));
1876 parse_long_string_value_labels (struct sfm_reader *r,
1877 const struct sfm_extension_record *record,
1878 struct dictionary *dict)
1880 const char *dict_encoding = dict_get_encoding (dict);
1881 size_t end = record->size * record->count;
1888 struct variable *var;
1893 /* Parse variable name length. */
1894 check_overflow (r, record, ofs, 4);
1895 var_name_len = parse_int (r, record->data, ofs);
1898 /* Parse variable name, width, and number of labels. */
1899 check_overflow (r, record, ofs, var_name_len + 8);
1900 var_name = recode_string_pool ("UTF-8", dict_encoding,
1901 (const char *) record->data + ofs,
1902 var_name_len, r->pool);
1903 width = parse_int (r, record->data, ofs + var_name_len);
1904 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1905 ofs += var_name_len + 8;
1907 /* Look up 'var' and validate. */
1908 var = dict_lookup_var (dict, var_name);
1910 sys_warn (r, record->pos + ofs,
1911 _("Ignoring long string value record for "
1912 "unknown variable %s."), var_name);
1913 else if (var_is_numeric (var))
1915 sys_warn (r, record->pos + ofs,
1916 _("Ignoring long string value record for "
1917 "numeric variable %s."), var_name);
1920 else if (width != var_get_width (var))
1922 sys_warn (r, record->pos + ofs,
1923 _("Ignoring long string value record for variable %s "
1924 "because the record's width (%d) does not match the "
1925 "variable's width (%d)."),
1926 var_name, width, var_get_width (var));
1931 value_init_pool (r->pool, &value, width);
1932 for (i = 0; i < n_labels; i++)
1934 size_t value_length, label_length;
1935 bool skip = var == NULL;
1937 /* Parse value length. */
1938 check_overflow (r, record, ofs, 4);
1939 value_length = parse_int (r, record->data, ofs);
1943 check_overflow (r, record, ofs, value_length);
1946 if (value_length == width)
1947 memcpy (value_str_rw (&value, width),
1948 (const uint8_t *) record->data + ofs, width);
1951 sys_warn (r, record->pos + ofs,
1952 _("Ignoring long string value %zu for variable "
1953 "%s, with width %d, that has bad value "
1955 i, var_get_name (var), width, value_length);
1959 ofs += value_length;
1961 /* Parse label length. */
1962 check_overflow (r, record, ofs, 4);
1963 label_length = parse_int (r, record->data, ofs);
1967 check_overflow (r, record, ofs, label_length);
1972 label = recode_string_pool ("UTF-8", dict_encoding,
1973 (const char *) record->data + ofs,
1974 label_length, r->pool);
1975 if (!var_add_value_label (var, &value, label))
1976 sys_warn (r, record->pos + ofs,
1977 _("Duplicate value label for `%.*s' on %s."),
1978 width, value_str (&value, width),
1979 var_get_name (var));
1980 pool_free (r->pool, label);
1982 ofs += label_length;
1989 static void partial_record (struct sfm_reader *r)
1992 static void read_error (struct casereader *, const struct sfm_reader *);
1994 static bool read_case_number (struct sfm_reader *, double *);
1995 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1996 static int read_opcode (struct sfm_reader *);
1997 static bool read_compressed_number (struct sfm_reader *, double *);
1998 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1999 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2000 static bool skip_whole_strings (struct sfm_reader *, size_t);
2002 /* Reads and returns one case from READER's file. Returns a null
2003 pointer if not successful. */
2004 static struct ccase *
2005 sys_file_casereader_read (struct casereader *reader, void *r_)
2007 struct sfm_reader *r = r_;
2008 struct ccase *volatile c;
2014 c = case_create (r->proto);
2015 if (setjmp (r->bail_out))
2017 casereader_force_error (reader);
2022 for (i = 0; i < r->sfm_var_cnt; i++)
2024 struct sfm_var *sv = &r->sfm_vars[i];
2025 union value *v = case_data_rw_idx (c, sv->case_index);
2027 if (sv->var_width == 0)
2029 if (!read_case_number (r, &v->f))
2034 uint8_t *s = value_str_rw (v, sv->var_width);
2035 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2037 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2046 if (r->case_cnt != -1)
2047 read_error (reader, r);
2052 /* Issues an error that R ends in a partial record. */
2054 partial_record (struct sfm_reader *r)
2056 sys_error (r, r->pos, _("File ends in partial case."));
2059 /* Issues an error that an unspecified error occurred SFM, and
2062 read_error (struct casereader *r, const struct sfm_reader *sfm)
2064 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2065 casereader_force_error (r);
2068 /* Reads a number from R and stores its value in *D.
2069 If R is compressed, reads a compressed number;
2070 otherwise, reads a number in the regular way.
2071 Returns true if successful, false if end of file is
2072 reached immediately. */
2074 read_case_number (struct sfm_reader *r, double *d)
2079 if (!try_read_bytes (r, number, sizeof number))
2081 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2085 return read_compressed_number (r, d);
2088 /* Reads LENGTH string bytes from R into S.
2089 Always reads a multiple of 8 bytes; if LENGTH is not a
2090 multiple of 8, then extra bytes are read and discarded without
2092 Reads compressed strings if S is compressed.
2093 Returns true if successful, false if end of file is
2094 reached immediately. */
2096 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2098 size_t whole = ROUND_DOWN (length, 8);
2099 size_t partial = length % 8;
2103 if (!read_whole_strings (r, s, whole))
2110 if (!read_whole_strings (r, bounce, sizeof bounce))
2116 memcpy (s + whole, bounce, partial);
2122 /* Reads and returns the next compression opcode from R. */
2124 read_opcode (struct sfm_reader *r)
2126 assert (r->compressed);
2130 if (r->opcode_idx >= sizeof r->opcodes)
2132 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2136 opcode = r->opcodes[r->opcode_idx++];
2143 /* Reads a compressed number from R and stores its value in D.
2144 Returns true if successful, false if end of file is
2145 reached immediately. */
2147 read_compressed_number (struct sfm_reader *r, double *d)
2149 int opcode = read_opcode (r);
2157 *d = read_float (r);
2161 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2162 if (!r->corruption_warning)
2164 r->corruption_warning = true;
2165 sys_warn (r, r->pos,
2166 _("Possible compressed data corruption: "
2167 "compressed spaces appear in numeric field."));
2176 *d = opcode - r->bias;
2183 /* Reads a compressed 8-byte string segment from R and stores it
2185 Returns true if successful, false if end of file is
2186 reached immediately. */
2188 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2190 int opcode = read_opcode (r);
2198 read_bytes (r, dst, 8);
2202 memset (dst, ' ', 8);
2207 double value = opcode - r->bias;
2208 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2211 /* This has actually been seen "in the wild". The submitter of the
2212 file that showed that the contents decoded as spaces, but they
2213 were at the end of the field so it's possible that the null
2214 bytes just acted as null terminators. */
2216 else if (!r->corruption_warning)
2218 r->corruption_warning = true;
2219 sys_warn (r, r->pos,
2220 _("Possible compressed data corruption: "
2221 "string contains compressed integer (opcode %d)."),
2231 /* Reads LENGTH string bytes from R into S.
2232 LENGTH must be a multiple of 8.
2233 Reads compressed strings if S is compressed.
2234 Returns true if successful, false if end of file is
2235 reached immediately. */
2237 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2239 assert (length % 8 == 0);
2241 return try_read_bytes (r, s, length);
2245 for (ofs = 0; ofs < length; ofs += 8)
2246 if (!read_compressed_string (r, s + ofs))
2256 /* Skips LENGTH string bytes from R.
2257 LENGTH must be a multiple of 8.
2258 (LENGTH is also limited to 1024, but that's only because the
2259 current caller never needs more than that many bytes.)
2260 Returns true if successful, false if end of file is
2261 reached immediately. */
2263 skip_whole_strings (struct sfm_reader *r, size_t length)
2265 uint8_t buffer[1024];
2266 assert (length < sizeof buffer);
2267 return read_whole_strings (r, buffer, length);
2270 /* Helpers for reading records that contain structured text
2273 /* Maximum number of warnings to issue for a single text
2275 #define MAX_TEXT_WARNINGS 5
2280 struct substring buffer; /* Record contents. */
2281 off_t start; /* Starting offset in file. */
2282 size_t pos; /* Current position in buffer. */
2283 int n_warnings; /* Number of warnings issued or suppressed. */
2284 bool recoded; /* Recoded into UTF-8? */
2287 static struct text_record *
2288 open_text_record (struct sfm_reader *r,
2289 const struct sfm_extension_record *record,
2290 bool recode_to_utf8)
2292 struct text_record *text;
2293 struct substring raw;
2295 text = pool_alloc (r->pool, sizeof *text);
2296 raw = ss_buffer (record->data, record->size * record->count);
2297 text->start = record->pos;
2298 text->buffer = (recode_to_utf8
2299 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2302 text->n_warnings = 0;
2303 text->recoded = recode_to_utf8;
2308 /* Closes TEXT, frees its storage, and issues a final warning
2309 about suppressed warnings if necesary. */
2311 close_text_record (struct sfm_reader *r, struct text_record *text)
2313 if (text->n_warnings > MAX_TEXT_WARNINGS)
2314 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2315 text->n_warnings - MAX_TEXT_WARNINGS);
2317 pool_free (r->pool, ss_data (text->buffer));
2320 /* Reads a variable=value pair from TEXT.
2321 Looks up the variable in DICT and stores it into *VAR.
2322 Stores a null-terminated value into *VALUE. */
2324 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2325 struct text_record *text,
2326 struct variable **var, char **value)
2330 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2333 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2337 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2338 ss_buffer ("\t\0", 2));
2346 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2347 struct text_record *text, struct substring delimiters,
2348 struct variable **var)
2352 name = text_get_token (text, delimiters, NULL);
2356 *var = dict_lookup_var (dict, name);
2360 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2367 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2368 struct text_record *text, struct substring delimiters,
2369 struct variable **var)
2371 char *short_name = text_get_token (text, delimiters, NULL);
2372 if (short_name == NULL)
2375 *var = dict_lookup_var (dict, short_name);
2377 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2382 /* Displays a warning for the current file position, limiting the
2383 number to MAX_TEXT_WARNINGS for TEXT. */
2385 text_warn (struct sfm_reader *r, struct text_record *text,
2386 const char *format, ...)
2388 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2392 va_start (args, format);
2393 sys_msg (r, text->start + text->pos, MW, format, args);
2399 text_get_token (struct text_record *text, struct substring delimiters,
2402 struct substring token;
2405 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2408 end = &ss_data (token)[ss_length (token)];
2409 if (delimiter != NULL)
2412 return ss_data (token);
2415 /* Reads a integer value expressed in decimal, then a space, then a string that
2416 consists of exactly as many bytes as specified by the integer, then a space,
2417 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2418 buffer (so the caller should not free the string). */
2420 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2430 int c = text->buffer.string[text->pos];
2431 if (c < '0' || c > '9')
2433 n = (n * 10) + (c - '0');
2436 if (start == text->pos)
2438 sys_warn (r, text->start,
2439 _("Expecting digit at offset %zu in MRSETS record."),
2444 if (!text_match (text, ' '))
2446 sys_warn (r, text->start,
2447 _("Expecting space at offset %zu in MRSETS record."),
2452 if (text->pos + n > text->buffer.length)
2454 sys_warn (r, text->start,
2455 _("%zu-byte string starting at offset %zu "
2456 "exceeds record length %zu."),
2457 n, text->pos, text->buffer.length);
2461 s = &text->buffer.string[text->pos];
2464 sys_warn (r, text->start,
2465 _("Expecting space at offset %zu following %zu-byte string."),
2475 text_match (struct text_record *text, char c)
2477 if (text->buffer.string[text->pos] == c)
2486 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2487 inside the TEXT's string. */
2489 text_pos (const struct text_record *text)
2496 /* Displays a corruption message. */
2498 sys_msg (struct sfm_reader *r, off_t offset,
2499 int class, const char *format, va_list args)
2504 ds_init_empty (&text);
2506 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2507 fh_get_file_name (r->fh), (long long int) offset);
2509 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2510 ds_put_vformat (&text, format, args);
2512 m.category = msg_class_to_category (class);
2513 m.severity = msg_class_to_severity (class);
2519 m.text = ds_cstr (&text);
2524 /* Displays a warning for offset OFFSET in the file. */
2526 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2530 va_start (args, format);
2531 sys_msg (r, offset, MW, format, args);
2535 /* Displays an error for the current file position,
2536 marks it as in an error state,
2537 and aborts reading it using longjmp. */
2539 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2543 va_start (args, format);
2544 sys_msg (r, offset, ME, format, args);
2548 longjmp (r->bail_out, 1);
2551 /* Reads BYTE_CNT bytes into BUF.
2552 Returns true if exactly BYTE_CNT bytes are successfully read.
2553 Aborts if an I/O error or a partial read occurs.
2554 If EOF_IS_OK, then an immediate end-of-file causes false to be
2555 returned; otherwise, immediate end-of-file causes an abort
2558 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2559 void *buf, size_t byte_cnt)
2561 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2562 r->pos += bytes_read;
2563 if (bytes_read == byte_cnt)
2565 else if (ferror (r->file))
2566 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2567 else if (!eof_is_ok || bytes_read != 0)
2568 sys_error (r, r->pos, _("Unexpected end of file."));
2573 /* Reads BYTE_CNT into BUF.
2574 Aborts upon I/O error or if end-of-file is encountered. */
2576 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2578 read_bytes_internal (r, false, buf, byte_cnt);
2581 /* Reads BYTE_CNT bytes into BUF.
2582 Returns true if exactly BYTE_CNT bytes are successfully read.
2583 Returns false if an immediate end-of-file is encountered.
2584 Aborts if an I/O error or a partial read occurs. */
2586 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2588 return read_bytes_internal (r, true, buf, byte_cnt);
2591 /* Reads a 32-bit signed integer from R and returns its value in
2594 read_int (struct sfm_reader *r)
2597 read_bytes (r, integer, sizeof integer);
2598 return integer_get (r->integer_format, integer, sizeof integer);
2601 /* Reads a 64-bit floating-point number from R and returns its
2602 value in host format. */
2604 read_float (struct sfm_reader *r)
2607 read_bytes (r, number, sizeof number);
2608 return float_get_double (r->float_format, number);
2612 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2614 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2618 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2620 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2623 /* Reads exactly SIZE - 1 bytes into BUFFER
2624 and stores a null byte into BUFFER[SIZE - 1]. */
2626 read_string (struct sfm_reader *r, char *buffer, size_t size)
2629 read_bytes (r, buffer, size - 1);
2630 buffer[size - 1] = '\0';
2633 /* Skips BYTES bytes forward in R. */
2635 skip_bytes (struct sfm_reader *r, size_t bytes)
2640 size_t chunk = MIN (sizeof buffer, bytes);
2641 read_bytes (r, buffer, chunk);
2646 static const struct casereader_class sys_file_casereader_class =
2648 sys_file_casereader_read,
2649 sys_file_casereader_destroy,