1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
88 /* Fields from the top-level header record. */
89 struct sfm_header_record
91 char magic[5]; /* First 4 bytes of file, then null. */
92 int weight_idx; /* 0 if unweighted, otherwise a var index. */
93 int nominal_case_size; /* Number of var positions. */
95 /* These correspond to the members of struct sfm_read_info or a dictionary
96 but in the system file's encoding rather than UTF-8. */
97 char creation_date[10]; /* "dd mmm yy". */
98 char creation_time[9]; /* "hh:mm:ss". */
99 char eye_catcher[61]; /* Eye-catcher string, then product name. */
100 char file_label[65]; /* File label. */
103 struct sfm_var_record
110 int missing_value_code;
113 struct variable *var;
116 struct sfm_value_label
122 struct sfm_value_label_record
125 struct sfm_value_label *labels;
132 struct sfm_document_record
139 struct sfm_extension_record
141 off_t pos; /* Starting offset in file. */
142 size_t size; /* Size of data elements. */
143 size_t count; /* Number of data elements. */
144 void *data; /* Contents. */
147 /* System file reader. */
150 /* Resource tracking. */
151 struct pool *pool; /* All system file state. */
152 jmp_buf bail_out; /* longjmp() target for error handling. */
155 struct file_handle *fh; /* File handle. */
156 struct fh_lock *lock; /* Mutual exclusion for file handle. */
157 FILE *file; /* File stream. */
158 off_t pos; /* Position in file. */
159 bool error; /* I/O or corruption error? */
160 struct caseproto *proto; /* Format of output cases. */
163 enum integer_format integer_format; /* On-disk integer format. */
164 enum float_format float_format; /* On-disk floating point format. */
165 struct sfm_var *sfm_vars; /* Variables. */
166 size_t sfm_var_cnt; /* Number of variables. */
167 casenumber case_cnt; /* Number of cases */
168 const char *encoding; /* String encoding. */
171 bool compressed; /* File is compressed? */
172 double bias; /* Compression bias, usually 100.0. */
173 uint8_t opcodes[8]; /* Current block of opcodes. */
174 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
175 bool corruption_warning; /* Warned about possible corruption? */
178 static const struct casereader_class sys_file_casereader_class;
180 static bool close_reader (struct sfm_reader *);
182 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
183 const struct sfm_var_record *,
186 static void sys_msg (struct sfm_reader *r, off_t, int class,
187 const char *format, va_list args)
188 PRINTF_FORMAT (4, 0);
189 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
190 PRINTF_FORMAT (3, 4);
191 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
195 static void read_bytes (struct sfm_reader *, void *, size_t);
196 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
197 static int read_int (struct sfm_reader *);
198 static double read_float (struct sfm_reader *);
199 static void read_string (struct sfm_reader *, char *, size_t);
200 static void skip_bytes (struct sfm_reader *, size_t);
202 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
203 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
205 static void read_variable_record (struct sfm_reader *,
206 struct sfm_var_record *);
207 static void read_value_label_record (struct sfm_reader *,
208 struct sfm_value_label_record *,
210 static struct sfm_document_record *read_document_record (struct sfm_reader *);
211 static struct sfm_extension_record *read_extension_record (
212 struct sfm_reader *, int subtype);
213 static void skip_extension_record (struct sfm_reader *, int subtype);
215 static const char *choose_encoding (
217 const struct sfm_header_record *,
218 const struct sfm_extension_record *ext_integer,
219 const struct sfm_extension_record *ext_encoding);
221 static struct text_record *open_text_record (
222 struct sfm_reader *, const struct sfm_extension_record *,
223 bool recode_to_utf8);
224 static void close_text_record (struct sfm_reader *,
225 struct text_record *);
226 static bool read_variable_to_value_pair (struct sfm_reader *,
228 struct text_record *,
229 struct variable **var, char **value);
230 static void text_warn (struct sfm_reader *r, struct text_record *text,
231 const char *format, ...)
232 PRINTF_FORMAT (3, 4);
233 static char *text_get_token (struct text_record *,
234 struct substring delimiters, char *delimiter);
235 static bool text_match (struct text_record *, char c);
236 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
237 struct text_record *,
238 struct substring delimiters,
240 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
241 struct text_record *,
242 struct substring delimiters,
244 static const char *text_parse_counted_string (struct sfm_reader *,
245 struct text_record *);
246 static size_t text_pos (const struct text_record *);
248 static bool close_reader (struct sfm_reader *r);
250 /* Dictionary reader. */
258 static void read_header (struct sfm_reader *, struct sfm_read_info *,
259 struct sfm_header_record *);
260 static void parse_header (struct sfm_reader *,
261 const struct sfm_header_record *,
262 struct sfm_read_info *, struct dictionary *);
263 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
264 struct sfm_var_record *, size_t n);
265 static void parse_format_spec (struct sfm_reader *, off_t pos,
266 unsigned int format, enum which_format,
267 struct variable *, int *format_warning_cnt);
268 static void parse_document (struct dictionary *, struct sfm_document_record *);
269 static void parse_display_parameters (struct sfm_reader *,
270 const struct sfm_extension_record *,
271 struct dictionary *);
272 static void parse_machine_integer_info (struct sfm_reader *,
273 const struct sfm_extension_record *,
274 struct sfm_read_info *);
275 static void parse_machine_float_info (struct sfm_reader *,
276 const struct sfm_extension_record *);
277 static void parse_mrsets (struct sfm_reader *,
278 const struct sfm_extension_record *,
279 struct dictionary *);
280 static void parse_long_var_name_map (struct sfm_reader *,
281 const struct sfm_extension_record *,
282 struct dictionary *);
283 static void parse_long_string_map (struct sfm_reader *,
284 const struct sfm_extension_record *,
285 struct dictionary *);
286 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
287 const struct sfm_var_record *,
289 const struct sfm_value_label_record *);
290 static void parse_data_file_attributes (struct sfm_reader *,
291 const struct sfm_extension_record *,
292 struct dictionary *);
293 static void parse_variable_attributes (struct sfm_reader *,
294 const struct sfm_extension_record *,
295 struct dictionary *);
296 static void parse_long_string_value_labels (struct sfm_reader *,
297 const struct sfm_extension_record *,
298 struct dictionary *);
300 /* Frees the strings inside INFO: its creation_date, creation_time, and
   product members.  The lines shown here do not free INFO itself. */
302 sfm_read_info_destroy (struct sfm_read_info *info)
306 free (info->creation_date);
307 free (info->creation_time);
308 free (info->product);
312 /* Opens the system file designated by file handle FH for reading. Reads the
313 system file's dictionary into *DICT.
315 Ordinarily the reader attempts to automatically detect the character
316 encoding based on the file's contents. This isn't always possible,
317 especially for files written by old versions of SPSS or PSPP, so specifying
318 a nonnull ENCODING overrides the choice of character encoding.
320 If INFO is non-null, then it receives additional info about the system file,
321 which the caller must eventually free with sfm_read_info_destroy() when it
322 is no longer needed. */
324 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
325 struct dictionary **dictp, struct sfm_read_info *infop)
327 struct sfm_reader *volatile r = NULL;
328 struct sfm_read_info *volatile info;
330 struct sfm_header_record header;
332 struct sfm_var_record *vars;
333 size_t n_vars, allocated_vars;
335 struct sfm_value_label_record *labels;
336 size_t n_labels, allocated_labels;
338 struct sfm_document_record *document;
340 struct sfm_extension_record *extensions[32];
342 struct dictionary *volatile dict = NULL;
345 /* Create and initialize reader. */
346 r = pool_create_container (struct sfm_reader, pool);
352 r->opcode_idx = sizeof r->opcodes;
353 r->corruption_warning = false;
355 info = infop ? infop : xmalloc (sizeof *info);
356 memset (info, 0, sizeof *info);
358 /* TRANSLATORS: this fragment will be interpolated into
359 messages in fh_lock() that identify types of files. */
360 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
364 r->file = fn_open (fh_get_file_name (fh), "rb");
367 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
368 fh_get_file_name (r->fh), strerror (errno));
372 if (setjmp (r->bail_out))
376 read_header (r, info, &header);
379 n_vars = allocated_vars = 0;
382 n_labels = allocated_labels = 0;
386 memset (extensions, 0, sizeof extensions);
396 read_int (r); /* Skip filler. */
403 if (n_vars >= allocated_vars)
404 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
406 read_variable_record (r, &vars[n_vars++]);
410 if (n_labels >= allocated_labels)
411 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
413 read_value_label_record (r, &labels[n_labels++], n_vars);
417 /* A Type 4 record is always immediately after a type 3 record,
418 so the code for type 3 records reads the type 4 record too. */
419 sys_error (r, r->pos, _("Misplaced type 4 record."));
422 if (document != NULL)
423 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
424 document = read_document_record (r);
428 subtype = read_int (r);
429 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
432 _("Unrecognized record type 7, subtype %d. Please "
433 "send a copy of this file, and the syntax which "
434 "created it to %s."),
435 subtype, PACKAGE_BUGREPORT);
436 skip_extension_record (r, subtype);
438 else if (extensions[subtype] != NULL)
441 _("Record type 7, subtype %d found here has the same "
442 "type as the record found near offset 0x%llx. "
443 "Please send a copy of this file, and the syntax "
444 "which created it to %s."),
445 subtype, (long long int) extensions[subtype]->pos,
447 skip_extension_record (r, subtype);
450 extensions[subtype] = read_extension_record (r, subtype);
454 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
459 /* Now actually parse what we read.
461 First, figure out the correct character encoding, because this determines
462 how the rest of the header data is to be interpreted. */
463 dict = dict_create (encoding
465 : choose_encoding (r, &header, extensions[EXT_INTEGER],
466 extensions[EXT_ENCODING]));
467 r->encoding = dict_get_encoding (dict);
469 /* These records don't use variables at all. */
470 if (document != NULL)
471 parse_document (dict, document);
473 if (extensions[EXT_INTEGER] != NULL)
474 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
476 if (extensions[EXT_FLOAT] != NULL)
477 parse_machine_float_info (r, extensions[EXT_FLOAT]);
479 if (extensions[EXT_FILE_ATTRS] != NULL)
480 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
482 parse_header (r, &header, info, dict);
484 /* Parse the variable records, the basis of almost everything else. */
485 parse_variable_records (r, dict, vars, n_vars);
487 /* Parse value labels and the weight variable immediately after the variable
488 records. These records use indexes into var_recs[], so we must parse them
489 before those indexes become invalidated by very long string variables. */
490 for (i = 0; i < n_labels; i++)
491 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
492 if (header.weight_idx != 0)
494 struct variable *weight_var;
496 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
498 if (var_is_numeric (weight_var))
499 dict_set_weight (dict, weight_var);
501 sys_error (r, -1, _("Weighting variable must be numeric "
502 "(not string variable `%s')."),
503 var_get_name (weight_var));
506 if (extensions[EXT_DISPLAY] != NULL)
507 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
509 /* The following records use short names, so they need to be parsed before
510 parse_long_var_name_map() changes short names to long names. */
511 if (extensions[EXT_MRSETS] != NULL)
512 parse_mrsets (r, extensions[EXT_MRSETS], dict);
514 if (extensions[EXT_MRSETS2] != NULL)
515 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
517 if (extensions[EXT_LONG_STRINGS] != NULL)
518 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
520 /* Now rename variables to their long names. */
521 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
523 /* The following records use long names, so they need to follow renaming. */
524 if (extensions[EXT_VAR_ATTRS] != NULL)
525 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
527 if (extensions[EXT_LONG_LABELS] != NULL)
528 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
530 /* Warn if the actual amount of data per case differs from the
531 amount that the header claims. SPSS version 13 gets this
532 wrong when very long strings are involved, so don't warn in
533 that case. */
534 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
535 && info->version_major != 13)
536 sys_warn (r, -1, _("File header claims %d variable positions but "
537 "%zu were read from file."),
538 header.nominal_case_size, n_vars);
540 /* Create an index of dictionary variable widths for
541 sfm_read_case to use. We cannot use the `struct variable's
542 from the dictionary we created, because the caller owns the
543 dictionary and may destroy or modify its variables. */
544 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
545 pool_register (r->pool, free, r->sfm_vars);
546 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
550 sfm_read_info_destroy (info);
552 return casereader_create_sequential
554 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
555 &sys_file_casereader_class, r);
558 sfm_read_info_destroy (info);
565 /* Closes a system file after we're done with it.
566 Returns true if an I/O error has occurred on READER, false
567 otherwise. */
569 close_reader (struct sfm_reader *r)
578 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
580 msg (ME, _("Error closing system file `%s': %s."),
581 fh_get_file_name (r->fh), strerror (errno));
591 pool_destroy (r->pool);
596 /* Destroys READER.  READER itself is not used here (it is marked UNUSED);
   R_ is the opaque pointer that is converted back to the
   struct sfm_reader it refers to. */
598 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
600 struct sfm_reader *r = r_;
604 /* Returns true if FILE is an SPSS system file,
605 false otherwise. */
607 sfm_detect (FILE *file)
/* Read the 4 bytes from FILE that are compared against the magic
   strings below. */
611 if (fread (magic, 4, 1, file) != 1)
/* True if MAGIC equals either the ASCII or the EBCDIC magic string. */
615 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
618 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
619 except for the string fields in *INFO, which parse_header() will initialize
620 later once the file's encoding is known.

   Along the way this also sets the reader's own integer_format,
   float_format, compressed, case_cnt, and bias members. */
622 read_header (struct sfm_reader *r, struct sfm_read_info *info,
623 struct sfm_header_record *header)
625 uint8_t raw_layout_code[4];
628 read_string (r, header->magic, sizeof header->magic);
629 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
/* The magic must match either the ASCII or the EBCDIC magic string. */
631 if (strcmp (ASCII_MAGIC, header->magic)
632 && strcmp (EBCDIC_MAGIC, header->magic))
633 sys_error (r, 0, _("This is not an SPSS system file."));
635 /* Identify integer format. */
636 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
637 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
639 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
641 || (r->integer_format != INTEGER_MSB_FIRST
642 && r->integer_format != INTEGER_LSB_FIRST))
643 sys_error (r, 64, _("This is not an SPSS system file."));
/* A negative or very large (> INT_MAX / 16) nominal case size is
   replaced by -1. */
645 header->nominal_case_size = read_int (r);
646 if (header->nominal_case_size < 0
647 || header->nominal_case_size > INT_MAX / 16)
648 header->nominal_case_size = -1;
/* Any nonzero value marks the file as compressed. */
650 r->compressed = read_int (r) != 0;
652 header->weight_idx = read_int (r);
654 r->case_cnt = read_int (r);
655 if ( r->case_cnt > INT_MAX / 2)
658 /* Identify floating-point format and obtain compression bias. */
659 read_bytes (r, raw_bias, sizeof raw_bias);
660 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
662 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
664 if (memcmp (raw_bias, zero_bias, 8))
665 sys_warn (r, r->pos - 8,
666 _("Compression bias is not the usual "
667 "value of 100, or system file uses unrecognized "
668 "floating-point format."));
671 /* Some software is known to write all-zeros to this
672 field. Such software also writes floating-point
673 numbers in the format that we expect by default
674 (it seems that all software most likely does, in
675 reality), so don't warn in this case. */
/* Choose an IEEE double format according to the integer byte order. */
678 if (r->integer_format == INTEGER_MSB_FIRST)
679 r->float_format = FLOAT_IEEE_DOUBLE_BE;
681 r->float_format = FLOAT_IEEE_DOUBLE_LE;
/* Convert the raw bias bytes into a native double in r->bias. */
683 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
685 read_string (r, header->creation_date, sizeof header->creation_date);
686 read_string (r, header->creation_time, sizeof header->creation_time);
687 read_string (r, header->file_label, sizeof header->file_label);
/* Copy the non-string results into *INFO. */
690 info->integer_format = r->integer_format;
691 info->float_format = r->float_format;
692 info->compressed = r->compressed;
693 info->case_cnt = r->case_cnt;
696 /* Reads a variable (type 2) record from R into RECORD.

   Only raw field values are stored in RECORD.  The label indicator and the
   missing value code are validated here, with sys_error() on bad values. */
698 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
700 int has_variable_label;
702 memset (record, 0, sizeof *record);
/* Fixed-size fields, in the order they are read from the file. */
704 record->pos = r->pos;
705 record->width = read_int (r);
706 has_variable_label = read_int (r);
707 record->missing_value_code = read_int (r);
708 record->print_format = read_int (r);
709 record->write_format = read_int (r);
710 read_bytes (r, record->name, sizeof record->name);
712 if (has_variable_label == 1)
714 enum { MAX_LABEL_LEN = 255 };
715 size_t len, read_len;
719 /* Read up to MAX_LABEL_LEN bytes of label. */
720 read_len = MIN (MAX_LABEL_LEN, len);
721 record->label = pool_malloc (r->pool, read_len + 1);
722 read_string (r, record->label, read_len + 1);
724 /* Skip unread label bytes. */
725 skip_bytes (r, len - read_len);
727 /* Skip label padding up to multiple of 4 bytes. */
728 skip_bytes (r, ROUND_UP (len, 4) - len);
730 else if (has_variable_label != 0)
731 sys_error (r, record->pos,
732 _("Variable label indicator field is not 0 or 1."));
734 /* Validate the missing value code and read the raw missing values. */
735 if (record->missing_value_code != 0)
737 int code = record->missing_value_code;
/* Width 0 is checked against the numeric codes, any other width against
   the string codes (see the messages below). */
738 if (record->width == 0)
740 if (code < -3 || code > 3 || code == -1)
741 sys_error (r, record->pos,
742 _("Numeric missing value indicator field is not "
743 "-3, -2, 0, 1, 2, or 3."));
747 if (code < 1 || code > 3)
748 sys_error (r, record->pos,
749 _("String missing value indicator field is not "
/* 8 bytes are read per missing value; abs() because the numeric codes
   -2 and -3 are negative. */
753 read_bytes (r, record->missing, 8 * abs (code));
757 /* Reads value labels from R into RECORD. */
759 read_value_label_record (struct sfm_reader *r,
760 struct sfm_value_label_record *record,
765 /* Read type 3 record. */
766 record->pos = r->pos;
767 record->n_labels = read_int (r);
768 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
769 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
771 record->labels = pool_nmalloc (r->pool, record->n_labels,
772 sizeof *record->labels);
773 for (i = 0; i < record->n_labels; i++)
775 struct sfm_value_label *label = &record->labels[i];
776 unsigned char label_len;
779 read_bytes (r, label->value, sizeof label->value);
781 /* Read label length. */
782 read_bytes (r, &label_len, sizeof label_len);
783 padded_len = ROUND_UP (label_len + 1, 8);
785 /* Read label, padding. */
786 label->label = pool_malloc (r->pool, padded_len + 1);
787 read_bytes (r, label->label, padded_len - 1);
788 label->label[label_len] = '\0';
791 /* Read record type of type 4 record. */
792 if (read_int (r) != 4)
793 sys_error (r, r->pos - 4,
794 _("Variable index record (type 4) does not immediately "
795 "follow value label record (type 3) as it should."));
797 /* Read number of variables associated with value label from type 4
798 record. */
799 record->n_vars = read_int (r);
800 if (record->n_vars < 1 || record->n_vars > n_vars)
801 sys_error (r, r->pos - 4,
802 _("Number of variables associated with a value label (%zu) "
803 "is not between 1 and the number of variables (%zu)."),
804 record->n_vars, n_vars);
805 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
806 for (i = 0; i < record->n_vars; i++)
807 record->vars[i] = read_int (r);
810 /* Reads a document record from R and returns it.

   The record and its text buffer are allocated from R's pool.  The text is
   stored as DOC_LINE_LENGTH * n_lines raw bytes; it is not recoded here. */
811 static struct sfm_document_record *
812 read_document_record (struct sfm_reader *r)
814 struct sfm_document_record *record;
817 record = pool_malloc (r->pool, sizeof *record);
818 record->pos = r->pos;
820 n_lines = read_int (r);
/* Bounding n_lines this way keeps DOC_LINE_LENGTH * n_lines, used for the
   allocation and the read below, under INT_MAX. */
821 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
822 sys_error (r, record->pos,
823 _("Number of document lines (%d) "
824 "must be greater than 0 and less than %d."),
825 n_lines, INT_MAX / DOC_LINE_LENGTH);
827 record->n_lines = n_lines;
828 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
829 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
/* Reads the size and count fields of an extension (type 7) record from R
   into RECORD, and stores the position at which they start in RECORD->pos.
   SUBTYPE is used only in the error message.  The record's data is not
   read here. */
835 read_extension_record_header (struct sfm_reader *r, int subtype,
836 struct sfm_extension_record *record)
838 record->pos = r->pos;
839 record->size = read_int (r);
840 record->count = read_int (r);
842 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
843 allows an extra byte for a null terminator, used by some
844 extension processing routines. */
845 if (record->size != 0
846 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
/* NOTE(review): unlike the other messages in this file, this one is not
   wrapped in _() for translation -- confirm whether that is intended. */
847 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
850 /* Reads an extension (type 7) record with the given SUBTYPE from R.

   A struct sfm_extension_record is allocated from R's pool and its header
   is read with read_extension_record_header().  SUBTYPE is then looked up
   in the table below.  For a known subtype with the expected size and
   count, the data is read into a pool-allocated buffer with one extra
   null byte at the end.  Otherwise a warning is issued where applicable
   and the data bytes are skipped. */
851 static struct sfm_extension_record *
852 read_extension_record (struct sfm_reader *r, int subtype)
854 struct extension_record_type
/* Table entries are presumably { subtype, size, count } -- TODO confirm
   against the struct definition.  In the loop below a size or count of 0
   is not checked, and an entry with both 0 marks a record to ignore. */
861 static const struct extension_record_type types[] =
863 /* Implemented record types. */
864 { EXT_INTEGER, 4, 8 },
866 { EXT_MRSETS, 1, 0 },
867 { EXT_DISPLAY, 4, 0 },
868 { EXT_LONG_NAMES, 1, 0 },
869 { EXT_LONG_STRINGS, 1, 0 },
870 { EXT_NCASES, 8, 2 },
871 { EXT_FILE_ATTRS, 1, 0 },
872 { EXT_VAR_ATTRS, 1, 0 },
873 { EXT_MRSETS2, 1, 0 },
874 { EXT_ENCODING, 1, 0 },
875 { EXT_LONG_LABELS, 1, 0 },
877 /* Ignored record types. */
878 { EXT_VAR_SETS, 0, 0 },
880 { EXT_DATA_ENTRY, 0, 0 },
883 const struct extension_record_type *type;
884 struct sfm_extension_record *record;
887 record = pool_malloc (r->pool, sizeof *record);
888 read_extension_record_header (r, subtype, record);
889 n_bytes = record->count * record->size;
891 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
892 if (subtype == type->subtype)
894 if (type->size > 0 && record->size != type->size)
895 sys_warn (r, record->pos,
896 _("Record type 7, subtype %d has bad size %zu "
897 "(expected %d)."), subtype, record->size, type->size);
898 else if (type->count > 0 && record->count != type->count)
899 sys_warn (r, record->pos,
900 _("Record type 7, subtype %d has bad count %zu "
901 "(expected %d)."), subtype, record->count, type->count);
902 else if (type->count == 0 && type->size == 0)
904 /* Ignore this record. */
/* One extra byte so that the data can be null-terminated. */
908 char *data = pool_malloc (r->pool, n_bytes + 1);
909 data[n_bytes] = '\0';
912 read_bytes (r, record->data, n_bytes);
919 sys_warn (r, record->pos,
920 _("Unrecognized record type 7, subtype %d. Please send a "
921 "copy of this file, and the syntax which created it to %s."),
922 subtype, PACKAGE_BUGREPORT);
/* The data of a record that was not read above is skipped. */
925 skip_bytes (r, n_bytes);
/* Reads the header of an extension record with the given SUBTYPE from R
   into a local record, then skips the count * size data bytes that the
   header announces.  Nothing is retained. */
930 skip_extension_record (struct sfm_reader *r, int subtype)
932 struct sfm_extension_record record;
934 read_extension_record_header (r, subtype, &record);
935 skip_bytes (r, record.count * record.size);
/* Converts the string fields of HEADER from DICT's encoding to UTF-8.
   The file label is stored in DICT.  The creation date, creation time and
   product name are stored in INFO as newly allocated strings (the same
   members that sfm_read_info_destroy() frees). */
939 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
940 struct sfm_read_info *info, struct dictionary *dict)
942 const char *dict_encoding = dict_get_encoding (dict);
943 struct substring product;
944 struct substring label;
946 /* Convert file label to UTF-8 and put it into DICT. */
947 label = recode_substring_pool ("UTF-8", dict_encoding,
948 ss_cstr (header->file_label), r->pool);
949 ss_trim (&label, ss_cstr (" "));
/* Terminate the trimmed label so it can be passed as a C string. */
950 label.string[label.length] = '\0';
951 dict_set_label (dict, label.string);
953 /* Put creation date and time in UTF-8 into INFO. */
954 info->creation_date = recode_string ("UTF-8", dict_encoding,
955 header->creation_date, -1);
956 info->creation_time = recode_string ("UTF-8", dict_encoding,
957 header->creation_time, -1);
959 /* Put product name into INFO, dropping eye-catcher string if present. */
960 product = recode_substring_pool ("UTF-8", dict_encoding,
961 ss_cstr (header->eye_catcher), r->pool);
962 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
963 ss_trim (&product, ss_cstr (" "));
964 info->product = ss_xstrdup (product);
967 /* Parses the N_VAR_RECS variable (type 2) records in VAR_RECS and adds
968 the corresponding variables to DICT.
969 Also skips past additional variable records for long string
970 variables. */
972 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
973 struct sfm_var_record *var_recs, size_t n_var_recs)
975 const char *dict_encoding = dict_get_encoding (dict);
976 struct sfm_var_record *rec;
979 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
981 struct variable *var;
/* Recode the 8-byte raw name to UTF-8 and cut it at the first space. */
986 name = recode_string_pool ("UTF-8", dict_encoding,
987 rec->name, 8, r->pool);
988 name[strcspn (name, " ")] = '\0';
/* Names starting with '$' or '#' are rejected as well. */
990 if (!dict_id_is_valid (dict, name, false)
991 || name[0] == '$' || name[0] == '#')
992 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
994 if (rec->width < 0 || rec->width > 255)
995 sys_error (r, rec->pos,
996 _("Bad width %d for variable %s."), rec->width, name);
998 var = rec->var = dict_create_var (dict, name, rec->width);
1000 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1002 /* Set the short name the same as the long name. */
1003 var_set_short_name (var, 0, name);
1005 /* Get variable label, if any. */
1010 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1011 rec->label, -1, r->pool);
1012 var_set_label (var, utf8_label, false);
1015 /* Set missing values. */
1016 if (rec->missing_value_code != 0)
1018 int width = var_get_width (var);
1019 struct missing_values mv;
1021 mv_init_pool (r->pool, &mv, width);
1022 if (var_is_numeric (var))
/* A negative code means a range is present.  With a range, the
   comparison with -3 yields 1 discrete value for -3 and 0 for -2. */
1024 bool has_range = rec->missing_value_code < 0;
1025 int n_discrete = (has_range
1026 ? rec->missing_value_code == -3
1027 : rec->missing_value_code);
1032 double low = parse_float (r, rec->missing, 0);
1033 double high = parse_float (r, rec->missing, 8);
1034 mv_add_range (&mv, low, high);
1038 for (i = 0; i < n_discrete; i++)
1040 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1048 value_init_pool (r->pool, &value, width);
1049 value_set_missing (&value, width);
1050 for (i = 0; i < rec->missing_value_code; i++)
1052 uint8_t *s = value_str_rw (&value, width);
/* Each string missing value occupies 8 bytes in rec->missing; at most
   WIDTH of them are copied. */
1053 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1054 mv_add_str (&mv, s);
1057 var_set_missing_values (var, &mv);
/* rec->pos + 12 and rec->pos + 16 are the offsets passed on for use in
   warnings about the print and write format fields, respectively. */
1061 parse_format_spec (r, rec->pos + 12, rec->print_format,
1062 PRINT_FORMAT, var, &n_warnings);
1063 parse_format_spec (r, rec->pos + 16, rec->write_format,
1064 WRITE_FORMAT, var, &n_warnings);
1066 /* Account for values.
1067 Skip long string continuation records, if any.
   Width 0 counts as one value; otherwise one value per 8 bytes of width,
   rounded up.  Every value after the first must be a record whose
   width is -1. */
1068 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1069 for (i = 1; i < n_values; i++)
1070 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1071 sys_error (r, rec->pos, _("Missing string continuation record."));
1076 /* Translates the format spec from sysfile format to internal
1077 format. */
1079 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1080 enum which_format which, struct variable *v,
1083 const int max_warnings = 8;
/* FORMAT is unpacked bytewise: bits 16-23 hold the raw type and bits 8-15
   hold the width. */
1084 uint8_t raw_type = format >> 16;
1085 uint8_t w = format >> 8;
/* The format is accepted only if the raw type is known, the spec is a
   valid output format, and it is compatible with V's width. */
1094 ok = (fmt_from_io (raw_type, &f.type)
1095 && fmt_check_output (&f)
1096 && fmt_check_width_compat (&f, var_get_width (v)));
1101 if (which == PRINT_FORMAT)
1102 var_set_print_format (v, &f);
1104 var_set_write_format (v, &f);
1106 else if (format == 0)
1108 /* Actually observed in the wild. No point in warning about it. */
/* The counter is incremented for every invalid format, but a warning is
   issued only for the first max_warnings of them. */
1110 else if (++*n_warnings <= max_warnings)
1112 if (which == PRINT_FORMAT)
1113 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1115 var_get_name (v), var_get_width (v), format);
1117 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1119 var_get_name (v), var_get_width (v), format);
1121 if (*n_warnings == max_warnings)
1122 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
/* Adds the document lines in RECORD to DICT.  RECORD->documents is walked
   in steps of DOC_LINE_LENGTH bytes, RECORD->n_lines times.  Each line is
   recoded from DICT's encoding to UTF-8 and stripped of trailing spaces
   before it is added. */
1127 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1131 for (p = record->documents;
1132 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1133 p += DOC_LINE_LENGTH)
1135 struct substring line;
1137 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1138 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1139 ss_rtrim (&line, ss_cstr (" "));
/* Terminate the trimmed line so it can be passed as a C string. */
1140 line.string[line.length] = '\0';
1142 dict_add_document_line (dict, line.string, false);
1148 /* Parses record type 7, subtype 3. */
/* Stores the three version numbers from RECORD into INFO, then compares the
   floating-point and integer representation codes in RECORD against the
   formats already recorded in R.  A floating-point mismatch is reported
   with sys_error; an integer mismatch only draws a warning. */
1150 parse_machine_integer_info (struct sfm_reader *r,
1151 const struct sfm_extension_record *record,
1152 struct sfm_read_info *info)
1154 int float_representation, expected_float_format;
1155 int integer_representation, expected_integer_format;
1157 /* Save version info. */
1158 info->version_major = parse_int (r, record->data, 0);
1159 info->version_minor = parse_int (r, record->data, 4);
1160 info->version_revision = parse_int (r, record->data, 8);
1162 /* Check floating point format. */
1163 float_representation = parse_int (r, record->data, 16);
/* Codes used by the record: 1 for IEEE double, 2 for FLOAT_Z_LONG, 3 for
   either VAX format. */
1164 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1165 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1166 expected_float_format = 1;
1167 else if (r->float_format == FLOAT_Z_LONG)
1168 expected_float_format = 2;
1169 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1170 expected_float_format = 3;
1173 if (float_representation != expected_float_format)
1174 sys_error (r, record->pos, _("Floating-point representation indicated by "
1175 "system file (%d) differs from expected (%d)."),
1176 float_representation, expected_float_format);
1178 /* Check integer format. */
1179 integer_representation = parse_int (r, record->data, 24);
/* Codes used by the record: 1 for most significant byte first, 2 for least
   significant byte first. */
1180 if (r->integer_format == INTEGER_MSB_FIRST)
1181 expected_integer_format = 1;
1182 else if (r->integer_format == INTEGER_LSB_FIRST)
1183 expected_integer_format = 2;
1186 if (integer_representation != expected_integer_format)
1187 sys_warn (r, record->pos,
1188 _("Integer format indicated by system file (%d) "
1189 "differs from expected (%d)."),
1190 integer_representation, expected_integer_format);
/* Chooses the character encoding to use for R's dictionary.  The sources
   consulted, in the order they appear below, are the EXT_ENCODING record,
   the code page number in the EXT_INTEGER record, an EBCDIC magic number
   in HEADER, and finally locale_charset (). */
1195 choose_encoding (struct sfm_reader *r,
1196 const struct sfm_header_record *header,
1197 const struct sfm_extension_record *ext_integer,
1198 const struct sfm_extension_record *ext_encoding)
1200 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1203 return ext_encoding->data;
1205 /* But EXT_INTEGER is better than nothing as a fallback. */
/* The code page is the eighth 32-bit field of the record (byte offset
   28). */
1208 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1209 const char *encoding;
1218 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1219 respectively. However, there are known to be many files in the wild
1220 with character code 2, yet have data which are clearly not ASCII.
1221 Therefore we ignore these values. */
1228 encoding = sys_get_encoding_from_codepage (codepage);
1229 if (encoding != NULL)
1235 /* If the file magic number is EBCDIC then its character data is too. */
1236 if (!strcmp (header->magic, EBCDIC_MAGIC))
1239 return locale_charset ();
1242 /* Parses record type 7, subtype 4. */
/* Reads three floating-point values from RECORD, at byte offsets 0, 8, and
   16, which the file uses for system-missing, highest, and lowest.  Warns
   about each one that differs from this program's SYSMIS, HIGHEST, or
   LOWEST. */
1244 parse_machine_float_info (struct sfm_reader *r,
1245 const struct sfm_extension_record *record)
1247 double sysmis = parse_float (r, record->data, 0);
1248 double highest = parse_float (r, record->data, 8);
1249 double lowest = parse_float (r, record->data, 16);
1251 if (sysmis != SYSMIS)
1252 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1255 if (highest != HIGHEST)
1256 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1257 highest, "HIGHEST");
1259 if (lowest != LOWEST)
1260 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1264 /* Parses record type 7, subtype 7 or 19. */
/* Each multiple response set described by RECORD that passes validation is
   added to DICT.  Malformed input is reported with sys_warn.  The record
   text is opened without recoding; the set name, label, and variable names
   are recoded into UTF-8 individually as they are extracted. */
1266 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1267 struct dictionary *dict)
1269 struct text_record *text;
1270 struct mrset *mrset;
1272 text = open_text_record (r, record, false);
1275 const char *counted = NULL;
1278 struct stringi_set var_names;
1279 size_t allocated_vars;
1283 mrset = xzalloc (sizeof *mrset);
1285 name = text_get_token (text, ss_cstr ("="), NULL);
1288 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1290 if (mrset->name[0] != '$')
1292 sys_warn (r, record->pos,
1293 _("`%s' does not begin with `$' at offset %zu "
1294 "in MRSETS record."), mrset->name, text_pos (text));
/* The next character selects the kind of set: C yields MRSET_MC, while D
   and E both yield MRSET_MD and differ in cat_source. */
1298 if (text_match (text, 'C'))
1300 mrset->type = MRSET_MC;
1301 if (!text_match (text, ' '))
1303 sys_warn (r, record->pos,
1304 _("Missing space following `%c' at offset %zu "
1305 "in MRSETS record."), 'C', text_pos (text));
1309 else if (text_match (text, 'D'))
1311 mrset->type = MRSET_MD;
1312 mrset->cat_source = MRSET_VARLABELS;
1314 else if (text_match (text, 'E'))
1318 mrset->type = MRSET_MD;
1319 mrset->cat_source = MRSET_COUNTEDVALUES;
1320 if (!text_match (text, ' '))
1322 sys_warn (r, record->pos,
1323 _("Missing space following `%c' at offset %zu "
1324 "in MRSETS record."), 'E', text_pos (text));
/* After E comes a label source token: 11 turns on label_from_var_label, 1
   is accepted silently, and anything else draws a warning. */
1328 number = text_get_token (text, ss_cstr (" "), NULL);
1329 if (!strcmp (number, "11"))
1330 mrset->label_from_var_label = true;
1331 else if (strcmp (number, "1"))
1332 sys_warn (r, record->pos,
1333 _("Unexpected label source value `%s' following `E' "
1334 "at offset %zu in MRSETS record."),
1335 number, text_pos (text));
1339 sys_warn (r, record->pos,
1340 _("Missing `C', `D', or `E' at offset %zu "
1341 "in MRSETS record."),
/* Only MRSET_MD sets carry a counted value. */
1346 if (mrset->type == MRSET_MD)
1348 counted = text_parse_counted_string (r, text);
1349 if (counted == NULL)
1353 label = text_parse_counted_string (r, text);
1356 if (label[0] != '\0')
1357 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
/* VAR_NAMES is used to detect a variable that is listed more than once. */
1359 stringi_set_init (&var_names);
1364 const char *raw_var_name;
1365 struct variable *var;
1368 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1369 if (raw_var_name == NULL)
1371 sys_warn (r, record->pos,
1372 _("Missing new-line parsing variable names "
1373 "at offset %zu in MRSETS record."),
1377 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1379 var = dict_lookup_var (dict, var_name);
1385 if (!stringi_set_insert (&var_names, var_name))
1387 sys_warn (r, record->pos,
1388 _("Duplicate variable name %s "
1389 "at offset %zu in MRSETS record."),
1390 var_name, text_pos (text));
1396 if (mrset->label == NULL && mrset->label_from_var_label
1397 && var_has_label (var))
1398 mrset->label = xstrdup (var_get_label (var));
1401 && var_get_type (var) != var_get_type (mrset->vars[0]))
1403 sys_warn (r, record->pos,
1404 _("MRSET %s contains both string and "
1405 "numeric variables."), name);
/* WIDTH tracks the narrowest variable seen so far. */
1408 width = MIN (width, var_get_width (var));
1410 if (mrset->n_vars >= allocated_vars)
1411 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1412 sizeof *mrset->vars);
1413 mrset->vars[mrset->n_vars++] = var;
/* The variable list for one set ends at a new-line delimiter. */
1415 while (delimiter != '\n');
/* A set with fewer than two variables is discarded with a warning. */
1417 if (mrset->n_vars < 2)
1419 sys_warn (r, record->pos,
1420 _("MRSET %s has only %zu variables."), mrset->name,
1422 mrset_destroy (mrset);
1423 stringi_set_destroy (&var_names);
1427 if (mrset->type == MRSET_MD)
1429 mrset->width = width;
1430 value_init (&mrset->counted, width);
1432 mrset->counted.f = strtod (counted, NULL);
1434 value_copy_str_rpad (&mrset->counted, width,
1435 (const uint8_t *) counted, ' ');
1438 dict_add_mrset (dict, mrset);
1440 stringi_set_destroy (&var_names);
1442 mrset_destroy (mrset);
1443 close_text_record (r, text);
1446 /* Read record type 7, subtype 11, which specifies how variables
1447 should be displayed in GUI environments. */
/* RECORD must hold either three or two integers for each variable in DICT;
   any other count draws a warning.  The integers for a variable are read
   in the order measure, width (only in the three-integer layout), and
   alignment. */
1449 parse_display_parameters (struct sfm_reader *r,
1450 const struct sfm_extension_record *record,
1451 struct dictionary *dict)
1453 bool includes_width;
1454 bool warned = false;
1459 n_vars = dict_get_var_cnt (dict);
1460 if (record->count == 3 * n_vars)
1461 includes_width = true;
1462 else if (record->count == 2 * n_vars)
1463 includes_width = false;
1466 sys_warn (r, record->pos,
1467 _("Extension 11 has bad count %zu (for %zu variables)."),
1468 record->count, n_vars);
1473 for (i = 0; i < n_vars; ++i)
1475 struct variable *v = dict_get_var (dict, i);
1476 int measure, width, align;
1478 measure = parse_int (r, record->data, ofs);
1483 width = parse_int (r, record->data, ofs);
1489 align = parse_int (r, record->data, ofs);
1492 /* SPSS 14 sometimes seems to set string variables' measure
1494 if (0 == measure && var_is_alpha (v))
1497 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1500 sys_warn (r, record->pos,
1501 _("Invalid variable display parameters for variable "
1502 "%zu (%s). Default parameters substituted."),
1503 i, var_get_name (v));
1508 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1509 : measure == 2 ? MEASURE_ORDINAL
1511 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1512 : align == 1 ? ALIGN_RIGHT
1515 /* Older versions (SPSS 9.0) sometimes set the display
1516 width to zero. This causes confusion in the GUI, so
1517 only set the width if it is nonzero. */
1519 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping VAR's short names.  The
   short names are copied before the rename and set again afterward; the
   copies are freed as they are restored. */
1524 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1525 const char *new_name)
1527 size_t n_short_names;
1531 /* Renaming a variable may clear its short names, but we
1532 want to retain them, so we save them and re-set them
1534 n_short_names = var_get_short_name_cnt (var);
1535 short_names = xnmalloc (n_short_names, sizeof *short_names);
1536 for (i = 0; i < n_short_names; i++)
1538 const char *s = var_get_short_name (var, i);
1539 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1542 /* Set long name. */
1543 dict_rename_var (dict, var, new_name);
1545 /* Restore short names. */
1546 for (i = 0; i < n_short_names; i++)
1548 var_set_short_name (var, i, short_names[i]);
1549 free (short_names[i]);
1554 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1555 to each short name. Modifies variable names in DICT accordingly. */
/* The record text is recoded into UTF-8 and read as a series of
   short-name=long-name pairs.  Invalid and duplicate long names are
   reported with a warning.  Every rename goes through
   rename_var_and_save_short_names so that short names survive. */
1557 parse_long_var_name_map (struct sfm_reader *r,
1558 const struct sfm_extension_record *record,
1559 struct dictionary *dict)
1561 struct text_record *text;
1562 struct variable *var;
1567 /* Convert variable names to lowercase. */
1570 for (i = 0; i < dict_get_var_cnt (dict); i++)
1572 struct variable *var = dict_get_var (dict, i);
1575 new_name = xstrdup (var_get_name (var));
1576 str_lowercase (new_name);
1578 rename_var_and_save_short_names (dict, var, new_name);
1586 /* Rename each of the variables, one by one. (In a correctly constructed
1587 system file, this cannot create any intermediate duplicate variable names,
1588 because all of the new variable names are longer than any of the old
1589 variable names and thus there cannot be any overlaps.) */
1590 text = open_text_record (r, record, true);
1591 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1593 /* Validate long name. */
1594 if (!dict_id_is_valid (dict, long_name, false))
1596 sys_warn (r, record->pos,
1597 _("Long variable mapping from %s to invalid "
1598 "variable name `%s'."),
1599 var_get_name (var), long_name);
1603 /* Identify any duplicates. */
/* A long name that equals VAR's own first short name, ignoring case, is not
   counted as a duplicate even though the lookup would find it. */
1604 if (strcasecmp (var_get_short_name (var, 0), long_name)
1605 && dict_lookup_var (dict, long_name) != NULL)
1607 sys_warn (r, record->pos,
1608 _("Duplicate long variable name `%s'."), long_name);
1612 rename_var_and_save_short_names (dict, var, long_name);
1614 close_text_record (r, text);
1617 /* Reads record type 7, subtype 14, which gives the real length
1618 of each very long string. Rearranges DICT accordingly. */
/* The record text is read as a series of variable=length pairs.  For each
   pair, the segment variables that follow VAR in DICT are checked and then
   deleted, and VAR is widened to the full length. */
1620 parse_long_string_map (struct sfm_reader *r,
1621 const struct sfm_extension_record *record,
1622 struct dictionary *dict)
1624 struct text_record *text;
1625 struct variable *var;
1628 text = open_text_record (r, record, true);
1629 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1631 size_t idx = var_get_dict_index (var);
/* LENGTH_S is parsed as a decimal integer; a result outside 1...MAX_STRING
   is reported with a warning. */
1637 length = strtol (length_s, NULL, 10);
1638 if (length < 1 || length > MAX_STRING)
1640 sys_warn (r, record->pos,
1641 _("%s listed as string of invalid length %s "
1642 "in very long string record."),
1643 var_get_name (var), length_s);
1647 /* Check segments. */
1648 segment_cnt = sfm_width_to_segments (length);
/* A width that fits in a single segment does not belong in this record. */
1649 if (segment_cnt == 1)
1651 sys_warn (r, record->pos,
1652 _("%s listed in very long string record with width %s, "
1653 "which requires only one segment."),
1654 var_get_name (var), length_s);
/* All of the segments, counting from VAR itself, must exist in DICT. */
1657 if (idx + segment_cnt > dict_get_var_cnt (dict))
1658 sys_error (r, record->pos,
1659 _("Very long string %s overflows dictionary."),
1660 var_get_name (var));
1662 /* Get the short names from the segments and check their
1664 for (i = 0; i < segment_cnt; i++)
1666 struct variable *seg = dict_get_var (dict, idx + i);
1667 int alloc_width = sfm_segment_alloc_width (length, i);
1668 int width = var_get_width (seg);
1671 var_set_short_name (var, i, var_get_short_name (seg, 0));
1672 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1673 sys_error (r, record->pos,
1674 _("Very long string with width %ld has segment %d "
1675 "of width %d (expected %d)."),
1676 length, i, width, alloc_width);
1678 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1679 var_set_width (var, length);
1681 close_text_record (r, text);
1682 dict_compact_values (dict);
// Attaches the value labels in RECORD to every variable that RECORD lists.
// The labels are recoded into UTF-8 once, up front, and the same recoded
// strings are used for each variable.  Variables are found through their
// indexes into VAR_RECS.
1686 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1687 const struct sfm_var_record *var_recs, size_t n_var_recs,
1688 const struct sfm_value_label_record *record)
1690 struct variable **vars;
1694 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1695 for (i = 0; i < record->n_labels; i++)
1696 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1697 record->labels[i].label, -1,
1700 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1701 for (i = 0; i < record->n_vars; i++)
1702 vars[i] = lookup_var_by_index (r, record->pos,
1703 var_recs, n_var_recs, record->vars[i]);
// Every listed variable must have the same type as the first one.
1705 for (i = 1; i < record->n_vars; i++)
1706 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1707 sys_error (r, record->pos,
1708 _("Variables associated with value label are not all of "
1709 "identical type. Variable %s is %s, but variable "
1711 var_get_name (vars[0]),
1712 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1713 var_get_name (vars[i]),
1714 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1716 for (i = 0; i < record->n_vars; i++)
1718 struct variable *var = vars[i];
1722 width = var_get_width (var);
1724 sys_error (r, record->pos,
1725 _("Value labels may not be added to long string "
1726 "variables (e.g. %s) using records types 3 and 4."),
1727 var_get_name (var));
1729 for (j = 0; j < record->n_labels; j++)
1731 struct sfm_value_label *label = &record->labels[j];
1734 value_init (&value, width);
1736 value.f = parse_float (r, label->value, 0);
1738 memcpy (value_str_rw (&value, width), label->value, width);
// A label that cannot be added is reported as a duplicate.
1740 if (!var_add_value_label (var, &value, utf8_labels[j]))
1742 if (var_is_numeric (var))
1743 sys_warn (r, record->pos,
1744 _("Duplicate value label for %g on %s."),
1745 value.f, var_get_name (var));
1747 sys_warn (r, record->pos,
1748 _("Duplicate value label for `%.*s' on %s."),
1749 width, value_str (&value, width),
1750 var_get_name (var));
1753 value_destroy (&value, width);
// Return the temporary arrays and recoded labels to the pool.
1757 pool_free (r->pool, vars);
1758 for (i = 0; i < record->n_labels; i++)
1759 pool_free (r->pool, utf8_labels[i]);
1760 pool_free (r->pool, utf8_labels);
// Looks up the variable for index IDX, which counts from 1, in the
// N_VAR_RECS records in VAR_RECS.  Reports an error at OFFSET if IDX is out
// of range or if the selected record has no variable, which the message
// describes as a long string continuation.
1763 static struct variable *
1764 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1765 const struct sfm_var_record *var_recs, size_t n_var_recs,
1768 const struct sfm_var_record *rec;
1770 if (idx < 1 || idx > n_var_recs)
1772 sys_error (r, offset,
1773 _("Variable index %d not in valid range 1...%zu."),
1778 rec = &var_recs[idx - 1];
1779 if (rec->var == NULL)
1781 sys_error (r, offset,
1782 _("Variable index %d refers to long string continuation."),
1790 /* Parses a set of custom attributes from TEXT into ATTRS.
1791 ATTRS may be a null pointer, in which case the attributes are
1792 read but discarded. */
1794 parse_attributes (struct sfm_reader *r, struct text_record *text,
1795 struct attrset *attrs)
1799 struct attribute *attr;
1803 /* Parse the key. */
1804 key = text_get_token (text, ss_cstr ("("), NULL);
1808 attr = attribute_create (key);
/* Values are numbered from 1 in the warnings below. */
1809 for (index = 1; ; index++)
1811 /* Parse the value. */
1815 value = text_get_token (text, ss_cstr ("\n"), NULL);
1818 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
/* A value wrapped in apostrophes is stored without them; any other value
   is stored whole after a warning. */
1823 length = strlen (value);
1824 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1826 value[length - 1] = '\0';
1827 attribute_add_value (attr, value + 1);
1832 _("Attribute value %s[%d] is not quoted: %s."),
1834 attribute_add_value (attr, value);
1837 /* Was this the last value for this attribute? */
1838 if (text_match (text, ')'))
1842 attrset_add (attrs, attr);
1844 attribute_destroy (attr);
/* A slash ends the whole set of attributes. */
1846 while (!text_match (text, '/'));
1849 /* Reads record type 7, subtype 17, which lists custom
1850 attributes on the data file. */
/* The record text is recoded into UTF-8 and its attributes are parsed into
   the attribute set that dict_get_attributes returns for DICT. */
1852 parse_data_file_attributes (struct sfm_reader *r,
1853 const struct sfm_extension_record *record,
1854 struct dictionary *dict)
1856 struct text_record *text = open_text_record (r, record, true);
1857 parse_attributes (r, text, dict_get_attributes (dict));
1858 close_text_record (r, text);
1861 /* Parses record type 7, subtype 18, which lists custom
1862 attributes on individual variables. */
1864 parse_variable_attributes (struct sfm_reader *r,
1865 const struct sfm_extension_record *record,
1866 struct dictionary *dict)
1868 struct text_record *text;
1869 struct variable *var;
1871 text = open_text_record (r, record, true);
/* Each entry is a variable name, a colon, and that variable's attributes.
   When the lookup leaves VAR null, the attributes are still parsed, so that
   TEXT advances past them, but they are discarded. */
1872 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1873 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1874 close_text_record (r, text);
/* Reports an error unless LENGTH bytes starting at byte offset OFS fit
   within RECORD's data, which is record->size * record->count bytes long.
   LENGTH is also compared against the record size on its own, so a LENGTH
   equal to the whole record is rejected; presumably this check exists so
   that a huge LENGTH cannot make OFS + LENGTH wrap around (to be
   confirmed). */
1878 check_overflow (struct sfm_reader *r,
1879 const struct sfm_extension_record *record,
1880 size_t ofs, size_t length)
1882 size_t end = record->size * record->count;
1883 if (length >= end || ofs + length > end)
1884 sys_error (r, record->pos + end,
1885 _("Long string value label record ends unexpectedly."));
/* Parses a long string value label record, adding its labels to variables
   in DICT.  RECORD is a sequence of entries.  Each entry gives a variable
   name length, the name, a width, and a label count, followed by that many
   pairs of length-prefixed value and length-prefixed label.  Entries for
   unknown or numeric variables, or whose width does not match the
   variable's, draw a warning that the entry is being ignored.  Every read
   is preceded by a check_overflow call so that it stays inside the
   record. */
1889 parse_long_string_value_labels (struct sfm_reader *r,
1890 const struct sfm_extension_record *record,
1891 struct dictionary *dict)
1893 const char *dict_encoding = dict_get_encoding (dict);
1894 size_t end = record->size * record->count;
1901 struct variable *var;
1906 /* Parse variable name length. */
1907 check_overflow (r, record, ofs, 4);
1908 var_name_len = parse_int (r, record->data, ofs);
1911 /* Parse variable name, width, and number of labels. */
1912 check_overflow (r, record, ofs, var_name_len + 8);
1913 var_name = recode_string_pool ("UTF-8", dict_encoding,
1914 (const char *) record->data + ofs,
1915 var_name_len, r->pool);
1916 width = parse_int (r, record->data, ofs + var_name_len);
1917 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1918 ofs += var_name_len + 8;
1920 /* Look up 'var' and validate. */
1921 var = dict_lookup_var (dict, var_name);
1923 sys_warn (r, record->pos + ofs,
1924 _("Ignoring long string value record for "
1925 "unknown variable %s."), var_name);
1926 else if (var_is_numeric (var))
1928 sys_warn (r, record->pos + ofs,
1929 _("Ignoring long string value record for "
1930 "numeric variable %s."), var_name);
1933 else if (width != var_get_width (var))
1935 sys_warn (r, record->pos + ofs,
1936 _("Ignoring long string value record for variable %s "
1937 "because the record's width (%d) does not match the "
1938 "variable's width (%d)."),
1939 var_name, width, var_get_width (var));
1944 value_init_pool (r->pool, &value, width);
1945 for (i = 0; i < n_labels; i++)
1947 size_t value_length, label_length;
/* With no variable to attach labels to, the labels are skipped. */
1948 bool skip = var == NULL;
1950 /* Parse value length. */
1951 check_overflow (r, record, ofs, 4);
1952 value_length = parse_int (r, record->data, ofs);
1956 check_overflow (r, record, ofs, value_length);
/* A value is copied only when its length equals the declared width. */
1959 if (value_length == width)
1960 memcpy (value_str_rw (&value, width),
1961 (const uint8_t *) record->data + ofs, width);
1964 sys_warn (r, record->pos + ofs,
1965 _("Ignoring long string value %zu for variable "
1966 "%s, with width %d, that has bad value "
1968 i, var_get_name (var), width, value_length);
1972 ofs += value_length;
1974 /* Parse label length. */
1975 check_overflow (r, record, ofs, 4);
1976 label_length = parse_int (r, record->data, ofs);
1980 check_overflow (r, record, ofs, label_length);
1985 label = recode_string_pool ("UTF-8", dict_encoding,
1986 (const char *) record->data + ofs,
1987 label_length, r->pool);
1988 if (!var_add_value_label (var, &value, label))
1989 sys_warn (r, record->pos + ofs,
1990 _("Duplicate value label for `%.*s' on %s."),
1991 width, value_str (&value, width),
1992 var_get_name (var));
1993 pool_free (r->pool, label);
1995 ofs += label_length;
/* Forward declarations for the case-reading helpers that are defined
   further down in this file. */
2002 static void partial_record (struct sfm_reader *r)
2005 static void read_error (struct casereader *, const struct sfm_reader *);
2007 static bool read_case_number (struct sfm_reader *, double *);
2008 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2009 static int read_opcode (struct sfm_reader *);
2010 static bool read_compressed_number (struct sfm_reader *, double *);
2011 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2012 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2013 static bool skip_whole_strings (struct sfm_reader *, size_t);
2015 /* Reads and returns one case from READER's file. Returns a null
2016 pointer if not successful. */
2017 static struct ccase *
2018 sys_file_casereader_read (struct casereader *reader, void *r_)
2020 struct sfm_reader *r = r_;
2021 struct ccase *volatile c;
2027 c = case_create (r->proto);
/* sys_error reports failure by calling longjmp on r->bail_out, so a nonzero
   result from setjmp means that reading this case failed. */
2028 if (setjmp (r->bail_out))
2030 casereader_force_error (reader);
2035 for (i = 0; i < r->sfm_var_cnt; i++)
2037 struct sfm_var *sv = &r->sfm_vars[i];
2038 union value *v = case_data_rw_idx (c, sv->case_index);
/* A width of 0 marks a numeric value; anything else is a string segment. */
2040 if (sv->var_width == 0)
2042 if (!read_case_number (r, &v->f))
2047 uint8_t *s = value_str_rw (v, sv->var_width);
2048 if (!read_case_string (r, s + sv->offset, sv->segment_width))
/* Padding after the segment is skipped in whole 8-byte units. */
2050 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
/* NOTE(review): a case count of -1 presumably means that the number of
   cases is unknown; confirm against where r->case_cnt is set. */
2059 if (r->case_cnt != -1)
2060 read_error (reader, r);
2065 /* Issues an error that R ends in a partial record. */
/* The error is reported at R's current position, r->pos.  sys_error ends
   with a longjmp, so control does not come back here. */
2067 partial_record (struct sfm_reader *r)
2069 sys_error (r, r->pos, _("File ends in partial case."));
2072 /* Issues an error that an unspecified error occurred SFM, and
// Reports, as an error message naming the file behind SFM, that a case
// could not be read, then forces casereader R into an error state.
2075 read_error (struct casereader *r, const struct sfm_reader *sfm)
2077 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2078 casereader_force_error (r);
2081 /* Reads a number from R and stores its value in *D.
2082 If R is compressed, reads a compressed number;
2083 otherwise, reads a number in the regular way.
2084 Returns true if successful, false if end of file is
2085 reached immediately. */
2087 read_case_number (struct sfm_reader *r, double *d)
/* The regular way: raw bytes are read and converted from the file's
   floating-point format to the native double format. */
2092 if (!try_read_bytes (r, number, sizeof number))
2094 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2098 return read_compressed_number (r, d);
2101 /* Reads LENGTH string bytes from R into S.
2102 Always reads a multiple of 8 bytes; if LENGTH is not a
2103 multiple of 8, then extra bytes are read and discarded without
2105 Reads compressed strings if R is compressed.
2106 Returns true if successful, false if end of file is
2107 reached immediately. */
2109 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2111 size_t whole = ROUND_DOWN (length, 8);
2112 size_t partial = length % 8;
/* The leading multiple of 8 bytes goes directly into S. */
2116 if (!read_whole_strings (r, s, whole))
/* The trailing partial unit is read in full into a bounce buffer, and only
   its first PARTIAL bytes are copied into S. */
2123 if (!read_whole_strings (r, bounce, sizeof bounce))
2129 memcpy (s + whole, bounce, partial);
2135 /* Reads and returns the next compression opcode from R. */
2137 read_opcode (struct sfm_reader *r)
2139 assert (r->compressed);
/* Opcodes are fetched from the file a whole r->opcodes buffer at a time;
   the buffer is refilled once every opcode in it has been consumed. */
2143 if (r->opcode_idx >= sizeof r->opcodes)
2145 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2149 opcode = r->opcodes[r->opcode_idx++];
2156 /* Reads a compressed number from R and stores its value in D.
2157 Returns true if successful, false if end of file is
2158 reached immediately. */
2160 read_compressed_number (struct sfm_reader *r, double *d)
2162 int opcode = read_opcode (r);
/* In this branch the number follows in the file as an uncompressed
   floating-point value. */
2170 *d = read_float (r);
/* Compressed spaces are not expected in a numeric field.  The spaces are
   converted as if they were a number, and a corruption warning is issued
   the first time only. */
2174 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2175 if (!r->corruption_warning)
2177 r->corruption_warning = true;
2178 sys_warn (r, r->pos,
2179 _("Possible compressed data corruption: "
2180 "compressed spaces appear in numeric field."));
/* In this branch the opcode itself encodes the value, offset by the
   bias. */
2189 *d = opcode - r->bias;
2196 /* Reads a compressed 8-byte string segment from R and stores it
2198 Returns true if successful, false if end of file is
2199 reached immediately. */
2201 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2203 int opcode = read_opcode (r);
/* In this branch the 8 bytes follow in the file uncompressed. */
2211 read_bytes (r, dst, 8);
/* In this branch the segment is 8 spaces. */
2215 memset (dst, ' ', 8);
/* In this branch the opcode is a compressed integer, which is expanded into
   the file's floating-point format in DST. */
2220 double value = opcode - r->bias;
2221 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2224 /* This has actually been seen "in the wild". The submitter of the
2225 file that showed that the contents decoded as spaces, but they
2226 were at the end of the field so it's possible that the null
2227 bytes just acted as null terminators. */
2229 else if (!r->corruption_warning)
2231 r->corruption_warning = true;
2232 sys_warn (r, r->pos,
2233 _("Possible compressed data corruption: "
2234 "string contains compressed integer (opcode %d)."),
2244 /* Reads LENGTH string bytes from R into S.
2245 LENGTH must be a multiple of 8.
2246 Reads compressed strings if R is compressed.
2247 Returns true if successful, false if end of file is
2248 reached immediately. */
2250 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2252 assert (length % 8 == 0);
/* Uncompressed data is read in a single call. */
2254 return try_read_bytes (r, s, length);
/* Compressed data is decoded one 8-byte segment at a time. */
2258 for (ofs = 0; ofs < length; ofs += 8)
2259 if (!read_compressed_string (r, s + ofs))
2269 /* Skips LENGTH string bytes from R.
2270 LENGTH must be a multiple of 8.
2271 (LENGTH must also be less than 1024, but that's only because the
2272 current caller never needs more than that many bytes.)
2273 Returns true if successful, false if end of file is
2274 reached immediately. */
2276 skip_whole_strings (struct sfm_reader *r, size_t length)
/* The skipped bytes are read into a scratch buffer and then ignored. */
2278 uint8_t buffer[1024];
2279 assert (length < sizeof buffer);
2280 return read_whole_strings (r, buffer, length);
2283 /* Helpers for reading records that contain structured text
2286 /* Maximum number of warnings to issue for a single text
2288 #define MAX_TEXT_WARNINGS 5
2293 struct substring buffer; /* Record contents. */
2294 off_t start; /* Offset of the record within the file. */
2295 size_t pos; /* Current parse position within buffer. */
2296 int n_warnings; /* Number of warnings issued or suppressed by text_warn. */
2297 bool recoded; /* Was buffer recoded into UTF-8? */
/* Creates a text_record, allocated from R's pool, for parsing the data in
   RECORD as text.  If RECODE_TO_UTF8 is true, the data is first recoded
   from R's encoding into UTF-8.  The new record starts with no warnings
   counted and remembers RECORD's file offset for use in messages. */
2300 static struct text_record *
2301 open_text_record (struct sfm_reader *r,
2302 const struct sfm_extension_record *record,
2303 bool recode_to_utf8)
2305 struct text_record *text;
2306 struct substring raw;
2308 text = pool_alloc (r->pool, sizeof *text);
2309 raw = ss_buffer (record->data, record->size * record->count);
2310 text->start = record->pos;
2311 text->buffer = (recode_to_utf8
2312 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2315 text->n_warnings = 0;
2316 text->recoded = recode_to_utf8;
2321 /* Closes TEXT, frees its storage, and issues a final warning
2322 about suppressed warnings if necessary. */
2324 close_text_record (struct sfm_reader *r, struct text_record *text)
/* text_warn counts every warning but displays only the first
   MAX_TEXT_WARNINGS, so any excess is the number that were suppressed. */
2326 if (text->n_warnings > MAX_TEXT_WARNINGS)
2327 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2328 text->n_warnings - MAX_TEXT_WARNINGS);
2330 pool_free (r->pool, ss_data (text->buffer));
2333 /* Reads a variable=value pair from TEXT.
2334 Looks up the variable in DICT and stores it into *VAR.
2335 Stores a null-terminated value into *VALUE. */
2337 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2338 struct text_record *text,
2339 struct variable **var, char **value)
/* The variable is named by its short name, which ends at the equals
   sign. */
2343 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
/* The value runs up to the next tab or null byte. */
2346 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
/* Advance past any further tab or null bytes ahead of the next pair
   (assuming ss_span measures the leading run of those bytes). */
2350 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2351 ss_buffer ("\t\0", 2));
/* Reads a token from TEXT that ends at one of DELIMITERS and looks it up as
   a variable name in DICT, storing the result of the lookup into *VAR.
   Issues a text_warn warning about a reference to an unknown variable. */
2359 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2360 struct text_record *text, struct substring delimiters,
2361 struct variable **var)
2365 name = text_get_token (text, delimiters, NULL);
2369 *var = dict_lookup_var (dict, name);
2373 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads a token from TEXT that ends at one of DELIMITERS and looks it up in
   DICT, storing the result of the lookup into *VAR.  Nothing is looked up
   when no token remains.  Issues a text_warn warning about a reference to
   an unknown variable. */
2380 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2381 struct text_record *text, struct substring delimiters,
2382 struct variable **var)
2384 char *short_name = text_get_token (text, delimiters, NULL);
2385 if (short_name == NULL)
2388 *var = dict_lookup_var (dict, short_name);
2390 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2395 /* Displays a warning for the current file position, limiting the
2396 number to MAX_TEXT_WARNINGS for TEXT. */
2398 text_warn (struct sfm_reader *r, struct text_record *text,
2399 const char *format, ...)
/* The counter is incremented even when the warning is not displayed, so
   that close_text_record can report how many were suppressed. */
2401 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2405 va_start (args, format);
/* The position reported is the record's file offset plus the current
   position within the record text. */
2406 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts the next token from TEXT, as delimited by any of the bytes in
   DELIMITERS, advancing TEXT's position, and returns a pointer to the
   token's data.  Callers in this file test the result against a null
   pointer, and those that pass a nonnull DELIMITER read a delimiter
   character back from it.
   NOTE(review): the failure path and the DELIMITER store are not visible
   here; confirm both. */
2412 text_get_token (struct text_record *text, struct substring delimiters,
2415 struct substring token;
2418 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
/* END points just past the last byte of the token. */
2421 end = &ss_data (token)[ss_length (token)];
2422 if (delimiter != NULL)
2425 return ss_data (token);
2428 /* Reads an integer value expressed in decimal, then a space, then a string that
2429 consists of exactly as many bytes as specified by the integer, then a space,
2430 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2431 buffer (so the caller should not free the string). */
2433 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
/* Accumulate the decimal byte count, stopping at the first non-digit. */
2441 while (text->pos < text->buffer.length)
2443 int c = text->buffer.string[text->pos];
2444 if (c < '0' || c > '9')
2446 n = (n * 10) + (c - '0');
/* Running off the end of the record, or finding no digits at all, is
   reported as a missing digit. */
2449 if (text->pos >= text->buffer.length || start == text->pos)
2451 sys_warn (r, text->start,
2452 _("Expecting digit at offset %zu in MRSETS record."),
2457 if (!text_match (text, ' '))
2459 sys_warn (r, text->start,
2460 _("Expecting space at offset %zu in MRSETS record."),
/* The counted string must lie entirely within the record. */
2465 if (text->pos + n > text->buffer.length)
2467 sys_warn (r, text->start,
2468 _("%zu-byte string starting at offset %zu "
2469 "exceeds record length %zu."),
2470 n, text->pos, text->buffer.length);
2474 s = &text->buffer.string[text->pos];
2477 sys_warn (r, text->start,
2478 _("Expecting space at offset %zu following %zu-byte string."),
/* Tests whether the byte at TEXT's current position is C.
   NOTE(review): callers use this to step past expected delimiters, so it
   presumably also advances TEXT on a match; that part is not visible here,
   so confirm it. */
2488 text_match (struct text_record *text, char c)
2490 if (text->buffer.string[text->pos] == c)
2499 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2500 inside the TEXT's string. */
/* The MRSETS parser in this file passes the result to its warnings as the
   offset to report. */
2502 text_pos (const struct text_record *text)
2509 /* Displays a corruption message. */
/* CLASS supplies the category and severity of the message.  FORMAT and
   ARGS are a printf-style format and its arguments.  The text is prefixed
   with the name of the file behind R and, in one of the two prefix forms,
   with OFFSET in hexadecimal. */
2511 sys_msg (struct sfm_reader *r, off_t offset,
2512 int class, const char *format, va_list args)
2517 ds_init_empty (&text);
/* NOTE(review): callers in this file pass -1 when they have no offset to
   report, so the offset form is presumably chosen for nonnegative OFFSET;
   the test itself is not visible here. */
2519 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2520 fh_get_file_name (r->fh), (long long int) offset);
2522 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2523 ds_put_vformat (&text, format, args);
2525 m.category = msg_class_to_category (class);
2526 m.severity = msg_class_to_severity (class);
2532 m.text = ds_cstr (&text);
2537 /* Displays a warning for offset OFFSET in the file. */
/* FORMAT and the arguments after it are printf-style.  They are passed on
   to sys_msg with message class MW. */
2539 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2543 va_start (args, format);
2544 sys_msg (r, offset, MW, format, args);
2548 /* Displays an error for offset OFFSET in the file,
2549 marks it as in an error state,
2550 and aborts reading it using longjmp. */
2552 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2556 va_start (args, format);
2557 sys_msg (r, offset, ME, format, args);
/* Control resumes at the setjmp call that filled in r->bail_out, so this
   function does not return to its caller. */
2561 longjmp (r->bail_out, 1);
2564 /* Reads BYTE_CNT bytes into BUF.
2565 Returns true if exactly BYTE_CNT bytes are successfully read.
2566 Aborts if an I/O error or a partial read occurs.
2567 If EOF_IS_OK, then an immediate end-of-file causes false to be
2568 returned; otherwise, immediate end-of-file causes an abort
     as well.

     "Aborts" means a call to sys_error.  R->pos is advanced by the number of
     bytes actually obtained, even when the read falls short, so the offset
     passed to sys_error is the position just past the last byte read. */
2571 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2572 void *buf, size_t byte_cnt)
2574 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2575 r->pos += bytes_read;
2576 if (bytes_read == byte_cnt)
     /* Short read: first distinguish a stream error from running out of
        data. */
2578 else if (ferror (r->file))
2579 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
     /* No stream error, so the file ended.  That is tolerated only when the
        caller allows it and not even one byte was read; a record cut off
        part-way is always an error. */
2580 else if (!eof_is_ok || bytes_read != 0)
2581 sys_error (r, r->pos, _("Unexpected end of file."));
2586 /* Reads BYTE_CNT bytes from R into BUF.
2587 Aborts upon I/O error or if end-of-file is encountered, including
     end-of-file before any byte is read (EOF_IS_OK is passed as false). */
2589 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2591 read_bytes_internal (r, false, buf, byte_cnt);
2594 /* Reads BYTE_CNT bytes into BUF.
2595 Returns true if exactly BYTE_CNT bytes are successfully read.
2596 Returns false if an immediate end-of-file is encountered,
     that is, if the file ends before even one byte could be read.
2597 Aborts if an I/O error or a partial read occurs.

     This is read_bytes_internal with EOF_IS_OK set to true. */
2599 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2601 return read_bytes_internal (r, true, buf, byte_cnt);
2604 /* Reads a 32-bit signed integer from R and returns its value in
     host format.  The bytes in the file are interpreted according to
     R->integer_format.  Aborts, as read_bytes does, on I/O error or
     end-of-file. */
2607 read_int (struct sfm_reader *r)
2610 read_bytes (r, integer, sizeof integer);
2611 return integer_get (r->integer_format, integer, sizeof integer);
2614 /* Reads a 64-bit floating-point number from R and returns its
2615 value in host format.  The bytes in the file are interpreted according
     to R->float_format.  Aborts, as read_bytes does, on I/O error or
     end-of-file. */
2617 read_float (struct sfm_reader *r)
2620 read_bytes (r, number, sizeof number);
2621 return float_get_double (r->float_format, number);
/* Returns the integer stored in the 4 bytes that start OFS bytes into DATA,
   interpreted according to R->integer_format.  Unlike read_int, this reads
   nothing from the file.  No bounds check is made here, so the caller must
   ensure that DATA holds at least OFS + 4 bytes. */
2625 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2627 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
/* Returns the floating-point number that starts OFS bytes into DATA,
   interpreted according to R->float_format, as read_float does for data
   taken from the file.  No length is passed to float_get_double, so the
   caller must ensure that DATA extends far enough past OFS (presumably
   8 bytes, as in read_float -- confirm). */
2631 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2633 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2636 /* Reads exactly SIZE - 1 bytes into BUFFER
2637 and stores a null byte into BUFFER[SIZE - 1].

     BUFFER must therefore have room for SIZE bytes, and SIZE must be at
     least 1: it is unsigned, so SIZE - 1 would wrap around for SIZE == 0.
     Aborts, as read_bytes does, on I/O error or end-of-file. */
2639 read_string (struct sfm_reader *r, char *buffer, size_t size)
2642 read_bytes (r, buffer, size - 1);
2643 buffer[size - 1] = '\0';
2646 /* Skips BYTES bytes forward in R.

     The bytes are consumed by reading them into a scratch buffer with
     read_bytes, at most sizeof buffer bytes per call.  Consequently this
     aborts, as read_bytes does, on I/O error or end-of-file. */
2648 skip_bytes (struct sfm_reader *r, size_t bytes)
2653 size_t chunk = MIN (sizeof buffer, bytes);
2654 read_bytes (r, buffer, chunk);
2659 static const struct casereader_class sys_file_casereader_class =
2661 sys_file_casereader_read,
2662 sys_file_casereader_destroy,