1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-strtod.h"
54 #include "gl/c-ctype.h"
55 #include "gl/inttostr.h"
56 #include "gl/localcharset.h"
57 #include "gl/minmax.h"
58 #include "gl/unlocked-io.h"
59 #include "gl/xalloc.h"
63 #define _(msgid) gettext (msgid)
64 #define N_(msgid) (msgid)
68 /* subtypes 0-2 unknown */
69 EXT_INTEGER = 3, /* Machine integer info. */
70 EXT_FLOAT = 4, /* Machine floating-point info. */
71 EXT_VAR_SETS = 5, /* Variable sets. */
72 EXT_DATE = 6, /* DATE. */
73 EXT_MRSETS = 7, /* Multiple response sets. */
74 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
75 /* subtypes 9-10 unknown */
76 EXT_DISPLAY = 11, /* Variable display parameters. */
77 /* subtype 12 unknown */
78 EXT_LONG_NAMES = 13, /* Long variable names. */
79 EXT_LONG_STRINGS = 14, /* Long strings. */
80 /* subtype 15 unknown */
81 EXT_NCASES = 16, /* Extended number of cases. */
82 EXT_FILE_ATTRS = 17, /* Data file attributes. */
83 EXT_VAR_ATTRS = 18, /* Variable attributes. */
84 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
85 EXT_ENCODING = 20, /* Character encoding. */
86 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
87 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
90 /* Fields from the top-level header record. */
91 struct sfm_header_record
93 char magic[5]; /* First 4 bytes of file, then null. */
94 int weight_idx; /* 0 if unweighted, otherwise a var index. */
95 int nominal_case_size; /* Number of var positions. */
97 /* These correspond to the members of struct sfm_file_info or a dictionary
98 but in the system file's encoding rather than ASCII. */
99 char creation_date[10]; /* "dd mmm yy". */
100 char creation_time[9]; /* "hh:mm:ss". */
101 char eye_catcher[61]; /* Eye-catcher string, then product name. */
102 char file_label[65]; /* File label. */
105 struct sfm_var_record
112 int missing_value_code;
115 struct variable *var;
118 struct sfm_value_label
124 struct sfm_value_label_record
127 struct sfm_value_label *labels;
134 struct sfm_document_record
141 struct sfm_extension_record
143 off_t pos; /* Starting offset in file. */
144 size_t size; /* Size of data elements. */
145 size_t count; /* Number of data elements. */
146 void *data; /* Contents. */
149 /* System file reader. */
152 /* Resource tracking. */
153 struct pool *pool; /* All system file state. */
154 jmp_buf bail_out; /* longjmp() target for error handling. */
157 struct file_handle *fh; /* File handle. */
158 struct fh_lock *lock; /* Mutual exclusion for file handle. */
159 FILE *file; /* File stream. */
160 off_t pos; /* Position in file. */
161 bool error; /* I/O or corruption error? */
162 struct caseproto *proto; /* Format of output cases. */
165 enum integer_format integer_format; /* On-disk integer format. */
166 enum float_format float_format; /* On-disk floating point format. */
167 struct sfm_var *sfm_vars; /* Variables. */
168 size_t sfm_var_cnt; /* Number of variables. */
169 casenumber case_cnt; /* Number of cases */
170 const char *encoding; /* String encoding. */
173 bool compressed; /* File is compressed? */
174 double bias; /* Compression bias, usually 100.0. */
175 uint8_t opcodes[8]; /* Current block of opcodes. */
176 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
177 bool corruption_warning; /* Warned about possible corruption? */
180 static const struct casereader_class sys_file_casereader_class;
182 static bool close_reader (struct sfm_reader *);
184 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
185 const struct sfm_var_record *,
188 static void sys_msg (struct sfm_reader *r, off_t, int class,
189 const char *format, va_list args)
190 PRINTF_FORMAT (4, 0);
191 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
192 PRINTF_FORMAT (3, 4);
193 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
197 static void read_bytes (struct sfm_reader *, void *, size_t);
198 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
199 static int read_int (struct sfm_reader *);
200 static double read_float (struct sfm_reader *);
201 static void read_string (struct sfm_reader *, char *, size_t);
202 static void skip_bytes (struct sfm_reader *, size_t);
204 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
205 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
207 static void read_variable_record (struct sfm_reader *,
208 struct sfm_var_record *);
209 static void read_value_label_record (struct sfm_reader *,
210 struct sfm_value_label_record *,
212 static struct sfm_document_record *read_document_record (struct sfm_reader *);
213 static struct sfm_extension_record *read_extension_record (
214 struct sfm_reader *, int subtype);
215 static void skip_extension_record (struct sfm_reader *, int subtype);
217 static const char *choose_encoding (
219 const struct sfm_header_record *,
220 const struct sfm_extension_record *ext_integer,
221 const struct sfm_extension_record *ext_encoding);
223 static struct text_record *open_text_record (
224 struct sfm_reader *, const struct sfm_extension_record *,
225 bool recode_to_utf8);
226 static void close_text_record (struct sfm_reader *,
227 struct text_record *);
228 static bool read_variable_to_value_pair (struct sfm_reader *,
230 struct text_record *,
231 struct variable **var, char **value);
232 static void text_warn (struct sfm_reader *r, struct text_record *text,
233 const char *format, ...)
234 PRINTF_FORMAT (3, 4);
235 static char *text_get_token (struct text_record *,
236 struct substring delimiters, char *delimiter);
237 static bool text_match (struct text_record *, char c);
238 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
239 struct text_record *,
240 struct substring delimiters,
242 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
243 struct text_record *,
244 struct substring delimiters,
246 static const char *text_parse_counted_string (struct sfm_reader *,
247 struct text_record *);
248 static size_t text_pos (const struct text_record *);
250 static bool close_reader (struct sfm_reader *r);
252 /* Dictionary reader. */
260 static void read_header (struct sfm_reader *, struct sfm_read_info *,
261 struct sfm_header_record *);
262 static void parse_header (struct sfm_reader *,
263 const struct sfm_header_record *,
264 struct sfm_read_info *, struct dictionary *);
265 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
266 struct sfm_var_record *, size_t n);
267 static void parse_format_spec (struct sfm_reader *, off_t pos,
268 unsigned int format, enum which_format,
269 struct variable *, int *format_warning_cnt);
270 static void parse_document (struct dictionary *, struct sfm_document_record *);
271 static void parse_display_parameters (struct sfm_reader *,
272 const struct sfm_extension_record *,
273 struct dictionary *);
274 static void parse_machine_integer_info (struct sfm_reader *,
275 const struct sfm_extension_record *,
276 struct sfm_read_info *);
277 static void parse_machine_float_info (struct sfm_reader *,
278 const struct sfm_extension_record *);
279 static void parse_mrsets (struct sfm_reader *,
280 const struct sfm_extension_record *,
281 struct dictionary *);
282 static void parse_long_var_name_map (struct sfm_reader *,
283 const struct sfm_extension_record *,
284 struct dictionary *);
285 static void parse_long_string_map (struct sfm_reader *,
286 const struct sfm_extension_record *,
287 struct dictionary *);
288 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
289 const struct sfm_var_record *,
291 const struct sfm_value_label_record *);
292 static void parse_data_file_attributes (struct sfm_reader *,
293 const struct sfm_extension_record *,
294 struct dictionary *);
295 static void parse_variable_attributes (struct sfm_reader *,
296 const struct sfm_extension_record *,
297 struct dictionary *);
298 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
299 static void parse_long_string_value_labels (struct sfm_reader *,
300 const struct sfm_extension_record *,
301 struct dictionary *);
303 /* Frees the strings inside INFO. */
305 sfm_read_info_destroy (struct sfm_read_info *info)
309 free (info->creation_date);
310 free (info->creation_time);
311 free (info->product);
315 /* Opens the system file designated by file handle FH for reading. Reads the
316 system file's dictionary into *DICT.
318 Ordinarily the reader attempts to automatically detect the character
319 encoding based on the file's contents. This isn't always possible,
320 especially for files written by old versions of SPSS or PSPP, so specifying
321 a nonnull ENCODING overrides the choice of character encoding.
323 If INFO is non-null, then it receives additional info about the system file,
324 which the caller must eventually free with sfm_read_info_destroy() when it
325 is no longer needed. */
327 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
328 struct dictionary **dictp, struct sfm_read_info *infop)
330 struct sfm_reader *volatile r = NULL;
331 struct sfm_read_info *volatile info;
333 struct sfm_header_record header;
335 struct sfm_var_record *vars;
336 size_t n_vars, allocated_vars;
338 struct sfm_value_label_record *labels;
339 size_t n_labels, allocated_labels;
341 struct sfm_document_record *document;
343 struct sfm_extension_record *extensions[32];
345 struct dictionary *volatile dict = NULL;
348 /* Create and initialize reader. */
349 r = pool_create_container (struct sfm_reader, pool);
355 r->opcode_idx = sizeof r->opcodes;
356 r->corruption_warning = false;
358 info = infop ? infop : xmalloc (sizeof *info);
359 memset (info, 0, sizeof *info);
361 /* TRANSLATORS: this fragment will be interpolated into
362 messages in fh_lock() that identify types of files. */
363 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
367 r->file = fn_open (fh_get_file_name (fh), "rb");
370 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
371 fh_get_file_name (r->fh), strerror (errno));
375 if (setjmp (r->bail_out))
379 read_header (r, info, &header);
382 n_vars = allocated_vars = 0;
385 n_labels = allocated_labels = 0;
389 memset (extensions, 0, sizeof extensions);
399 read_int (r); /* Skip filler. */
406 if (n_vars >= allocated_vars)
407 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
409 read_variable_record (r, &vars[n_vars++]);
413 if (n_labels >= allocated_labels)
414 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
416 read_value_label_record (r, &labels[n_labels++], n_vars);
420 /* A Type 4 record is always immediately after a type 3 record,
421 so the code for type 3 records reads the type 4 record too. */
422 sys_error (r, r->pos, _("Misplaced type 4 record."));
425 if (document != NULL)
426 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
427 document = read_document_record (r);
431 subtype = read_int (r);
432 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
435 _("Unrecognized record type 7, subtype %d. Please "
436 "send a copy of this file, and the syntax which "
437 "created it to %s."),
438 subtype, PACKAGE_BUGREPORT);
439 skip_extension_record (r, subtype);
441 else if (extensions[subtype] != NULL)
444 _("Record type 7, subtype %d found here has the same "
445 "type as the record found near offset 0x%llx. "
446 "Please send a copy of this file, and the syntax "
447 "which created it to %s."),
448 subtype, (long long int) extensions[subtype]->pos,
450 skip_extension_record (r, subtype);
453 extensions[subtype] = read_extension_record (r, subtype);
457 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
462 /* Now actually parse what we read.
464 First, figure out the correct character encoding, because this determines
465 how the rest of the header data is to be interpreted. */
466 dict = dict_create (encoding
468 : choose_encoding (r, &header, extensions[EXT_INTEGER],
469 extensions[EXT_ENCODING]));
470 r->encoding = dict_get_encoding (dict);
472 /* These records don't use variables at all. */
473 if (document != NULL)
474 parse_document (dict, document);
476 if (extensions[EXT_INTEGER] != NULL)
477 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
479 if (extensions[EXT_FLOAT] != NULL)
480 parse_machine_float_info (r, extensions[EXT_FLOAT]);
482 if (extensions[EXT_FILE_ATTRS] != NULL)
483 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
485 parse_header (r, &header, info, dict);
487 /* Parse the variable records, the basis of almost everything else. */
488 parse_variable_records (r, dict, vars, n_vars);
490 /* Parse value labels and the weight variable immediately after the variable
491 records. These records use indexes into var_recs[], so we must parse them
492 before those indexes become invalidated by very long string variables. */
493 for (i = 0; i < n_labels; i++)
494 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
495 if (header.weight_idx != 0)
497 struct variable *weight_var;
499 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
501 if (var_is_numeric (weight_var))
502 dict_set_weight (dict, weight_var);
504 sys_error (r, -1, _("Weighting variable must be numeric "
505 "(not string variable `%s')."),
506 var_get_name (weight_var));
509 if (extensions[EXT_DISPLAY] != NULL)
510 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
512 /* The following records use short names, so they need to be parsed before
513 parse_long_var_name_map() changes short names to long names. */
514 if (extensions[EXT_MRSETS] != NULL)
515 parse_mrsets (r, extensions[EXT_MRSETS], dict);
517 if (extensions[EXT_MRSETS2] != NULL)
518 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
520 if (extensions[EXT_LONG_STRINGS] != NULL)
521 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
523 /* Now rename variables to their long names. */
524 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
526 /* The following records use long names, so they need to follow renaming. */
527 if (extensions[EXT_VAR_ATTRS] != NULL)
529 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
531 /* Roles use the $@Role attribute. */
532 assign_variable_roles (r, dict);
535 if (extensions[EXT_LONG_LABELS] != NULL)
536 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
538 /* Warn if the actual amount of data per case differs from the
539 amount that the header claims. SPSS version 13 gets this
540 wrong when very long strings are involved, so don't warn in
542 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
543 && info->version_major != 13)
544 sys_warn (r, -1, _("File header claims %d variable positions but "
545 "%zu were read from file."),
546 header.nominal_case_size, n_vars);
548 /* Create an index of dictionary variable widths for
549 sfm_read_case to use. We cannot use the `struct variable's
550 from the dictionary we created, because the caller owns the
551 dictionary and may destroy or modify its variables. */
552 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
553 pool_register (r->pool, free, r->sfm_vars);
554 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
559 sfm_read_info_destroy (info);
563 return casereader_create_sequential
565 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
566 &sys_file_casereader_class, r);
571 sfm_read_info_destroy (info);
581 /* Closes a system file after we're done with it.
582 Returns true if an I/O error has occurred on READER, false
585 close_reader (struct sfm_reader *r)
594 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
596 msg (ME, _("Error closing system file `%s': %s."),
597 fh_get_file_name (r->fh), strerror (errno));
607 pool_destroy (r->pool);
612 /* Destroys READER. */
614 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
616 struct sfm_reader *r = r_;
620 /* Returns true if FILE is an SPSS system file,
623 sfm_detect (FILE *file)
627 if (fread (magic, 4, 1, file) != 1)
631 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
634 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
635 except for the string fields in *INFO, which parse_header() will initialize
636 later once the file's encoding is known. */
638 read_header (struct sfm_reader *r, struct sfm_read_info *info,
639 struct sfm_header_record *header)
641 uint8_t raw_layout_code[4];
644 read_string (r, header->magic, sizeof header->magic);
645 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
647 if (strcmp (ASCII_MAGIC, header->magic)
648 && strcmp (EBCDIC_MAGIC, header->magic))
649 sys_error (r, 0, _("This is not an SPSS system file."));
651 /* Identify integer format. */
652 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
653 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
655 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
657 || (r->integer_format != INTEGER_MSB_FIRST
658 && r->integer_format != INTEGER_LSB_FIRST))
659 sys_error (r, 64, _("This is not an SPSS system file."));
661 header->nominal_case_size = read_int (r);
662 if (header->nominal_case_size < 0
663 || header->nominal_case_size > INT_MAX / 16)
664 header->nominal_case_size = -1;
666 r->compressed = read_int (r) != 0;
668 header->weight_idx = read_int (r);
670 r->case_cnt = read_int (r);
671 if ( r->case_cnt > INT_MAX / 2)
674 /* Identify floating-point format and obtain compression bias. */
675 read_bytes (r, raw_bias, sizeof raw_bias);
676 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
678 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
680 if (memcmp (raw_bias, zero_bias, 8))
681 sys_warn (r, r->pos - 8,
682 _("Compression bias is not the usual "
683 "value of 100, or system file uses unrecognized "
684 "floating-point format."));
687 /* Some software is known to write all-zeros to this
688 field. Such software also writes floating-point
689 numbers in the format that we expect by default
690 (it seems that all software most likely does, in
691 reality), so don't warn in this case. */
694 if (r->integer_format == INTEGER_MSB_FIRST)
695 r->float_format = FLOAT_IEEE_DOUBLE_BE;
697 r->float_format = FLOAT_IEEE_DOUBLE_LE;
699 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
701 read_string (r, header->creation_date, sizeof header->creation_date);
702 read_string (r, header->creation_time, sizeof header->creation_time);
703 read_string (r, header->file_label, sizeof header->file_label);
706 info->integer_format = r->integer_format;
707 info->float_format = r->float_format;
708 info->compressed = r->compressed;
709 info->case_cnt = r->case_cnt;
712 /* Reads a variable (type 2) record from R into RECORD. */
714 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
716 int has_variable_label;
718 memset (record, 0, sizeof *record);
720 record->pos = r->pos;
721 record->width = read_int (r);
722 has_variable_label = read_int (r);
723 record->missing_value_code = read_int (r);
724 record->print_format = read_int (r);
725 record->write_format = read_int (r);
726 read_bytes (r, record->name, sizeof record->name);
728 if (has_variable_label == 1)
730 enum { MAX_LABEL_LEN = 255 };
731 size_t len, read_len;
735 /* Read up to MAX_LABEL_LEN bytes of label. */
736 read_len = MIN (MAX_LABEL_LEN, len);
737 record->label = pool_malloc (r->pool, read_len + 1);
738 read_string (r, record->label, read_len + 1);
740 /* Skip unread label bytes. */
741 skip_bytes (r, len - read_len);
743 /* Skip label padding up to multiple of 4 bytes. */
744 skip_bytes (r, ROUND_UP (len, 4) - len);
746 else if (has_variable_label != 0)
747 sys_error (r, record->pos,
748 _("Variable label indicator field is not 0 or 1."));
750 /* Set missing values. */
751 if (record->missing_value_code != 0)
753 int code = record->missing_value_code;
754 if (record->width == 0)
756 if (code < -3 || code > 3 || code == -1)
757 sys_error (r, record->pos,
758 _("Numeric missing value indicator field is not "
759 "-3, -2, 0, 1, 2, or 3."));
763 if (code < 1 || code > 3)
764 sys_error (r, record->pos,
765 _("String missing value indicator field is not "
769 read_bytes (r, record->missing, 8 * abs (code));
773 /* Reads value labels from R into RECORD. */
775 read_value_label_record (struct sfm_reader *r,
776 struct sfm_value_label_record *record,
781 /* Read type 3 record. */
782 record->pos = r->pos;
783 record->n_labels = read_int (r);
784 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
785 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
787 record->labels = pool_nmalloc (r->pool, record->n_labels,
788 sizeof *record->labels);
789 for (i = 0; i < record->n_labels; i++)
791 struct sfm_value_label *label = &record->labels[i];
792 unsigned char label_len;
795 read_bytes (r, label->value, sizeof label->value);
797 /* Read label length. */
798 read_bytes (r, &label_len, sizeof label_len);
799 padded_len = ROUND_UP (label_len + 1, 8);
801 /* Read label, padding. */
802 label->label = pool_malloc (r->pool, padded_len + 1);
803 read_bytes (r, label->label, padded_len - 1);
804 label->label[label_len] = '\0';
807 /* Read record type of type 4 record. */
808 if (read_int (r) != 4)
809 sys_error (r, r->pos - 4,
810 _("Variable index record (type 4) does not immediately "
811 "follow value label record (type 3) as it should."));
813 /* Read number of variables associated with value label from type 4
815 record->n_vars = read_int (r);
816 if (record->n_vars < 1 || record->n_vars > n_vars)
817 sys_error (r, r->pos - 4,
818 _("Number of variables associated with a value label (%zu) "
819 "is not between 1 and the number of variables (%zu)."),
820 record->n_vars, n_vars);
821 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
822 for (i = 0; i < record->n_vars; i++)
823 record->vars[i] = read_int (r);
826 /* Reads a document record from R and returns it. */
827 static struct sfm_document_record *
828 read_document_record (struct sfm_reader *r)
830 struct sfm_document_record *record;
833 record = pool_malloc (r->pool, sizeof *record);
834 record->pos = r->pos;
836 n_lines = read_int (r);
837 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
838 sys_error (r, record->pos,
839 _("Number of document lines (%d) "
840 "must be greater than 0 and less than %d."),
841 n_lines, INT_MAX / DOC_LINE_LENGTH);
843 record->n_lines = n_lines;
844 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
845 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
851 read_extension_record_header (struct sfm_reader *r, int subtype,
852 struct sfm_extension_record *record)
854 record->pos = r->pos;
855 record->size = read_int (r);
856 record->count = read_int (r);
858 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
859 allows an extra byte for a null terminator, used by some
860 extension processing routines. */
861 if (record->size != 0
862 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
863 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
866 /* Reads an extension record from R into RECORD. */
867 static struct sfm_extension_record *
868 read_extension_record (struct sfm_reader *r, int subtype)
870 struct extension_record_type
877 static const struct extension_record_type types[] =
879 /* Implemented record types. */
880 { EXT_INTEGER, 4, 8 },
882 { EXT_MRSETS, 1, 0 },
883 { EXT_DISPLAY, 4, 0 },
884 { EXT_LONG_NAMES, 1, 0 },
885 { EXT_LONG_STRINGS, 1, 0 },
886 { EXT_NCASES, 8, 2 },
887 { EXT_FILE_ATTRS, 1, 0 },
888 { EXT_VAR_ATTRS, 1, 0 },
889 { EXT_MRSETS2, 1, 0 },
890 { EXT_ENCODING, 1, 0 },
891 { EXT_LONG_LABELS, 1, 0 },
893 /* Ignored record types. */
894 { EXT_VAR_SETS, 0, 0 },
896 { EXT_DATA_ENTRY, 0, 0 },
897 { EXT_DATAVIEW, 0, 0 },
900 const struct extension_record_type *type;
901 struct sfm_extension_record *record;
904 record = pool_malloc (r->pool, sizeof *record);
905 read_extension_record_header (r, subtype, record);
906 n_bytes = record->count * record->size;
908 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
909 if (subtype == type->subtype)
911 if (type->size > 0 && record->size != type->size)
912 sys_warn (r, record->pos,
913 _("Record type 7, subtype %d has bad size %zu "
914 "(expected %d)."), subtype, record->size, type->size);
915 else if (type->count > 0 && record->count != type->count)
916 sys_warn (r, record->pos,
917 _("Record type 7, subtype %d has bad count %zu "
918 "(expected %d)."), subtype, record->count, type->count);
919 else if (type->count == 0 && type->size == 0)
921 /* Ignore this record. */
925 char *data = pool_malloc (r->pool, n_bytes + 1);
926 data[n_bytes] = '\0';
929 read_bytes (r, record->data, n_bytes);
936 sys_warn (r, record->pos,
937 _("Unrecognized record type 7, subtype %d. Please send a "
938 "copy of this file, and the syntax which created it to %s."),
939 subtype, PACKAGE_BUGREPORT);
942 skip_bytes (r, n_bytes);
947 skip_extension_record (struct sfm_reader *r, int subtype)
949 struct sfm_extension_record record;
951 read_extension_record_header (r, subtype, &record);
952 skip_bytes (r, record.count * record.size);
956 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
957 struct sfm_read_info *info, struct dictionary *dict)
959 const char *dict_encoding = dict_get_encoding (dict);
960 struct substring product;
961 struct substring label;
963 /* Convert file label to UTF-8 and put it into DICT. */
964 label = recode_substring_pool ("UTF-8", dict_encoding,
965 ss_cstr (header->file_label), r->pool);
966 ss_trim (&label, ss_cstr (" "));
967 label.string[label.length] = '\0';
968 dict_set_label (dict, label.string);
970 /* Put creation date and time in UTF-8 into INFO. */
971 info->creation_date = recode_string ("UTF-8", dict_encoding,
972 header->creation_date, -1);
973 info->creation_time = recode_string ("UTF-8", dict_encoding,
974 header->creation_time, -1);
976 /* Put product name into INFO, dropping eye-catcher string if present. */
977 product = recode_substring_pool ("UTF-8", dict_encoding,
978 ss_cstr (header->eye_catcher), r->pool);
979 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
980 ss_trim (&product, ss_cstr (" "));
981 info->product = ss_xstrdup (product);
984 /* Reads a variable (type 2) record from R and adds the
985 corresponding variable to DICT.
986 Also skips past additional variable records for long string
989 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
990 struct sfm_var_record *var_recs, size_t n_var_recs)
992 const char *dict_encoding = dict_get_encoding (dict);
993 struct sfm_var_record *rec;
996 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
998 struct variable *var;
1003 name = recode_string_pool ("UTF-8", dict_encoding,
1004 rec->name, 8, r->pool);
1005 name[strcspn (name, " ")] = '\0';
1007 if (!dict_id_is_valid (dict, name, false)
1008 || name[0] == '$' || name[0] == '#')
1009 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1011 if (rec->width < 0 || rec->width > 255)
1012 sys_error (r, rec->pos,
1013 _("Bad width %d for variable %s."), rec->width, name);
1015 var = rec->var = dict_create_var (dict, name, rec->width);
1017 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1019 /* Set the short name the same as the long name. */
1020 var_set_short_name (var, 0, name);
1022 /* Get variable label, if any. */
1027 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1028 rec->label, -1, r->pool);
1029 var_set_label (var, utf8_label, false);
1032 /* Set missing values. */
1033 if (rec->missing_value_code != 0)
1035 int width = var_get_width (var);
1036 struct missing_values mv;
1038 mv_init_pool (r->pool, &mv, width);
1039 if (var_is_numeric (var))
1041 bool has_range = rec->missing_value_code < 0;
1042 int n_discrete = (has_range
1043 ? rec->missing_value_code == -3
1044 : rec->missing_value_code);
1049 double low = parse_float (r, rec->missing, 0);
1050 double high = parse_float (r, rec->missing, 8);
1052 /* Deal with SPSS 21 change in representation. */
1056 mv_add_range (&mv, low, high);
1060 for (i = 0; i < n_discrete; i++)
1062 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1070 value_init_pool (r->pool, &value, width);
1071 value_set_missing (&value, width);
1072 for (i = 0; i < rec->missing_value_code; i++)
1074 uint8_t *s = value_str_rw (&value, width);
1075 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1076 mv_add_str (&mv, s);
1079 var_set_missing_values (var, &mv);
1083 parse_format_spec (r, rec->pos + 12, rec->print_format,
1084 PRINT_FORMAT, var, &n_warnings);
1085 parse_format_spec (r, rec->pos + 16, rec->write_format,
1086 WRITE_FORMAT, var, &n_warnings);
1088 /* Account for values.
1089 Skip long string continuation records, if any. */
1090 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1091 for (i = 1; i < n_values; i++)
1092 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1093 sys_error (r, rec->pos, _("Missing string continuation record."));
1098 /* Translates the format spec from sysfile format to internal
1101 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1102 enum which_format which, struct variable *v,
1105 const int max_warnings = 8;
1106 uint8_t raw_type = format >> 16;
1107 uint8_t w = format >> 8;
1116 ok = (fmt_from_io (raw_type, &f.type)
1117 && fmt_check_output (&f)
1118 && fmt_check_width_compat (&f, var_get_width (v)));
1123 if (which == PRINT_FORMAT)
1124 var_set_print_format (v, &f);
1126 var_set_write_format (v, &f);
1128 else if (format == 0)
1130 /* Actually observed in the wild. No point in warning about it. */
1132 else if (++*n_warnings <= max_warnings)
1134 if (which == PRINT_FORMAT)
1135 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1137 var_get_name (v), var_get_width (v), format);
1139 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1141 var_get_name (v), var_get_width (v), format);
1143 if (*n_warnings == max_warnings)
1144 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1149 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1153 for (p = record->documents;
1154 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1155 p += DOC_LINE_LENGTH)
1157 struct substring line;
1159 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1160 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1161 ss_rtrim (&line, ss_cstr (" "));
1162 line.string[line.length] = '\0';
1164 dict_add_document_line (dict, line.string, false);
1170 /* Parses record type 7, subtype 3. */
1172 parse_machine_integer_info (struct sfm_reader *r,
1173 const struct sfm_extension_record *record,
1174 struct sfm_read_info *info)
1176 int float_representation, expected_float_format;
1177 int integer_representation, expected_integer_format;
1179 /* Save version info. */
1180 info->version_major = parse_int (r, record->data, 0);
1181 info->version_minor = parse_int (r, record->data, 4);
1182 info->version_revision = parse_int (r, record->data, 8);
1184 /* Check floating point format. */
1185 float_representation = parse_int (r, record->data, 16);
1186 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1187 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1188 expected_float_format = 1;
1189 else if (r->float_format == FLOAT_Z_LONG)
1190 expected_float_format = 2;
1191 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1192 expected_float_format = 3;
1195 if (float_representation != expected_float_format)
1196 sys_error (r, record->pos, _("Floating-point representation indicated by "
1197 "system file (%d) differs from expected (%d)."),
1198 float_representation, expected_float_format);
1200 /* Check integer format. */
1201 integer_representation = parse_int (r, record->data, 24);
1202 if (r->integer_format == INTEGER_MSB_FIRST)
1203 expected_integer_format = 1;
1204 else if (r->integer_format == INTEGER_LSB_FIRST)
1205 expected_integer_format = 2;
1208 if (integer_representation != expected_integer_format)
1209 sys_warn (r, record->pos,
1210 _("Integer format indicated by system file (%d) "
1211 "differs from expected (%d)."),
1212 integer_representation, expected_integer_format);
1217 choose_encoding (struct sfm_reader *r,
1218 const struct sfm_header_record *header,
1219 const struct sfm_extension_record *ext_integer,
1220 const struct sfm_extension_record *ext_encoding)
1222 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1225 return ext_encoding->data;
1227 /* But EXT_INTEGER is better than nothing as a fallback. */
1230 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1231 const char *encoding;
1240 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1241 respectively. However, there are known to be many files in the wild
1242 with character code 2, yet have data which are clearly not ASCII.
1243 Therefore we ignore these values. */
1250 encoding = sys_get_encoding_from_codepage (codepage);
1251 if (encoding != NULL)
1257 /* If the file magic number is EBCDIC then its character data is too. */
1258 if (!strcmp (header->magic, EBCDIC_MAGIC))
1261 return locale_charset ();
1264 /* Parses record type 7, subtype 4. */
1266 parse_machine_float_info (struct sfm_reader *r,
1267 const struct sfm_extension_record *record)
1269 double sysmis = parse_float (r, record->data, 0);
1270 double highest = parse_float (r, record->data, 8);
1271 double lowest = parse_float (r, record->data, 16);
1273 if (sysmis != SYSMIS)
1274 sys_warn (r, record->pos,
1275 _("File specifies unexpected value %g (%a) as %s, "
1276 "instead of %g (%a)."),
1277 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1279 if (highest != HIGHEST)
1280 sys_warn (r, record->pos,
1281 _("File specifies unexpected value %g (%a) as %s, "
1282 "instead of %g (%a)."),
1283 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1285 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1286 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1287 appears in a context (missing values) where SYSMIS cannot. */
1288 if (lowest != LOWEST && lowest != SYSMIS)
1289 sys_warn (r, record->pos,
1290 _("File specifies unexpected value %g (%a) as %s, "
1291 "instead of %g (%a) or %g (%a)."),
1292 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1295 /* Parses record type 7, subtype 7 or 19. */
1297 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1298 struct dictionary *dict)
1300 struct text_record *text;
1301 struct mrset *mrset;
1303 text = open_text_record (r, record, false);
1306 const char *counted = NULL;
1309 struct stringi_set var_names;
1310 size_t allocated_vars;
1314 mrset = xzalloc (sizeof *mrset);
1316 name = text_get_token (text, ss_cstr ("="), NULL);
1319 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1321 if (mrset->name[0] != '$')
1323 sys_warn (r, record->pos,
1324 _("`%s' does not begin with `$' at offset %zu "
1325 "in MRSETS record."), mrset->name, text_pos (text));
1329 if (text_match (text, 'C'))
1331 mrset->type = MRSET_MC;
1332 if (!text_match (text, ' '))
1334 sys_warn (r, record->pos,
1335 _("Missing space following `%c' at offset %zu "
1336 "in MRSETS record."), 'C', text_pos (text));
1340 else if (text_match (text, 'D'))
1342 mrset->type = MRSET_MD;
1343 mrset->cat_source = MRSET_VARLABELS;
1345 else if (text_match (text, 'E'))
1349 mrset->type = MRSET_MD;
1350 mrset->cat_source = MRSET_COUNTEDVALUES;
1351 if (!text_match (text, ' '))
1353 sys_warn (r, record->pos,
1354 _("Missing space following `%c' at offset %zu "
1355 "in MRSETS record."), 'E', text_pos (text));
1359 number = text_get_token (text, ss_cstr (" "), NULL);
1360 if (!strcmp (number, "11"))
1361 mrset->label_from_var_label = true;
1362 else if (strcmp (number, "1"))
1363 sys_warn (r, record->pos,
1364 _("Unexpected label source value `%s' following `E' "
1365 "at offset %zu in MRSETS record."),
1366 number, text_pos (text));
1370 sys_warn (r, record->pos,
1371 _("Missing `C', `D', or `E' at offset %zu "
1372 "in MRSETS record."),
1377 if (mrset->type == MRSET_MD)
1379 counted = text_parse_counted_string (r, text);
1380 if (counted == NULL)
1384 label = text_parse_counted_string (r, text);
1387 if (label[0] != '\0')
1388 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1390 stringi_set_init (&var_names);
1395 const char *raw_var_name;
1396 struct variable *var;
1399 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1400 if (raw_var_name == NULL)
1402 sys_warn (r, record->pos,
1403 _("Missing new-line parsing variable names "
1404 "at offset %zu in MRSETS record."),
1408 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1410 var = dict_lookup_var (dict, var_name);
1416 if (!stringi_set_insert (&var_names, var_name))
1418 sys_warn (r, record->pos,
1419 _("Duplicate variable name %s "
1420 "at offset %zu in MRSETS record."),
1421 var_name, text_pos (text));
1427 if (mrset->label == NULL && mrset->label_from_var_label
1428 && var_has_label (var))
1429 mrset->label = xstrdup (var_get_label (var));
1432 && var_get_type (var) != var_get_type (mrset->vars[0]))
1434 sys_warn (r, record->pos,
1435 _("MRSET %s contains both string and "
1436 "numeric variables."), name);
1439 width = MIN (width, var_get_width (var));
1441 if (mrset->n_vars >= allocated_vars)
1442 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1443 sizeof *mrset->vars);
1444 mrset->vars[mrset->n_vars++] = var;
1446 while (delimiter != '\n');
1448 if (mrset->n_vars < 2)
1450 sys_warn (r, record->pos,
1451 _("MRSET %s has only %zu variables."), mrset->name,
1453 mrset_destroy (mrset);
1454 stringi_set_destroy (&var_names);
1458 if (mrset->type == MRSET_MD)
1460 mrset->width = width;
1461 value_init (&mrset->counted, width);
1463 mrset->counted.f = c_strtod (counted, NULL);
1465 value_copy_str_rpad (&mrset->counted, width,
1466 (const uint8_t *) counted, ' ');
1469 dict_add_mrset (dict, mrset);
1471 stringi_set_destroy (&var_names);
1473 mrset_destroy (mrset);
1474 close_text_record (r, text);
1477 /* Read record type 7, subtype 11, which specifies how variables
1478 should be displayed in GUI environments. */
1480 parse_display_parameters (struct sfm_reader *r,
1481 const struct sfm_extension_record *record,
1482 struct dictionary *dict)
1484 bool includes_width;
1485 bool warned = false;
1490 n_vars = dict_get_var_cnt (dict);
1491 if (record->count == 3 * n_vars)
1492 includes_width = true;
1493 else if (record->count == 2 * n_vars)
1494 includes_width = false;
1497 sys_warn (r, record->pos,
1498 _("Extension 11 has bad count %zu (for %zu variables)."),
1499 record->count, n_vars);
1504 for (i = 0; i < n_vars; ++i)
1506 struct variable *v = dict_get_var (dict, i);
1507 int measure, width, align;
1509 measure = parse_int (r, record->data, ofs);
1514 width = parse_int (r, record->data, ofs);
1520 align = parse_int (r, record->data, ofs);
1523 /* SPSS sometimes seems to set variables' measure to zero. */
1527 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1530 sys_warn (r, record->pos,
1531 _("Invalid variable display parameters for variable "
1532 "%zu (%s). Default parameters substituted."),
1533 i, var_get_name (v));
1538 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1539 : measure == 2 ? MEASURE_ORDINAL
1541 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1542 : align == 1 ? ALIGN_RIGHT
1545 /* Older versions (SPSS 9.0) sometimes set the display
1546 width to zero. This causes confusion in the GUI, so
1547 only set the width if it is nonzero. */
1549 var_set_display_width (v, width);
1554 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1555 const char *new_name)
1557 size_t n_short_names;
1561 /* Renaming a variable may clear its short names, but we
1562 want to retain them, so we save them and re-set them
1564 n_short_names = var_get_short_name_cnt (var);
1565 short_names = xnmalloc (n_short_names, sizeof *short_names);
1566 for (i = 0; i < n_short_names; i++)
1568 const char *s = var_get_short_name (var, i);
1569 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1572 /* Set long name. */
1573 dict_rename_var (dict, var, new_name);
1575 /* Restore short names. */
1576 for (i = 0; i < n_short_names; i++)
1578 var_set_short_name (var, i, short_names[i]);
1579 free (short_names[i]);
1584 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1585 to each short name. Modifies variable names in DICT accordingly. */
1587 parse_long_var_name_map (struct sfm_reader *r,
1588 const struct sfm_extension_record *record,
1589 struct dictionary *dict)
1591 struct text_record *text;
1592 struct variable *var;
1597 /* There are no long variable names. Use the short variable names,
1598 converted to lowercase, as the long variable names. */
1601 for (i = 0; i < dict_get_var_cnt (dict); i++)
1603 struct variable *var = dict_get_var (dict, i);
1606 new_name = utf8_to_lower (var_get_name (var));
1607 rename_var_and_save_short_names (dict, var, new_name);
1614 /* Rename each of the variables, one by one. (In a correctly constructed
1615 system file, this cannot create any intermediate duplicate variable names,
1616 because all of the new variable names are longer than any of the old
1617 variable names and thus there cannot be any overlaps.) */
1618 text = open_text_record (r, record, true);
1619 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1621 /* Validate long name. */
1622 if (!dict_id_is_valid (dict, long_name, false))
1624 sys_warn (r, record->pos,
1625 _("Long variable mapping from %s to invalid "
1626 "variable name `%s'."),
1627 var_get_name (var), long_name);
1631 /* Identify any duplicates. */
1632 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1633 && dict_lookup_var (dict, long_name) != NULL)
1635 sys_warn (r, record->pos,
1636 _("Duplicate long variable name `%s'."), long_name);
1640 rename_var_and_save_short_names (dict, var, long_name);
1642 close_text_record (r, text);
1645 /* Reads record type 7, subtype 14, which gives the real length
1646 of each very long string. Rearranges DICT accordingly. */
1648 parse_long_string_map (struct sfm_reader *r,
1649 const struct sfm_extension_record *record,
1650 struct dictionary *dict)
1652 struct text_record *text;
1653 struct variable *var;
1656 text = open_text_record (r, record, true);
1657 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1659 size_t idx = var_get_dict_index (var);
1665 length = strtol (length_s, NULL, 10);
1666 if (length < 1 || length > MAX_STRING)
1668 sys_warn (r, record->pos,
1669 _("%s listed as string of invalid length %s "
1670 "in very long string record."),
1671 var_get_name (var), length_s);
1675 /* Check segments. */
1676 segment_cnt = sfm_width_to_segments (length);
1677 if (segment_cnt == 1)
1679 sys_warn (r, record->pos,
1680 _("%s listed in very long string record with width %s, "
1681 "which requires only one segment."),
1682 var_get_name (var), length_s);
1685 if (idx + segment_cnt > dict_get_var_cnt (dict))
1686 sys_error (r, record->pos,
1687 _("Very long string %s overflows dictionary."),
1688 var_get_name (var));
1690 /* Get the short names from the segments and check their
1692 for (i = 0; i < segment_cnt; i++)
1694 struct variable *seg = dict_get_var (dict, idx + i);
1695 int alloc_width = sfm_segment_alloc_width (length, i);
1696 int width = var_get_width (seg);
1699 var_set_short_name (var, i, var_get_short_name (seg, 0));
1700 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1701 sys_error (r, record->pos,
1702 _("Very long string with width %ld has segment %d "
1703 "of width %d (expected %d)."),
1704 length, i, width, alloc_width);
1706 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1707 var_set_width (var, length);
1709 close_text_record (r, text);
1710 dict_compact_values (dict);
1714 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1715 const struct sfm_var_record *var_recs, size_t n_var_recs,
1716 const struct sfm_value_label_record *record)
1718 struct variable **vars;
1722 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1723 for (i = 0; i < record->n_labels; i++)
1724 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1725 record->labels[i].label, -1,
1728 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1729 for (i = 0; i < record->n_vars; i++)
1730 vars[i] = lookup_var_by_index (r, record->pos,
1731 var_recs, n_var_recs, record->vars[i]);
1733 for (i = 1; i < record->n_vars; i++)
1734 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1735 sys_error (r, record->pos,
1736 _("Variables associated with value label are not all of "
1737 "identical type. Variable %s is %s, but variable "
1739 var_get_name (vars[0]),
1740 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1741 var_get_name (vars[i]),
1742 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1744 for (i = 0; i < record->n_vars; i++)
1746 struct variable *var = vars[i];
1750 width = var_get_width (var);
1752 sys_error (r, record->pos,
1753 _("Value labels may not be added to long string "
1754 "variables (e.g. %s) using records types 3 and 4."),
1755 var_get_name (var));
1757 for (j = 0; j < record->n_labels; j++)
1759 struct sfm_value_label *label = &record->labels[j];
1762 value_init (&value, width);
1764 value.f = parse_float (r, label->value, 0);
1766 memcpy (value_str_rw (&value, width), label->value, width);
1768 if (!var_add_value_label (var, &value, utf8_labels[j]))
1770 if (var_is_numeric (var))
1771 sys_warn (r, record->pos,
1772 _("Duplicate value label for %g on %s."),
1773 value.f, var_get_name (var));
1775 sys_warn (r, record->pos,
1776 _("Duplicate value label for `%.*s' on %s."),
1777 width, value_str (&value, width),
1778 var_get_name (var));
1781 value_destroy (&value, width);
1785 pool_free (r->pool, vars);
1786 for (i = 0; i < record->n_labels; i++)
1787 pool_free (r->pool, utf8_labels[i]);
1788 pool_free (r->pool, utf8_labels);
1791 static struct variable *
1792 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1793 const struct sfm_var_record *var_recs, size_t n_var_recs,
1796 const struct sfm_var_record *rec;
1798 if (idx < 1 || idx > n_var_recs)
1800 sys_error (r, offset,
1801 _("Variable index %d not in valid range 1...%zu."),
1806 rec = &var_recs[idx - 1];
1807 if (rec->var == NULL)
1809 sys_error (r, offset,
1810 _("Variable index %d refers to long string continuation."),
1818 /* Parses a set of custom attributes from TEXT into ATTRS.
1819 ATTRS may be a null pointer, in which case the attributes are
1820 read but discarded. */
1822 parse_attributes (struct sfm_reader *r, struct text_record *text,
1823 struct attrset *attrs)
1827 struct attribute *attr;
1831 /* Parse the key. */
1832 key = text_get_token (text, ss_cstr ("("), NULL);
1836 attr = attribute_create (key);
1837 for (index = 1; ; index++)
1839 /* Parse the value. */
1843 value = text_get_token (text, ss_cstr ("\n"), NULL);
1846 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1851 length = strlen (value);
1852 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1854 value[length - 1] = '\0';
1855 attribute_add_value (attr, value + 1);
1860 _("Attribute value %s[%d] is not quoted: %s."),
1862 attribute_add_value (attr, value);
1865 /* Was this the last value for this attribute? */
1866 if (text_match (text, ')'))
1870 attrset_add (attrs, attr);
1872 attribute_destroy (attr);
1874 while (!text_match (text, '/'));
1877 /* Reads record type 7, subtype 17, which lists custom
1878 attributes on the data file. */
1880 parse_data_file_attributes (struct sfm_reader *r,
1881 const struct sfm_extension_record *record,
1882 struct dictionary *dict)
1884 struct text_record *text = open_text_record (r, record, true);
1885 parse_attributes (r, text, dict_get_attributes (dict));
1886 close_text_record (r, text);
1889 /* Parses record type 7, subtype 18, which lists custom
1890 attributes on individual variables. */
1892 parse_variable_attributes (struct sfm_reader *r,
1893 const struct sfm_extension_record *record,
1894 struct dictionary *dict)
1896 struct text_record *text;
1897 struct variable *var;
1899 text = open_text_record (r, record, true);
1900 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1901 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1902 close_text_record (r, text);
1906 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
1908 size_t n_warnings = 0;
1911 for (i = 0; i < dict_get_var_cnt (dict); i++)
1913 struct variable *var = dict_get_var (dict, i);
1914 struct attrset *attrs = var_get_attributes (var);
1915 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
1918 int value = atoi (attribute_get_value (attr, 0));
1940 role = ROLE_PARTITION;
1949 if (n_warnings++ == 0)
1950 sys_warn (r, 0, _("Invalid role for variable %s."),
1951 var_get_name (var));
1954 var_set_role (var, role);
1959 sys_warn (r, 0, _("%zu other variables had invalid roles."),
1964 check_overflow (struct sfm_reader *r,
1965 const struct sfm_extension_record *record,
1966 size_t ofs, size_t length)
1968 size_t end = record->size * record->count;
1969 if (length >= end || ofs + length > end)
1970 sys_error (r, record->pos + end,
1971 _("Long string value label record ends unexpectedly."));
1975 parse_long_string_value_labels (struct sfm_reader *r,
1976 const struct sfm_extension_record *record,
1977 struct dictionary *dict)
1979 const char *dict_encoding = dict_get_encoding (dict);
1980 size_t end = record->size * record->count;
1987 struct variable *var;
1992 /* Parse variable name length. */
1993 check_overflow (r, record, ofs, 4);
1994 var_name_len = parse_int (r, record->data, ofs);
1997 /* Parse variable name, width, and number of labels. */
1998 check_overflow (r, record, ofs, var_name_len + 8);
1999 var_name = recode_string_pool ("UTF-8", dict_encoding,
2000 (const char *) record->data + ofs,
2001 var_name_len, r->pool);
2002 width = parse_int (r, record->data, ofs + var_name_len);
2003 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2004 ofs += var_name_len + 8;
2006 /* Look up 'var' and validate. */
2007 var = dict_lookup_var (dict, var_name);
2009 sys_warn (r, record->pos + ofs,
2010 _("Ignoring long string value record for "
2011 "unknown variable %s."), var_name);
2012 else if (var_is_numeric (var))
2014 sys_warn (r, record->pos + ofs,
2015 _("Ignoring long string value record for "
2016 "numeric variable %s."), var_name);
2019 else if (width != var_get_width (var))
2021 sys_warn (r, record->pos + ofs,
2022 _("Ignoring long string value record for variable %s "
2023 "because the record's width (%d) does not match the "
2024 "variable's width (%d)."),
2025 var_name, width, var_get_width (var));
2030 value_init_pool (r->pool, &value, width);
2031 for (i = 0; i < n_labels; i++)
2033 size_t value_length, label_length;
2034 bool skip = var == NULL;
2036 /* Parse value length. */
2037 check_overflow (r, record, ofs, 4);
2038 value_length = parse_int (r, record->data, ofs);
2042 check_overflow (r, record, ofs, value_length);
2045 if (value_length == width)
2046 memcpy (value_str_rw (&value, width),
2047 (const uint8_t *) record->data + ofs, width);
2050 sys_warn (r, record->pos + ofs,
2051 _("Ignoring long string value %zu for variable "
2052 "%s, with width %d, that has bad value "
2054 i, var_get_name (var), width, value_length);
2058 ofs += value_length;
2060 /* Parse label length. */
2061 check_overflow (r, record, ofs, 4);
2062 label_length = parse_int (r, record->data, ofs);
2066 check_overflow (r, record, ofs, label_length);
2071 label = recode_string_pool ("UTF-8", dict_encoding,
2072 (const char *) record->data + ofs,
2073 label_length, r->pool);
2074 if (!var_add_value_label (var, &value, label))
2075 sys_warn (r, record->pos + ofs,
2076 _("Duplicate value label for `%.*s' on %s."),
2077 width, value_str (&value, width),
2078 var_get_name (var));
2079 pool_free (r->pool, label);
2081 ofs += label_length;
2088 static void partial_record (struct sfm_reader *r)
2091 static void read_error (struct casereader *, const struct sfm_reader *);
2093 static bool read_case_number (struct sfm_reader *, double *);
2094 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2095 static int read_opcode (struct sfm_reader *);
2096 static bool read_compressed_number (struct sfm_reader *, double *);
2097 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2098 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2099 static bool skip_whole_strings (struct sfm_reader *, size_t);
2101 /* Reads and returns one case from READER's file. Returns a null
2102 pointer if not successful. */
2103 static struct ccase *
2104 sys_file_casereader_read (struct casereader *reader, void *r_)
2106 struct sfm_reader *r = r_;
2107 struct ccase *volatile c;
2113 c = case_create (r->proto);
2114 if (setjmp (r->bail_out))
2116 casereader_force_error (reader);
2121 for (i = 0; i < r->sfm_var_cnt; i++)
2123 struct sfm_var *sv = &r->sfm_vars[i];
2124 union value *v = case_data_rw_idx (c, sv->case_index);
2126 if (sv->var_width == 0)
2128 if (!read_case_number (r, &v->f))
2133 uint8_t *s = value_str_rw (v, sv->var_width);
2134 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2136 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2145 if (r->case_cnt != -1)
2146 read_error (reader, r);
2151 /* Issues an error that R ends in a partial record. */
2153 partial_record (struct sfm_reader *r)
2155 sys_error (r, r->pos, _("File ends in partial case."));
2158 /* Issues an error that an unspecified error occurred SFM, and
2161 read_error (struct casereader *r, const struct sfm_reader *sfm)
2163 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2164 casereader_force_error (r);
2167 /* Reads a number from R and stores its value in *D.
2168 If R is compressed, reads a compressed number;
2169 otherwise, reads a number in the regular way.
2170 Returns true if successful, false if end of file is
2171 reached immediately. */
2173 read_case_number (struct sfm_reader *r, double *d)
2178 if (!try_read_bytes (r, number, sizeof number))
2180 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2184 return read_compressed_number (r, d);
2187 /* Reads LENGTH string bytes from R into S.
2188 Always reads a multiple of 8 bytes; if LENGTH is not a
2189 multiple of 8, then extra bytes are read and discarded without
2191 Reads compressed strings if S is compressed.
2192 Returns true if successful, false if end of file is
2193 reached immediately. */
2195 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2197 size_t whole = ROUND_DOWN (length, 8);
2198 size_t partial = length % 8;
2202 if (!read_whole_strings (r, s, whole))
2209 if (!read_whole_strings (r, bounce, sizeof bounce))
2215 memcpy (s + whole, bounce, partial);
2221 /* Reads and returns the next compression opcode from R. */
2223 read_opcode (struct sfm_reader *r)
2225 assert (r->compressed);
2229 if (r->opcode_idx >= sizeof r->opcodes)
2231 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2235 opcode = r->opcodes[r->opcode_idx++];
2242 /* Reads a compressed number from R and stores its value in D.
2243 Returns true if successful, false if end of file is
2244 reached immediately. */
2246 read_compressed_number (struct sfm_reader *r, double *d)
2248 int opcode = read_opcode (r);
2256 *d = read_float (r);
2260 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2261 if (!r->corruption_warning)
2263 r->corruption_warning = true;
2264 sys_warn (r, r->pos,
2265 _("Possible compressed data corruption: "
2266 "compressed spaces appear in numeric field."));
2275 *d = opcode - r->bias;
2282 /* Reads a compressed 8-byte string segment from R and stores it
2284 Returns true if successful, false if end of file is
2285 reached immediately. */
2287 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2289 int opcode = read_opcode (r);
2297 read_bytes (r, dst, 8);
2301 memset (dst, ' ', 8);
2306 double value = opcode - r->bias;
2307 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2310 /* This has actually been seen "in the wild". The submitter of the
2311 file that showed that the contents decoded as spaces, but they
2312 were at the end of the field so it's possible that the null
2313 bytes just acted as null terminators. */
2315 else if (!r->corruption_warning)
2317 r->corruption_warning = true;
2318 sys_warn (r, r->pos,
2319 _("Possible compressed data corruption: "
2320 "string contains compressed integer (opcode %d)."),
2330 /* Reads LENGTH string bytes from R into S.
2331 LENGTH must be a multiple of 8.
2332 Reads compressed strings if S is compressed.
2333 Returns true if successful, false if end of file is
2334 reached immediately. */
2336 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2338 assert (length % 8 == 0);
2340 return try_read_bytes (r, s, length);
2344 for (ofs = 0; ofs < length; ofs += 8)
2345 if (!read_compressed_string (r, s + ofs))
2355 /* Skips LENGTH string bytes from R.
2356 LENGTH must be a multiple of 8.
2357 (LENGTH is also limited to 1024, but that's only because the
2358 current caller never needs more than that many bytes.)
2359 Returns true if successful, false if end of file is
2360 reached immediately. */
2362 skip_whole_strings (struct sfm_reader *r, size_t length)
2364 uint8_t buffer[1024];
2365 assert (length < sizeof buffer);
2366 return read_whole_strings (r, buffer, length);
2369 /* Helpers for reading records that contain structured text
2372 /* Maximum number of warnings to issue for a single text
2374 #define MAX_TEXT_WARNINGS 5
2379 struct substring buffer; /* Record contents. */
2380 off_t start; /* Starting offset in file. */
2381 size_t pos; /* Current position in buffer. */
2382 int n_warnings; /* Number of warnings issued or suppressed. */
2383 bool recoded; /* Recoded into UTF-8? */
2386 static struct text_record *
2387 open_text_record (struct sfm_reader *r,
2388 const struct sfm_extension_record *record,
2389 bool recode_to_utf8)
2391 struct text_record *text;
2392 struct substring raw;
2394 text = pool_alloc (r->pool, sizeof *text);
2395 raw = ss_buffer (record->data, record->size * record->count);
2396 text->start = record->pos;
2397 text->buffer = (recode_to_utf8
2398 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2401 text->n_warnings = 0;
2402 text->recoded = recode_to_utf8;
2407 /* Closes TEXT, frees its storage, and issues a final warning
2408 about suppressed warnings if necesary. */
2410 close_text_record (struct sfm_reader *r, struct text_record *text)
2412 if (text->n_warnings > MAX_TEXT_WARNINGS)
2413 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2414 text->n_warnings - MAX_TEXT_WARNINGS);
2416 pool_free (r->pool, ss_data (text->buffer));
2419 /* Reads a variable=value pair from TEXT.
2420 Looks up the variable in DICT and stores it into *VAR.
2421 Stores a null-terminated value into *VALUE. */
2423 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2424 struct text_record *text,
2425 struct variable **var, char **value)
2429 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2432 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2436 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2437 ss_buffer ("\t\0", 2));
2445 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2446 struct text_record *text, struct substring delimiters,
2447 struct variable **var)
2451 name = text_get_token (text, delimiters, NULL);
2455 *var = dict_lookup_var (dict, name);
2459 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2466 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2467 struct text_record *text, struct substring delimiters,
2468 struct variable **var)
2470 char *short_name = text_get_token (text, delimiters, NULL);
2471 if (short_name == NULL)
2474 *var = dict_lookup_var (dict, short_name);
2476 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2481 /* Displays a warning for the current file position, limiting the
2482 number to MAX_TEXT_WARNINGS for TEXT. */
2484 text_warn (struct sfm_reader *r, struct text_record *text,
2485 const char *format, ...)
2487 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2491 va_start (args, format);
2492 sys_msg (r, text->start + text->pos, MW, format, args);
2498 text_get_token (struct text_record *text, struct substring delimiters,
2501 struct substring token;
2504 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2507 end = &ss_data (token)[ss_length (token)];
2508 if (delimiter != NULL)
2511 return ss_data (token);
2514 /* Reads a integer value expressed in decimal, then a space, then a string that
2515 consists of exactly as many bytes as specified by the integer, then a space,
2516 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2517 buffer (so the caller should not free the string). */
2519 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2527 while (text->pos < text->buffer.length)
2529 int c = text->buffer.string[text->pos];
2530 if (c < '0' || c > '9')
2532 n = (n * 10) + (c - '0');
2535 if (text->pos >= text->buffer.length || start == text->pos)
2537 sys_warn (r, text->start,
2538 _("Expecting digit at offset %zu in MRSETS record."),
2543 if (!text_match (text, ' '))
2545 sys_warn (r, text->start,
2546 _("Expecting space at offset %zu in MRSETS record."),
2551 if (text->pos + n > text->buffer.length)
2553 sys_warn (r, text->start,
2554 _("%zu-byte string starting at offset %zu "
2555 "exceeds record length %zu."),
2556 n, text->pos, text->buffer.length);
2560 s = &text->buffer.string[text->pos];
2563 sys_warn (r, text->start,
2564 _("Expecting space at offset %zu following %zu-byte string."),
2574 text_match (struct text_record *text, char c)
2576 if (text->buffer.string[text->pos] == c)
2585 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2586 inside the TEXT's string. */
2588 text_pos (const struct text_record *text)
2595 /* Displays a corruption message. */
2597 sys_msg (struct sfm_reader *r, off_t offset,
2598 int class, const char *format, va_list args)
2603 ds_init_empty (&text);
2605 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2606 fh_get_file_name (r->fh), (long long int) offset);
2608 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2609 ds_put_vformat (&text, format, args);
2611 m.category = msg_class_to_category (class);
2612 m.severity = msg_class_to_severity (class);
2618 m.text = ds_cstr (&text);
2623 /* Displays a warning for offset OFFSET in the file. */
2625 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2629 va_start (args, format);
2630 sys_msg (r, offset, MW, format, args);
2634 /* Displays an error for the current file position,
2635 marks it as in an error state,
2636 and aborts reading it using longjmp. */
2638 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2642 va_start (args, format);
2643 sys_msg (r, offset, ME, format, args);
2647 longjmp (r->bail_out, 1);
2650 /* Reads BYTE_CNT bytes into BUF.
2651 Returns true if exactly BYTE_CNT bytes are successfully read.
2652 Aborts if an I/O error or a partial read occurs.
2653 If EOF_IS_OK, then an immediate end-of-file causes false to be
2654 returned; otherwise, immediate end-of-file causes an abort
2657 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2658 void *buf, size_t byte_cnt)
2660 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2661 r->pos += bytes_read;
2662 if (bytes_read == byte_cnt)
2664 else if (ferror (r->file))
2665 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2666 else if (!eof_is_ok || bytes_read != 0)
2667 sys_error (r, r->pos, _("Unexpected end of file."));
2672 /* Reads BYTE_CNT into BUF.
2673 Aborts upon I/O error or if end-of-file is encountered. */
2675 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2677 read_bytes_internal (r, false, buf, byte_cnt);
2680 /* Reads BYTE_CNT bytes into BUF.
2681 Returns true if exactly BYTE_CNT bytes are successfully read.
2682 Returns false if an immediate end-of-file is encountered.
2683 Aborts if an I/O error or a partial read occurs. */
2685 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2687 return read_bytes_internal (r, true, buf, byte_cnt);
2690 /* Reads a 32-bit signed integer from R and returns its value in
2693 read_int (struct sfm_reader *r)
2696 read_bytes (r, integer, sizeof integer);
2697 return integer_get (r->integer_format, integer, sizeof integer);
2700 /* Reads a 64-bit floating-point number from R and returns its
2701 value in host format. */
2703 read_float (struct sfm_reader *r)
2706 read_bytes (r, number, sizeof number);
2707 return float_get_double (r->float_format, number);
2711 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2713 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2717 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2719 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2722 /* Reads exactly SIZE - 1 bytes into BUFFER
2723 and stores a null byte into BUFFER[SIZE - 1]. */
2725 read_string (struct sfm_reader *r, char *buffer, size_t size)
2728 read_bytes (r, buffer, size - 1);
2729 buffer[size - 1] = '\0';
2732 /* Skips BYTES bytes forward in R. */
2734 skip_bytes (struct sfm_reader *r, size_t bytes)
2739 size_t chunk = MIN (sizeof buffer, bytes);
2740 read_bytes (r, buffer, chunk);
2745 static const struct casereader_class sys_file_casereader_class =
2747 sys_file_casereader_read,
2748 sys_file_casereader_destroy,