1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-strtod.h"
54 #include "gl/c-ctype.h"
55 #include "gl/inttostr.h"
56 #include "gl/localcharset.h"
57 #include "gl/minmax.h"
58 #include "gl/unlocked-io.h"
59 #include "gl/xalloc.h"
63 #define _(msgid) gettext (msgid)
64 #define N_(msgid) (msgid)
68 /* subtypes 0-2 unknown */
69 EXT_INTEGER = 3, /* Machine integer info. */
70 EXT_FLOAT = 4, /* Machine floating-point info. */
71 EXT_VAR_SETS = 5, /* Variable sets. */
72 EXT_DATE = 6, /* DATE. */
73 EXT_MRSETS = 7, /* Multiple response sets. */
74 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
75 /* subtypes 9-10 unknown */
76 EXT_DISPLAY = 11, /* Variable display parameters. */
77 /* subtype 12 unknown */
78 EXT_LONG_NAMES = 13, /* Long variable names. */
79 EXT_LONG_STRINGS = 14, /* Long strings. */
80 /* subtype 15 unknown */
81 EXT_NCASES = 16, /* Extended number of cases. */
82 EXT_FILE_ATTRS = 17, /* Data file attributes. */
83 EXT_VAR_ATTRS = 18, /* Variable attributes. */
84 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
85 EXT_ENCODING = 20, /* Character encoding. */
86 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
87 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
90 /* Fields from the top-level header record. */
91 struct sfm_header_record
93 char magic[5]; /* First 4 bytes of file, then null. */
94 int weight_idx; /* 0 if unweighted, otherwise a var index. */
95 int nominal_case_size; /* Number of var positions. */
97 /* These correspond to the members of struct sfm_file_info or a dictionary
98 but in the system file's encoding rather than ASCII. */
99 char creation_date[10]; /* "dd mmm yy". */
100 char creation_time[9]; /* "hh:mm:ss". */
101 char eye_catcher[61]; /* Eye-catcher string, then product name. */
102 char file_label[65]; /* File label. */
105 struct sfm_var_record
112 int missing_value_code;
115 struct variable *var;
118 struct sfm_value_label
124 struct sfm_value_label_record
127 struct sfm_value_label *labels;
134 struct sfm_document_record
141 struct sfm_extension_record
143 off_t pos; /* Starting offset in file. */
144 size_t size; /* Size of data elements. */
145 size_t count; /* Number of data elements. */
146 void *data; /* Contents. */
149 /* System file reader. */
152 /* Resource tracking. */
153 struct pool *pool; /* All system file state. */
154 jmp_buf bail_out; /* longjmp() target for error handling. */
157 struct file_handle *fh; /* File handle. */
158 struct fh_lock *lock; /* Mutual exclusion for file handle. */
159 FILE *file; /* File stream. */
160 off_t pos; /* Position in file. */
161 bool error; /* I/O or corruption error? */
162 struct caseproto *proto; /* Format of output cases. */
165 enum integer_format integer_format; /* On-disk integer format. */
166 enum float_format float_format; /* On-disk floating point format. */
167 struct sfm_var *sfm_vars; /* Variables. */
168 size_t sfm_var_cnt; /* Number of variables. */
169 casenumber case_cnt; /* Number of cases */
170 const char *encoding; /* String encoding. */
173 bool compressed; /* File is compressed? */
174 double bias; /* Compression bias, usually 100.0. */
175 uint8_t opcodes[8]; /* Current block of opcodes. */
176 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
177 bool corruption_warning; /* Warned about possible corruption? */
180 static const struct casereader_class sys_file_casereader_class;
182 static bool close_reader (struct sfm_reader *);
184 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
185 const struct sfm_var_record *,
188 static void sys_msg (struct sfm_reader *r, off_t, int class,
189 const char *format, va_list args)
190 PRINTF_FORMAT (4, 0);
191 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
192 PRINTF_FORMAT (3, 4);
193 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
197 static void read_bytes (struct sfm_reader *, void *, size_t);
198 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
199 static int read_int (struct sfm_reader *);
200 static double read_float (struct sfm_reader *);
201 static void read_string (struct sfm_reader *, char *, size_t);
202 static void skip_bytes (struct sfm_reader *, size_t);
204 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
205 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
207 static void read_variable_record (struct sfm_reader *,
208 struct sfm_var_record *);
209 static void read_value_label_record (struct sfm_reader *,
210 struct sfm_value_label_record *,
212 static struct sfm_document_record *read_document_record (struct sfm_reader *);
213 static struct sfm_extension_record *read_extension_record (
214 struct sfm_reader *, int subtype);
215 static void skip_extension_record (struct sfm_reader *, int subtype);
217 static const char *choose_encoding (
219 const struct sfm_header_record *,
220 const struct sfm_extension_record *ext_integer,
221 const struct sfm_extension_record *ext_encoding);
223 static struct text_record *open_text_record (
224 struct sfm_reader *, const struct sfm_extension_record *,
225 bool recode_to_utf8);
226 static void close_text_record (struct sfm_reader *,
227 struct text_record *);
228 static bool read_variable_to_value_pair (struct sfm_reader *,
230 struct text_record *,
231 struct variable **var, char **value);
232 static void text_warn (struct sfm_reader *r, struct text_record *text,
233 const char *format, ...)
234 PRINTF_FORMAT (3, 4);
235 static char *text_get_token (struct text_record *,
236 struct substring delimiters, char *delimiter);
237 static bool text_match (struct text_record *, char c);
238 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
239 struct text_record *,
240 struct substring delimiters,
242 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
243 struct text_record *,
244 struct substring delimiters,
246 static const char *text_parse_counted_string (struct sfm_reader *,
247 struct text_record *);
248 static size_t text_pos (const struct text_record *);
250 static bool close_reader (struct sfm_reader *r);
252 /* Dictionary reader. */
260 static void read_header (struct sfm_reader *, struct sfm_read_info *,
261 struct sfm_header_record *);
262 static void parse_header (struct sfm_reader *,
263 const struct sfm_header_record *,
264 struct sfm_read_info *, struct dictionary *);
265 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
266 struct sfm_var_record *, size_t n);
267 static void parse_format_spec (struct sfm_reader *, off_t pos,
268 unsigned int format, enum which_format,
269 struct variable *, int *format_warning_cnt);
270 static void parse_document (struct dictionary *, struct sfm_document_record *);
271 static void parse_display_parameters (struct sfm_reader *,
272 const struct sfm_extension_record *,
273 struct dictionary *);
274 static void parse_machine_integer_info (struct sfm_reader *,
275 const struct sfm_extension_record *,
276 struct sfm_read_info *);
277 static void parse_machine_float_info (struct sfm_reader *,
278 const struct sfm_extension_record *);
279 static void parse_mrsets (struct sfm_reader *,
280 const struct sfm_extension_record *,
281 struct dictionary *);
282 static void parse_long_var_name_map (struct sfm_reader *,
283 const struct sfm_extension_record *,
284 struct dictionary *);
285 static void parse_long_string_map (struct sfm_reader *,
286 const struct sfm_extension_record *,
287 struct dictionary *);
288 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
289 const struct sfm_var_record *,
291 const struct sfm_value_label_record *);
292 static void parse_data_file_attributes (struct sfm_reader *,
293 const struct sfm_extension_record *,
294 struct dictionary *);
295 static void parse_variable_attributes (struct sfm_reader *,
296 const struct sfm_extension_record *,
297 struct dictionary *);
298 static void parse_long_string_value_labels (struct sfm_reader *,
299 const struct sfm_extension_record *,
300 struct dictionary *);
302 /* Frees the strings inside INFO. */
304 sfm_read_info_destroy (struct sfm_read_info *info)
308 free (info->creation_date);
309 free (info->creation_time);
310 free (info->product);
314 /* Opens the system file designated by file handle FH for reading. Reads the
315 system file's dictionary into *DICT.
317 Ordinarily the reader attempts to automatically detect the character
318 encoding based on the file's contents. This isn't always possible,
319 especially for files written by old versions of SPSS or PSPP, so specifying
320 a nonnull ENCODING overrides the choice of character encoding.
322 If INFO is non-null, then it receives additional info about the system file,
323 which the caller must eventually free with sfm_read_info_destroy() when it
324 is no longer needed. */
326 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
327 struct dictionary **dictp, struct sfm_read_info *infop)
329 struct sfm_reader *volatile r = NULL;
330 struct sfm_read_info *volatile info;
332 struct sfm_header_record header;
334 struct sfm_var_record *vars;
335 size_t n_vars, allocated_vars;
337 struct sfm_value_label_record *labels;
338 size_t n_labels, allocated_labels;
340 struct sfm_document_record *document;
342 struct sfm_extension_record *extensions[32];
344 struct dictionary *volatile dict = NULL;
347 /* Create and initialize reader. */
348 r = pool_create_container (struct sfm_reader, pool);
354 r->opcode_idx = sizeof r->opcodes;
355 r->corruption_warning = false;
357 info = infop ? infop : xmalloc (sizeof *info);
358 memset (info, 0, sizeof *info);
360 /* TRANSLATORS: this fragment will be interpolated into
361 messages in fh_lock() that identify types of files. */
362 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
366 r->file = fn_open (fh_get_file_name (fh), "rb");
369 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
370 fh_get_file_name (r->fh), strerror (errno));
374 if (setjmp (r->bail_out))
378 read_header (r, info, &header);
381 n_vars = allocated_vars = 0;
384 n_labels = allocated_labels = 0;
388 memset (extensions, 0, sizeof extensions);
398 read_int (r); /* Skip filler. */
405 if (n_vars >= allocated_vars)
406 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
408 read_variable_record (r, &vars[n_vars++]);
412 if (n_labels >= allocated_labels)
413 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
415 read_value_label_record (r, &labels[n_labels++], n_vars);
419 /* A Type 4 record is always immediately after a type 3 record,
420 so the code for type 3 records reads the type 4 record too. */
421 sys_error (r, r->pos, _("Misplaced type 4 record."));
424 if (document != NULL)
425 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
426 document = read_document_record (r);
430 subtype = read_int (r);
431 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
434 _("Unrecognized record type 7, subtype %d. Please "
435 "send a copy of this file, and the syntax which "
436 "created it to %s."),
437 subtype, PACKAGE_BUGREPORT);
438 skip_extension_record (r, subtype);
440 else if (extensions[subtype] != NULL)
443 _("Record type 7, subtype %d found here has the same "
444 "type as the record found near offset 0x%llx. "
445 "Please send a copy of this file, and the syntax "
446 "which created it to %s."),
447 subtype, (long long int) extensions[subtype]->pos,
449 skip_extension_record (r, subtype);
452 extensions[subtype] = read_extension_record (r, subtype);
456 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
461 /* Now actually parse what we read.
463 First, figure out the correct character encoding, because this determines
464 how the rest of the header data is to be interpreted. */
465 dict = dict_create (encoding
467 : choose_encoding (r, &header, extensions[EXT_INTEGER],
468 extensions[EXT_ENCODING]));
469 r->encoding = dict_get_encoding (dict);
471 /* These records don't use variables at all. */
472 if (document != NULL)
473 parse_document (dict, document);
475 if (extensions[EXT_INTEGER] != NULL)
476 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
478 if (extensions[EXT_FLOAT] != NULL)
479 parse_machine_float_info (r, extensions[EXT_FLOAT]);
481 if (extensions[EXT_FILE_ATTRS] != NULL)
482 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
484 parse_header (r, &header, info, dict);
486 /* Parse the variable records, the basis of almost everything else. */
487 parse_variable_records (r, dict, vars, n_vars);
489 /* Parse value labels and the weight variable immediately after the variable
490 records. These records use indexes into var_recs[], so we must parse them
491 before those indexes become invalidated by very long string variables. */
492 for (i = 0; i < n_labels; i++)
493 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
494 if (header.weight_idx != 0)
496 struct variable *weight_var;
498 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
500 if (var_is_numeric (weight_var))
501 dict_set_weight (dict, weight_var);
503 sys_error (r, -1, _("Weighting variable must be numeric "
504 "(not string variable `%s')."),
505 var_get_name (weight_var));
508 if (extensions[EXT_DISPLAY] != NULL)
509 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
511 /* The following records use short names, so they need to be parsed before
512 parse_long_var_name_map() changes short names to long names. */
513 if (extensions[EXT_MRSETS] != NULL)
514 parse_mrsets (r, extensions[EXT_MRSETS], dict);
516 if (extensions[EXT_MRSETS2] != NULL)
517 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
519 if (extensions[EXT_LONG_STRINGS] != NULL)
520 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
522 /* Now rename variables to their long names. */
523 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
525 /* The following records use long names, so they need to follow renaming. */
526 if (extensions[EXT_VAR_ATTRS] != NULL)
527 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
529 if (extensions[EXT_LONG_LABELS] != NULL)
530 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
532 /* Warn if the actual amount of data per case differs from the
533 amount that the header claims. SPSS version 13 gets this
534 wrong when very long strings are involved, so don't warn in
536 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
537 && info->version_major != 13)
538 sys_warn (r, -1, _("File header claims %d variable positions but "
539 "%zu were read from file."),
540 header.nominal_case_size, n_vars);
542 /* Create an index of dictionary variable widths for
543 sfm_read_case to use. We cannot use the `struct variable's
544 from the dictionary we created, because the caller owns the
545 dictionary and may destroy or modify its variables. */
546 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
547 pool_register (r->pool, free, r->sfm_vars);
548 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
553 sfm_read_info_destroy (info);
557 return casereader_create_sequential
559 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
560 &sys_file_casereader_class, r);
565 sfm_read_info_destroy (info);
575 /* Closes a system file after we're done with it.
576 Returns true if an I/O error has occurred on READER, false
579 close_reader (struct sfm_reader *r)
588 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
590 msg (ME, _("Error closing system file `%s': %s."),
591 fh_get_file_name (r->fh), strerror (errno));
601 pool_destroy (r->pool);
606 /* Destroys READER. */
608 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
610 struct sfm_reader *r = r_;
614 /* Returns true if FILE is an SPSS system file,
617 sfm_detect (FILE *file)
621 if (fread (magic, 4, 1, file) != 1)
625 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
628 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
629 except for the string fields in *INFO, which parse_header() will initialize
630 later once the file's encoding is known. */
632 read_header (struct sfm_reader *r, struct sfm_read_info *info,
633 struct sfm_header_record *header)
635 uint8_t raw_layout_code[4];
638 read_string (r, header->magic, sizeof header->magic);
639 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
641 if (strcmp (ASCII_MAGIC, header->magic)
642 && strcmp (EBCDIC_MAGIC, header->magic))
643 sys_error (r, 0, _("This is not an SPSS system file."));
645 /* Identify integer format. */
646 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
647 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
649 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
651 || (r->integer_format != INTEGER_MSB_FIRST
652 && r->integer_format != INTEGER_LSB_FIRST))
653 sys_error (r, 64, _("This is not an SPSS system file."));
655 header->nominal_case_size = read_int (r);
656 if (header->nominal_case_size < 0
657 || header->nominal_case_size > INT_MAX / 16)
658 header->nominal_case_size = -1;
660 r->compressed = read_int (r) != 0;
662 header->weight_idx = read_int (r);
664 r->case_cnt = read_int (r);
665 if ( r->case_cnt > INT_MAX / 2)
668 /* Identify floating-point format and obtain compression bias. */
669 read_bytes (r, raw_bias, sizeof raw_bias);
670 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
672 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
674 if (memcmp (raw_bias, zero_bias, 8))
675 sys_warn (r, r->pos - 8,
676 _("Compression bias is not the usual "
677 "value of 100, or system file uses unrecognized "
678 "floating-point format."));
681 /* Some software is known to write all-zeros to this
682 field. Such software also writes floating-point
683 numbers in the format that we expect by default
684 (it seems that all software most likely does, in
685 reality), so don't warn in this case. */
688 if (r->integer_format == INTEGER_MSB_FIRST)
689 r->float_format = FLOAT_IEEE_DOUBLE_BE;
691 r->float_format = FLOAT_IEEE_DOUBLE_LE;
693 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
695 read_string (r, header->creation_date, sizeof header->creation_date);
696 read_string (r, header->creation_time, sizeof header->creation_time);
697 read_string (r, header->file_label, sizeof header->file_label);
700 info->integer_format = r->integer_format;
701 info->float_format = r->float_format;
702 info->compressed = r->compressed;
703 info->case_cnt = r->case_cnt;
706 /* Reads a variable (type 2) record from R into RECORD. */
708 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
710 int has_variable_label;
712 memset (record, 0, sizeof *record);
714 record->pos = r->pos;
715 record->width = read_int (r);
716 has_variable_label = read_int (r);
717 record->missing_value_code = read_int (r);
718 record->print_format = read_int (r);
719 record->write_format = read_int (r);
720 read_bytes (r, record->name, sizeof record->name);
722 if (has_variable_label == 1)
724 enum { MAX_LABEL_LEN = 255 };
725 size_t len, read_len;
729 /* Read up to MAX_LABEL_LEN bytes of label. */
730 read_len = MIN (MAX_LABEL_LEN, len);
731 record->label = pool_malloc (r->pool, read_len + 1);
732 read_string (r, record->label, read_len + 1);
734 /* Skip unread label bytes. */
735 skip_bytes (r, len - read_len);
737 /* Skip label padding up to multiple of 4 bytes. */
738 skip_bytes (r, ROUND_UP (len, 4) - len);
740 else if (has_variable_label != 0)
741 sys_error (r, record->pos,
742 _("Variable label indicator field is not 0 or 1."));
744 /* Set missing values. */
745 if (record->missing_value_code != 0)
747 int code = record->missing_value_code;
748 if (record->width == 0)
750 if (code < -3 || code > 3 || code == -1)
751 sys_error (r, record->pos,
752 _("Numeric missing value indicator field is not "
753 "-3, -2, 0, 1, 2, or 3."));
757 if (code < 1 || code > 3)
758 sys_error (r, record->pos,
759 _("String missing value indicator field is not "
763 read_bytes (r, record->missing, 8 * abs (code));
767 /* Reads value labels from R into RECORD. */
769 read_value_label_record (struct sfm_reader *r,
770 struct sfm_value_label_record *record,
775 /* Read type 3 record. */
776 record->pos = r->pos;
777 record->n_labels = read_int (r);
778 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
779 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
781 record->labels = pool_nmalloc (r->pool, record->n_labels,
782 sizeof *record->labels);
783 for (i = 0; i < record->n_labels; i++)
785 struct sfm_value_label *label = &record->labels[i];
786 unsigned char label_len;
789 read_bytes (r, label->value, sizeof label->value);
791 /* Read label length. */
792 read_bytes (r, &label_len, sizeof label_len);
793 padded_len = ROUND_UP (label_len + 1, 8);
795 /* Read label, padding. */
796 label->label = pool_malloc (r->pool, padded_len + 1);
797 read_bytes (r, label->label, padded_len - 1);
798 label->label[label_len] = '\0';
801 /* Read record type of type 4 record. */
802 if (read_int (r) != 4)
803 sys_error (r, r->pos - 4,
804 _("Variable index record (type 4) does not immediately "
805 "follow value label record (type 3) as it should."));
807 /* Read number of variables associated with value label from type 4
809 record->n_vars = read_int (r);
810 if (record->n_vars < 1 || record->n_vars > n_vars)
811 sys_error (r, r->pos - 4,
812 _("Number of variables associated with a value label (%zu) "
813 "is not between 1 and the number of variables (%zu)."),
814 record->n_vars, n_vars);
815 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
816 for (i = 0; i < record->n_vars; i++)
817 record->vars[i] = read_int (r);
820 /* Reads a document record from R and returns it. */
821 static struct sfm_document_record *
822 read_document_record (struct sfm_reader *r)
824 struct sfm_document_record *record;
827 record = pool_malloc (r->pool, sizeof *record);
828 record->pos = r->pos;
830 n_lines = read_int (r);
831 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
832 sys_error (r, record->pos,
833 _("Number of document lines (%d) "
834 "must be greater than 0 and less than %d."),
835 n_lines, INT_MAX / DOC_LINE_LENGTH);
837 record->n_lines = n_lines;
838 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
839 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
845 read_extension_record_header (struct sfm_reader *r, int subtype,
846 struct sfm_extension_record *record)
848 record->pos = r->pos;
849 record->size = read_int (r);
850 record->count = read_int (r);
852 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
853 allows an extra byte for a null terminator, used by some
854 extension processing routines. */
855 if (record->size != 0
856 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
857 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
860 /* Reads an extension record from R into RECORD. */
861 static struct sfm_extension_record *
862 read_extension_record (struct sfm_reader *r, int subtype)
864 struct extension_record_type
871 static const struct extension_record_type types[] =
873 /* Implemented record types. */
874 { EXT_INTEGER, 4, 8 },
876 { EXT_MRSETS, 1, 0 },
877 { EXT_DISPLAY, 4, 0 },
878 { EXT_LONG_NAMES, 1, 0 },
879 { EXT_LONG_STRINGS, 1, 0 },
880 { EXT_NCASES, 8, 2 },
881 { EXT_FILE_ATTRS, 1, 0 },
882 { EXT_VAR_ATTRS, 1, 0 },
883 { EXT_MRSETS2, 1, 0 },
884 { EXT_ENCODING, 1, 0 },
885 { EXT_LONG_LABELS, 1, 0 },
887 /* Ignored record types. */
888 { EXT_VAR_SETS, 0, 0 },
890 { EXT_DATA_ENTRY, 0, 0 },
891 { EXT_DATAVIEW, 0, 0 },
894 const struct extension_record_type *type;
895 struct sfm_extension_record *record;
898 record = pool_malloc (r->pool, sizeof *record);
899 read_extension_record_header (r, subtype, record);
900 n_bytes = record->count * record->size;
902 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
903 if (subtype == type->subtype)
905 if (type->size > 0 && record->size != type->size)
906 sys_warn (r, record->pos,
907 _("Record type 7, subtype %d has bad size %zu "
908 "(expected %d)."), subtype, record->size, type->size);
909 else if (type->count > 0 && record->count != type->count)
910 sys_warn (r, record->pos,
911 _("Record type 7, subtype %d has bad count %zu "
912 "(expected %d)."), subtype, record->count, type->count);
913 else if (type->count == 0 && type->size == 0)
915 /* Ignore this record. */
919 char *data = pool_malloc (r->pool, n_bytes + 1);
920 data[n_bytes] = '\0';
923 read_bytes (r, record->data, n_bytes);
930 sys_warn (r, record->pos,
931 _("Unrecognized record type 7, subtype %d. Please send a "
932 "copy of this file, and the syntax which created it to %s."),
933 subtype, PACKAGE_BUGREPORT);
936 skip_bytes (r, n_bytes);
941 skip_extension_record (struct sfm_reader *r, int subtype)
943 struct sfm_extension_record record;
945 read_extension_record_header (r, subtype, &record);
946 skip_bytes (r, record.count * record.size);
950 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
951 struct sfm_read_info *info, struct dictionary *dict)
953 const char *dict_encoding = dict_get_encoding (dict);
954 struct substring product;
955 struct substring label;
957 /* Convert file label to UTF-8 and put it into DICT. */
958 label = recode_substring_pool ("UTF-8", dict_encoding,
959 ss_cstr (header->file_label), r->pool);
960 ss_trim (&label, ss_cstr (" "));
961 label.string[label.length] = '\0';
962 dict_set_label (dict, label.string);
964 /* Put creation date and time in UTF-8 into INFO. */
965 info->creation_date = recode_string ("UTF-8", dict_encoding,
966 header->creation_date, -1);
967 info->creation_time = recode_string ("UTF-8", dict_encoding,
968 header->creation_time, -1);
970 /* Put product name into INFO, dropping eye-catcher string if present. */
971 product = recode_substring_pool ("UTF-8", dict_encoding,
972 ss_cstr (header->eye_catcher), r->pool);
973 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
974 ss_trim (&product, ss_cstr (" "));
975 info->product = ss_xstrdup (product);
978 /* Reads a variable (type 2) record from R and adds the
979 corresponding variable to DICT.
980 Also skips past additional variable records for long string
983 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
984 struct sfm_var_record *var_recs, size_t n_var_recs)
986 const char *dict_encoding = dict_get_encoding (dict);
987 struct sfm_var_record *rec;
990 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
992 struct variable *var;
997 name = recode_string_pool ("UTF-8", dict_encoding,
998 rec->name, 8, r->pool);
999 name[strcspn (name, " ")] = '\0';
1001 if (!dict_id_is_valid (dict, name, false)
1002 || name[0] == '$' || name[0] == '#')
1003 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1005 if (rec->width < 0 || rec->width > 255)
1006 sys_error (r, rec->pos,
1007 _("Bad width %d for variable %s."), rec->width, name);
1009 var = rec->var = dict_create_var (dict, name, rec->width);
1011 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1013 /* Set the short name the same as the long name. */
1014 var_set_short_name (var, 0, name);
1016 /* Get variable label, if any. */
1021 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1022 rec->label, -1, r->pool);
1023 var_set_label (var, utf8_label, false);
1026 /* Set missing values. */
1027 if (rec->missing_value_code != 0)
1029 int width = var_get_width (var);
1030 struct missing_values mv;
1032 mv_init_pool (r->pool, &mv, width);
1033 if (var_is_numeric (var))
1035 bool has_range = rec->missing_value_code < 0;
1036 int n_discrete = (has_range
1037 ? rec->missing_value_code == -3
1038 : rec->missing_value_code);
1043 double low = parse_float (r, rec->missing, 0);
1044 double high = parse_float (r, rec->missing, 8);
1046 /* Deal with SPSS 21 change in representation. */
1050 mv_add_range (&mv, low, high);
1054 for (i = 0; i < n_discrete; i++)
1056 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1064 value_init_pool (r->pool, &value, width);
1065 value_set_missing (&value, width);
1066 for (i = 0; i < rec->missing_value_code; i++)
1068 uint8_t *s = value_str_rw (&value, width);
1069 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1070 mv_add_str (&mv, s);
1073 var_set_missing_values (var, &mv);
1077 parse_format_spec (r, rec->pos + 12, rec->print_format,
1078 PRINT_FORMAT, var, &n_warnings);
1079 parse_format_spec (r, rec->pos + 16, rec->write_format,
1080 WRITE_FORMAT, var, &n_warnings);
1082 /* Account for values.
1083 Skip long string continuation records, if any. */
1084 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1085 for (i = 1; i < n_values; i++)
1086 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1087 sys_error (r, rec->pos, _("Missing string continuation record."));
1092 /* Translates the format spec from sysfile format to internal
1095 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1096 enum which_format which, struct variable *v,
1099 const int max_warnings = 8;
1100 uint8_t raw_type = format >> 16;
1101 uint8_t w = format >> 8;
1110 ok = (fmt_from_io (raw_type, &f.type)
1111 && fmt_check_output (&f)
1112 && fmt_check_width_compat (&f, var_get_width (v)));
1117 if (which == PRINT_FORMAT)
1118 var_set_print_format (v, &f);
1120 var_set_write_format (v, &f);
1122 else if (format == 0)
1124 /* Actually observed in the wild. No point in warning about it. */
1126 else if (++*n_warnings <= max_warnings)
1128 if (which == PRINT_FORMAT)
1129 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1131 var_get_name (v), var_get_width (v), format);
1133 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1135 var_get_name (v), var_get_width (v), format);
1137 if (*n_warnings == max_warnings)
1138 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1143 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1147 for (p = record->documents;
1148 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1149 p += DOC_LINE_LENGTH)
1151 struct substring line;
1153 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1154 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1155 ss_rtrim (&line, ss_cstr (" "));
1156 line.string[line.length] = '\0';
1158 dict_add_document_line (dict, line.string, false);
1164 /* Parses record type 7, subtype 3. */
1166 parse_machine_integer_info (struct sfm_reader *r,
1167 const struct sfm_extension_record *record,
1168 struct sfm_read_info *info)
1170 int float_representation, expected_float_format;
1171 int integer_representation, expected_integer_format;
1173 /* Save version info. */
1174 info->version_major = parse_int (r, record->data, 0);
1175 info->version_minor = parse_int (r, record->data, 4);
1176 info->version_revision = parse_int (r, record->data, 8);
1178 /* Check floating point format. */
1179 float_representation = parse_int (r, record->data, 16);
1180 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1181 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1182 expected_float_format = 1;
1183 else if (r->float_format == FLOAT_Z_LONG)
1184 expected_float_format = 2;
1185 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1186 expected_float_format = 3;
1189 if (float_representation != expected_float_format)
1190 sys_error (r, record->pos, _("Floating-point representation indicated by "
1191 "system file (%d) differs from expected (%d)."),
1192 float_representation, expected_float_format);
1194 /* Check integer format. */
1195 integer_representation = parse_int (r, record->data, 24);
1196 if (r->integer_format == INTEGER_MSB_FIRST)
1197 expected_integer_format = 1;
1198 else if (r->integer_format == INTEGER_LSB_FIRST)
1199 expected_integer_format = 2;
1202 if (integer_representation != expected_integer_format)
1203 sys_warn (r, record->pos,
1204 _("Integer format indicated by system file (%d) "
1205 "differs from expected (%d)."),
1206 integer_representation, expected_integer_format);
1211 choose_encoding (struct sfm_reader *r,
1212 const struct sfm_header_record *header,
1213 const struct sfm_extension_record *ext_integer,
1214 const struct sfm_extension_record *ext_encoding)
1216 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1219 return ext_encoding->data;
1221 /* But EXT_INTEGER is better than nothing as a fallback. */
1224 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1225 const char *encoding;
1234 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1235 respectively. However, there are known to be many files in the wild
1236 with character code 2, yet have data which are clearly not ASCII.
1237 Therefore we ignore these values. */
1244 encoding = sys_get_encoding_from_codepage (codepage);
1245 if (encoding != NULL)
1251 /* If the file magic number is EBCDIC then its character data is too. */
1252 if (!strcmp (header->magic, EBCDIC_MAGIC))
1255 return locale_charset ();
1258 /* Parses record type 7, subtype 4. */
1260 parse_machine_float_info (struct sfm_reader *r,
1261 const struct sfm_extension_record *record)
1263 double sysmis = parse_float (r, record->data, 0);
1264 double highest = parse_float (r, record->data, 8);
1265 double lowest = parse_float (r, record->data, 16);
1267 if (sysmis != SYSMIS)
1268 sys_warn (r, record->pos,
1269 _("File specifies unexpected value %g (%a) as %s, "
1270 "instead of %g (%a)."),
1271 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1273 if (highest != HIGHEST)
1274 sys_warn (r, record->pos,
1275 _("File specifies unexpected value %g (%a) as %s, "
1276 "instead of %g (%a)."),
1277 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1279 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1280 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1281 appears in a context (missing values) where SYSMIS cannot. */
1282 if (lowest != LOWEST && lowest != SYSMIS)
1283 sys_warn (r, record->pos,
1284 _("File specifies unexpected value %g (%a) as %s, "
1285 "instead of %g (%a) or %g (%a)."),
1286 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1289 /* Parses record type 7, subtype 7 or 19. */
1291 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1292 struct dictionary *dict)
1294 struct text_record *text;
1295 struct mrset *mrset;
1297 text = open_text_record (r, record, false);
1300 const char *counted = NULL;
1303 struct stringi_set var_names;
1304 size_t allocated_vars;
1308 mrset = xzalloc (sizeof *mrset);
1310 name = text_get_token (text, ss_cstr ("="), NULL);
1313 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1315 if (mrset->name[0] != '$')
1317 sys_warn (r, record->pos,
1318 _("`%s' does not begin with `$' at offset %zu "
1319 "in MRSETS record."), mrset->name, text_pos (text));
1323 if (text_match (text, 'C'))
1325 mrset->type = MRSET_MC;
1326 if (!text_match (text, ' '))
1328 sys_warn (r, record->pos,
1329 _("Missing space following `%c' at offset %zu "
1330 "in MRSETS record."), 'C', text_pos (text));
1334 else if (text_match (text, 'D'))
1336 mrset->type = MRSET_MD;
1337 mrset->cat_source = MRSET_VARLABELS;
1339 else if (text_match (text, 'E'))
1343 mrset->type = MRSET_MD;
1344 mrset->cat_source = MRSET_COUNTEDVALUES;
1345 if (!text_match (text, ' '))
1347 sys_warn (r, record->pos,
1348 _("Missing space following `%c' at offset %zu "
1349 "in MRSETS record."), 'E', text_pos (text));
1353 number = text_get_token (text, ss_cstr (" "), NULL);
1354 if (!strcmp (number, "11"))
1355 mrset->label_from_var_label = true;
1356 else if (strcmp (number, "1"))
1357 sys_warn (r, record->pos,
1358 _("Unexpected label source value `%s' following `E' "
1359 "at offset %zu in MRSETS record."),
1360 number, text_pos (text));
1364 sys_warn (r, record->pos,
1365 _("Missing `C', `D', or `E' at offset %zu "
1366 "in MRSETS record."),
1371 if (mrset->type == MRSET_MD)
1373 counted = text_parse_counted_string (r, text);
1374 if (counted == NULL)
1378 label = text_parse_counted_string (r, text);
1381 if (label[0] != '\0')
1382 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1384 stringi_set_init (&var_names);
1389 const char *raw_var_name;
1390 struct variable *var;
1393 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1394 if (raw_var_name == NULL)
1396 sys_warn (r, record->pos,
1397 _("Missing new-line parsing variable names "
1398 "at offset %zu in MRSETS record."),
1402 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1404 var = dict_lookup_var (dict, var_name);
1410 if (!stringi_set_insert (&var_names, var_name))
1412 sys_warn (r, record->pos,
1413 _("Duplicate variable name %s "
1414 "at offset %zu in MRSETS record."),
1415 var_name, text_pos (text));
1421 if (mrset->label == NULL && mrset->label_from_var_label
1422 && var_has_label (var))
1423 mrset->label = xstrdup (var_get_label (var));
1426 && var_get_type (var) != var_get_type (mrset->vars[0]))
1428 sys_warn (r, record->pos,
1429 _("MRSET %s contains both string and "
1430 "numeric variables."), name);
1433 width = MIN (width, var_get_width (var));
1435 if (mrset->n_vars >= allocated_vars)
1436 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1437 sizeof *mrset->vars);
1438 mrset->vars[mrset->n_vars++] = var;
1440 while (delimiter != '\n');
1442 if (mrset->n_vars < 2)
1444 sys_warn (r, record->pos,
1445 _("MRSET %s has only %zu variables."), mrset->name,
1447 mrset_destroy (mrset);
1448 stringi_set_destroy (&var_names);
1452 if (mrset->type == MRSET_MD)
1454 mrset->width = width;
1455 value_init (&mrset->counted, width);
1457 mrset->counted.f = c_strtod (counted, NULL);
1459 value_copy_str_rpad (&mrset->counted, width,
1460 (const uint8_t *) counted, ' ');
1463 dict_add_mrset (dict, mrset);
1465 stringi_set_destroy (&var_names);
1467 mrset_destroy (mrset);
1468 close_text_record (r, text);
1471 /* Read record type 7, subtype 11, which specifies how variables
1472 should be displayed in GUI environments. */
1474 parse_display_parameters (struct sfm_reader *r,
1475 const struct sfm_extension_record *record,
1476 struct dictionary *dict)
1478 bool includes_width;
1479 bool warned = false;
1484 n_vars = dict_get_var_cnt (dict);
1485 if (record->count == 3 * n_vars)
1486 includes_width = true;
1487 else if (record->count == 2 * n_vars)
1488 includes_width = false;
1491 sys_warn (r, record->pos,
1492 _("Extension 11 has bad count %zu (for %zu variables)."),
1493 record->count, n_vars);
1498 for (i = 0; i < n_vars; ++i)
1500 struct variable *v = dict_get_var (dict, i);
1501 int measure, width, align;
1503 measure = parse_int (r, record->data, ofs);
1508 width = parse_int (r, record->data, ofs);
1514 align = parse_int (r, record->data, ofs);
1517 /* SPSS 14 sometimes seems to set string variables' measure
1519 if (0 == measure && var_is_alpha (v))
1522 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1525 sys_warn (r, record->pos,
1526 _("Invalid variable display parameters for variable "
1527 "%zu (%s). Default parameters substituted."),
1528 i, var_get_name (v));
1533 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1534 : measure == 2 ? MEASURE_ORDINAL
1536 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1537 : align == 1 ? ALIGN_RIGHT
1540 /* Older versions (SPSS 9.0) sometimes set the display
1541 width to zero. This causes confusion in the GUI, so
1542 only set the width if it is nonzero. */
1544 var_set_display_width (v, width);
1549 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1550 const char *new_name)
1552 size_t n_short_names;
1556 /* Renaming a variable may clear its short names, but we
1557 want to retain them, so we save them and re-set them
1559 n_short_names = var_get_short_name_cnt (var);
1560 short_names = xnmalloc (n_short_names, sizeof *short_names);
1561 for (i = 0; i < n_short_names; i++)
1563 const char *s = var_get_short_name (var, i);
1564 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1567 /* Set long name. */
1568 dict_rename_var (dict, var, new_name);
1570 /* Restore short names. */
1571 for (i = 0; i < n_short_names; i++)
1573 var_set_short_name (var, i, short_names[i]);
1574 free (short_names[i]);
1579 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1580 to each short name. Modifies variable names in DICT accordingly. */
1582 parse_long_var_name_map (struct sfm_reader *r,
1583 const struct sfm_extension_record *record,
1584 struct dictionary *dict)
1586 struct text_record *text;
1587 struct variable *var;
1592 /* There are no long variable names. Use the short variable names,
1593 converted to lowercase, as the long variable names. */
1596 for (i = 0; i < dict_get_var_cnt (dict); i++)
1598 struct variable *var = dict_get_var (dict, i);
1601 new_name = utf8_to_lower (var_get_name (var));
1602 rename_var_and_save_short_names (dict, var, new_name);
1609 /* Rename each of the variables, one by one. (In a correctly constructed
1610 system file, this cannot create any intermediate duplicate variable names,
1611 because all of the new variable names are longer than any of the old
1612 variable names and thus there cannot be any overlaps.) */
1613 text = open_text_record (r, record, true);
1614 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1616 /* Validate long name. */
1617 if (!dict_id_is_valid (dict, long_name, false))
1619 sys_warn (r, record->pos,
1620 _("Long variable mapping from %s to invalid "
1621 "variable name `%s'."),
1622 var_get_name (var), long_name);
1626 /* Identify any duplicates. */
1627 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1628 && dict_lookup_var (dict, long_name) != NULL)
1630 sys_warn (r, record->pos,
1631 _("Duplicate long variable name `%s'."), long_name);
1635 rename_var_and_save_short_names (dict, var, long_name);
1637 close_text_record (r, text);
1640 /* Reads record type 7, subtype 14, which gives the real length
1641 of each very long string. Rearranges DICT accordingly. */
1643 parse_long_string_map (struct sfm_reader *r,
1644 const struct sfm_extension_record *record,
1645 struct dictionary *dict)
1647 struct text_record *text;
1648 struct variable *var;
1651 text = open_text_record (r, record, true);
1652 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1654 size_t idx = var_get_dict_index (var);
1660 length = strtol (length_s, NULL, 10);
1661 if (length < 1 || length > MAX_STRING)
1663 sys_warn (r, record->pos,
1664 _("%s listed as string of invalid length %s "
1665 "in very long string record."),
1666 var_get_name (var), length_s);
1670 /* Check segments. */
1671 segment_cnt = sfm_width_to_segments (length);
1672 if (segment_cnt == 1)
1674 sys_warn (r, record->pos,
1675 _("%s listed in very long string record with width %s, "
1676 "which requires only one segment."),
1677 var_get_name (var), length_s);
1680 if (idx + segment_cnt > dict_get_var_cnt (dict))
1681 sys_error (r, record->pos,
1682 _("Very long string %s overflows dictionary."),
1683 var_get_name (var));
1685 /* Get the short names from the segments and check their
1687 for (i = 0; i < segment_cnt; i++)
1689 struct variable *seg = dict_get_var (dict, idx + i);
1690 int alloc_width = sfm_segment_alloc_width (length, i);
1691 int width = var_get_width (seg);
1694 var_set_short_name (var, i, var_get_short_name (seg, 0));
1695 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1696 sys_error (r, record->pos,
1697 _("Very long string with width %ld has segment %d "
1698 "of width %d (expected %d)."),
1699 length, i, width, alloc_width);
1701 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1702 var_set_width (var, length);
1704 close_text_record (r, text);
1705 dict_compact_values (dict);
1709 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1710 const struct sfm_var_record *var_recs, size_t n_var_recs,
1711 const struct sfm_value_label_record *record)
1713 struct variable **vars;
1717 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1718 for (i = 0; i < record->n_labels; i++)
1719 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1720 record->labels[i].label, -1,
1723 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1724 for (i = 0; i < record->n_vars; i++)
1725 vars[i] = lookup_var_by_index (r, record->pos,
1726 var_recs, n_var_recs, record->vars[i]);
1728 for (i = 1; i < record->n_vars; i++)
1729 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1730 sys_error (r, record->pos,
1731 _("Variables associated with value label are not all of "
1732 "identical type. Variable %s is %s, but variable "
1734 var_get_name (vars[0]),
1735 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1736 var_get_name (vars[i]),
1737 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1739 for (i = 0; i < record->n_vars; i++)
1741 struct variable *var = vars[i];
1745 width = var_get_width (var);
1747 sys_error (r, record->pos,
1748 _("Value labels may not be added to long string "
1749 "variables (e.g. %s) using records types 3 and 4."),
1750 var_get_name (var));
1752 for (j = 0; j < record->n_labels; j++)
1754 struct sfm_value_label *label = &record->labels[j];
1757 value_init (&value, width);
1759 value.f = parse_float (r, label->value, 0);
1761 memcpy (value_str_rw (&value, width), label->value, width);
1763 if (!var_add_value_label (var, &value, utf8_labels[j]))
1765 if (var_is_numeric (var))
1766 sys_warn (r, record->pos,
1767 _("Duplicate value label for %g on %s."),
1768 value.f, var_get_name (var));
1770 sys_warn (r, record->pos,
1771 _("Duplicate value label for `%.*s' on %s."),
1772 width, value_str (&value, width),
1773 var_get_name (var));
1776 value_destroy (&value, width);
1780 pool_free (r->pool, vars);
1781 for (i = 0; i < record->n_labels; i++)
1782 pool_free (r->pool, utf8_labels[i]);
1783 pool_free (r->pool, utf8_labels);
1786 static struct variable *
1787 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1788 const struct sfm_var_record *var_recs, size_t n_var_recs,
1791 const struct sfm_var_record *rec;
1793 if (idx < 1 || idx > n_var_recs)
1795 sys_error (r, offset,
1796 _("Variable index %d not in valid range 1...%zu."),
1801 rec = &var_recs[idx - 1];
1802 if (rec->var == NULL)
1804 sys_error (r, offset,
1805 _("Variable index %d refers to long string continuation."),
1813 /* Parses a set of custom attributes from TEXT into ATTRS.
1814 ATTRS may be a null pointer, in which case the attributes are
1815 read but discarded. */
1817 parse_attributes (struct sfm_reader *r, struct text_record *text,
1818 struct attrset *attrs)
1822 struct attribute *attr;
1826 /* Parse the key. */
1827 key = text_get_token (text, ss_cstr ("("), NULL);
1831 attr = attribute_create (key);
1832 for (index = 1; ; index++)
1834 /* Parse the value. */
1838 value = text_get_token (text, ss_cstr ("\n"), NULL);
1841 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1846 length = strlen (value);
1847 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1849 value[length - 1] = '\0';
1850 attribute_add_value (attr, value + 1);
1855 _("Attribute value %s[%d] is not quoted: %s."),
1857 attribute_add_value (attr, value);
1860 /* Was this the last value for this attribute? */
1861 if (text_match (text, ')'))
1865 attrset_add (attrs, attr);
1867 attribute_destroy (attr);
1869 while (!text_match (text, '/'));
1872 /* Reads record type 7, subtype 17, which lists custom
1873 attributes on the data file. */
1875 parse_data_file_attributes (struct sfm_reader *r,
1876 const struct sfm_extension_record *record,
1877 struct dictionary *dict)
1879 struct text_record *text = open_text_record (r, record, true);
1880 parse_attributes (r, text, dict_get_attributes (dict));
1881 close_text_record (r, text);
1884 /* Parses record type 7, subtype 18, which lists custom
1885 attributes on individual variables. */
1887 parse_variable_attributes (struct sfm_reader *r,
1888 const struct sfm_extension_record *record,
1889 struct dictionary *dict)
1891 struct text_record *text;
1892 struct variable *var;
1894 text = open_text_record (r, record, true);
1895 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1896 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1897 close_text_record (r, text);
1901 check_overflow (struct sfm_reader *r,
1902 const struct sfm_extension_record *record,
1903 size_t ofs, size_t length)
1905 size_t end = record->size * record->count;
1906 if (length >= end || ofs + length > end)
1907 sys_error (r, record->pos + end,
1908 _("Long string value label record ends unexpectedly."));
1912 parse_long_string_value_labels (struct sfm_reader *r,
1913 const struct sfm_extension_record *record,
1914 struct dictionary *dict)
1916 const char *dict_encoding = dict_get_encoding (dict);
1917 size_t end = record->size * record->count;
1924 struct variable *var;
1929 /* Parse variable name length. */
1930 check_overflow (r, record, ofs, 4);
1931 var_name_len = parse_int (r, record->data, ofs);
1934 /* Parse variable name, width, and number of labels. */
1935 check_overflow (r, record, ofs, var_name_len + 8);
1936 var_name = recode_string_pool ("UTF-8", dict_encoding,
1937 (const char *) record->data + ofs,
1938 var_name_len, r->pool);
1939 width = parse_int (r, record->data, ofs + var_name_len);
1940 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1941 ofs += var_name_len + 8;
1943 /* Look up 'var' and validate. */
1944 var = dict_lookup_var (dict, var_name);
1946 sys_warn (r, record->pos + ofs,
1947 _("Ignoring long string value record for "
1948 "unknown variable %s."), var_name);
1949 else if (var_is_numeric (var))
1951 sys_warn (r, record->pos + ofs,
1952 _("Ignoring long string value record for "
1953 "numeric variable %s."), var_name);
1956 else if (width != var_get_width (var))
1958 sys_warn (r, record->pos + ofs,
1959 _("Ignoring long string value record for variable %s "
1960 "because the record's width (%d) does not match the "
1961 "variable's width (%d)."),
1962 var_name, width, var_get_width (var));
1967 value_init_pool (r->pool, &value, width);
1968 for (i = 0; i < n_labels; i++)
1970 size_t value_length, label_length;
1971 bool skip = var == NULL;
1973 /* Parse value length. */
1974 check_overflow (r, record, ofs, 4);
1975 value_length = parse_int (r, record->data, ofs);
1979 check_overflow (r, record, ofs, value_length);
1982 if (value_length == width)
1983 memcpy (value_str_rw (&value, width),
1984 (const uint8_t *) record->data + ofs, width);
1987 sys_warn (r, record->pos + ofs,
1988 _("Ignoring long string value %zu for variable "
1989 "%s, with width %d, that has bad value "
1991 i, var_get_name (var), width, value_length);
1995 ofs += value_length;
1997 /* Parse label length. */
1998 check_overflow (r, record, ofs, 4);
1999 label_length = parse_int (r, record->data, ofs);
2003 check_overflow (r, record, ofs, label_length);
2008 label = recode_string_pool ("UTF-8", dict_encoding,
2009 (const char *) record->data + ofs,
2010 label_length, r->pool);
2011 if (!var_add_value_label (var, &value, label))
2012 sys_warn (r, record->pos + ofs,
2013 _("Duplicate value label for `%.*s' on %s."),
2014 width, value_str (&value, width),
2015 var_get_name (var));
2016 pool_free (r->pool, label);
2018 ofs += label_length;
2025 static void partial_record (struct sfm_reader *r)
2028 static void read_error (struct casereader *, const struct sfm_reader *);
2030 static bool read_case_number (struct sfm_reader *, double *);
2031 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2032 static int read_opcode (struct sfm_reader *);
2033 static bool read_compressed_number (struct sfm_reader *, double *);
2034 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2035 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2036 static bool skip_whole_strings (struct sfm_reader *, size_t);
2038 /* Reads and returns one case from READER's file. Returns a null
2039 pointer if not successful. */
2040 static struct ccase *
2041 sys_file_casereader_read (struct casereader *reader, void *r_)
2043 struct sfm_reader *r = r_;
2044 struct ccase *volatile c;
2050 c = case_create (r->proto);
2051 if (setjmp (r->bail_out))
2053 casereader_force_error (reader);
2058 for (i = 0; i < r->sfm_var_cnt; i++)
2060 struct sfm_var *sv = &r->sfm_vars[i];
2061 union value *v = case_data_rw_idx (c, sv->case_index);
2063 if (sv->var_width == 0)
2065 if (!read_case_number (r, &v->f))
2070 uint8_t *s = value_str_rw (v, sv->var_width);
2071 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2073 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2082 if (r->case_cnt != -1)
2083 read_error (reader, r);
2088 /* Issues an error that R ends in a partial record. */
2090 partial_record (struct sfm_reader *r)
2092 sys_error (r, r->pos, _("File ends in partial case."));
2095 /* Issues an error that an unspecified error occurred SFM, and
2098 read_error (struct casereader *r, const struct sfm_reader *sfm)
2100 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2101 casereader_force_error (r);
2104 /* Reads a number from R and stores its value in *D.
2105 If R is compressed, reads a compressed number;
2106 otherwise, reads a number in the regular way.
2107 Returns true if successful, false if end of file is
2108 reached immediately. */
2110 read_case_number (struct sfm_reader *r, double *d)
2115 if (!try_read_bytes (r, number, sizeof number))
2117 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2121 return read_compressed_number (r, d);
2124 /* Reads LENGTH string bytes from R into S.
2125 Always reads a multiple of 8 bytes; if LENGTH is not a
2126 multiple of 8, then extra bytes are read and discarded without
2128 Reads compressed strings if S is compressed.
2129 Returns true if successful, false if end of file is
2130 reached immediately. */
2132 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2134 size_t whole = ROUND_DOWN (length, 8);
2135 size_t partial = length % 8;
2139 if (!read_whole_strings (r, s, whole))
2146 if (!read_whole_strings (r, bounce, sizeof bounce))
2152 memcpy (s + whole, bounce, partial);
2158 /* Reads and returns the next compression opcode from R. */
2160 read_opcode (struct sfm_reader *r)
2162 assert (r->compressed);
2166 if (r->opcode_idx >= sizeof r->opcodes)
2168 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2172 opcode = r->opcodes[r->opcode_idx++];
2179 /* Reads a compressed number from R and stores its value in D.
2180 Returns true if successful, false if end of file is
2181 reached immediately. */
2183 read_compressed_number (struct sfm_reader *r, double *d)
2185 int opcode = read_opcode (r);
2193 *d = read_float (r);
2197 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2198 if (!r->corruption_warning)
2200 r->corruption_warning = true;
2201 sys_warn (r, r->pos,
2202 _("Possible compressed data corruption: "
2203 "compressed spaces appear in numeric field."));
2212 *d = opcode - r->bias;
2219 /* Reads a compressed 8-byte string segment from R and stores it
2221 Returns true if successful, false if end of file is
2222 reached immediately. */
2224 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2226 int opcode = read_opcode (r);
2234 read_bytes (r, dst, 8);
2238 memset (dst, ' ', 8);
2243 double value = opcode - r->bias;
2244 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2247 /* This has actually been seen "in the wild". The submitter of the
2248 file that showed that the contents decoded as spaces, but they
2249 were at the end of the field so it's possible that the null
2250 bytes just acted as null terminators. */
2252 else if (!r->corruption_warning)
2254 r->corruption_warning = true;
2255 sys_warn (r, r->pos,
2256 _("Possible compressed data corruption: "
2257 "string contains compressed integer (opcode %d)."),
2267 /* Reads LENGTH string bytes from R into S.
2268 LENGTH must be a multiple of 8.
2269 Reads compressed strings if S is compressed.
2270 Returns true if successful, false if end of file is
2271 reached immediately. */
2273 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2275 assert (length % 8 == 0);
2277 return try_read_bytes (r, s, length);
2281 for (ofs = 0; ofs < length; ofs += 8)
2282 if (!read_compressed_string (r, s + ofs))
2292 /* Skips LENGTH string bytes from R.
2293 LENGTH must be a multiple of 8.
2294 (LENGTH is also limited to 1024, but that's only because the
2295 current caller never needs more than that many bytes.)
2296 Returns true if successful, false if end of file is
2297 reached immediately. */
2299 skip_whole_strings (struct sfm_reader *r, size_t length)
2301 uint8_t buffer[1024];
2302 assert (length < sizeof buffer);
2303 return read_whole_strings (r, buffer, length);
2306 /* Helpers for reading records that contain structured text
2309 /* Maximum number of warnings to issue for a single text
2311 #define MAX_TEXT_WARNINGS 5
2316 struct substring buffer; /* Record contents. */
2317 off_t start; /* Starting offset in file. */
2318 size_t pos; /* Current position in buffer. */
2319 int n_warnings; /* Number of warnings issued or suppressed. */
2320 bool recoded; /* Recoded into UTF-8? */
2323 static struct text_record *
2324 open_text_record (struct sfm_reader *r,
2325 const struct sfm_extension_record *record,
2326 bool recode_to_utf8)
2328 struct text_record *text;
2329 struct substring raw;
2331 text = pool_alloc (r->pool, sizeof *text);
2332 raw = ss_buffer (record->data, record->size * record->count);
2333 text->start = record->pos;
2334 text->buffer = (recode_to_utf8
2335 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2338 text->n_warnings = 0;
2339 text->recoded = recode_to_utf8;
2344 /* Closes TEXT, frees its storage, and issues a final warning
2345 about suppressed warnings if necesary. */
2347 close_text_record (struct sfm_reader *r, struct text_record *text)
2349 if (text->n_warnings > MAX_TEXT_WARNINGS)
2350 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2351 text->n_warnings - MAX_TEXT_WARNINGS);
2353 pool_free (r->pool, ss_data (text->buffer));
2356 /* Reads a variable=value pair from TEXT.
2357 Looks up the variable in DICT and stores it into *VAR.
2358 Stores a null-terminated value into *VALUE. */
2360 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2361 struct text_record *text,
2362 struct variable **var, char **value)
2366 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2369 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2373 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2374 ss_buffer ("\t\0", 2));
2382 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2383 struct text_record *text, struct substring delimiters,
2384 struct variable **var)
2388 name = text_get_token (text, delimiters, NULL);
2392 *var = dict_lookup_var (dict, name);
2396 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2403 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2404 struct text_record *text, struct substring delimiters,
2405 struct variable **var)
2407 char *short_name = text_get_token (text, delimiters, NULL);
2408 if (short_name == NULL)
2411 *var = dict_lookup_var (dict, short_name);
2413 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2418 /* Displays a warning for the current file position, limiting the
2419 number to MAX_TEXT_WARNINGS for TEXT. */
2421 text_warn (struct sfm_reader *r, struct text_record *text,
2422 const char *format, ...)
2424 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2428 va_start (args, format);
2429 sys_msg (r, text->start + text->pos, MW, format, args);
2435 text_get_token (struct text_record *text, struct substring delimiters,
2438 struct substring token;
2441 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2444 end = &ss_data (token)[ss_length (token)];
2445 if (delimiter != NULL)
2448 return ss_data (token);
2451 /* Reads a integer value expressed in decimal, then a space, then a string that
2452 consists of exactly as many bytes as specified by the integer, then a space,
2453 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2454 buffer (so the caller should not free the string). */
2456 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2464 while (text->pos < text->buffer.length)
2466 int c = text->buffer.string[text->pos];
2467 if (c < '0' || c > '9')
2469 n = (n * 10) + (c - '0');
2472 if (text->pos >= text->buffer.length || start == text->pos)
2474 sys_warn (r, text->start,
2475 _("Expecting digit at offset %zu in MRSETS record."),
2480 if (!text_match (text, ' '))
2482 sys_warn (r, text->start,
2483 _("Expecting space at offset %zu in MRSETS record."),
2488 if (text->pos + n > text->buffer.length)
2490 sys_warn (r, text->start,
2491 _("%zu-byte string starting at offset %zu "
2492 "exceeds record length %zu."),
2493 n, text->pos, text->buffer.length);
2497 s = &text->buffer.string[text->pos];
2500 sys_warn (r, text->start,
2501 _("Expecting space at offset %zu following %zu-byte string."),
2511 text_match (struct text_record *text, char c)
2513 if (text->buffer.string[text->pos] == c)
2522 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2523 inside the TEXT's string. */
2525 text_pos (const struct text_record *text)
2532 /* Displays a corruption message. */
2534 sys_msg (struct sfm_reader *r, off_t offset,
2535 int class, const char *format, va_list args)
2540 ds_init_empty (&text);
2542 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2543 fh_get_file_name (r->fh), (long long int) offset);
2545 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2546 ds_put_vformat (&text, format, args);
2548 m.category = msg_class_to_category (class);
2549 m.severity = msg_class_to_severity (class);
2555 m.text = ds_cstr (&text);
2560 /* Displays a warning for offset OFFSET in the file. */
2562 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2566 va_start (args, format);
2567 sys_msg (r, offset, MW, format, args);
2571 /* Displays an error for the current file position,
2572 marks it as in an error state,
2573 and aborts reading it using longjmp. */
2575 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2579 va_start (args, format);
2580 sys_msg (r, offset, ME, format, args);
2584 longjmp (r->bail_out, 1);
2587 /* Reads BYTE_CNT bytes into BUF.
2588 Returns true if exactly BYTE_CNT bytes are successfully read.
2589 Aborts if an I/O error or a partial read occurs.
2590 If EOF_IS_OK, then an immediate end-of-file causes false to be
2591 returned; otherwise, immediate end-of-file causes an abort
2594 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2595 void *buf, size_t byte_cnt)
2597 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2598 r->pos += bytes_read;
2599 if (bytes_read == byte_cnt)
2601 else if (ferror (r->file))
2602 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2603 else if (!eof_is_ok || bytes_read != 0)
2604 sys_error (r, r->pos, _("Unexpected end of file."));
2609 /* Reads BYTE_CNT into BUF.
2610 Aborts upon I/O error or if end-of-file is encountered. */
2612 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2614 read_bytes_internal (r, false, buf, byte_cnt);
2617 /* Reads BYTE_CNT bytes into BUF.
2618 Returns true if exactly BYTE_CNT bytes are successfully read.
2619 Returns false if an immediate end-of-file is encountered.
2620 Aborts if an I/O error or a partial read occurs. */
2622 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2624 return read_bytes_internal (r, true, buf, byte_cnt);
2627 /* Reads a 32-bit signed integer from R and returns its value in
2630 read_int (struct sfm_reader *r)
2633 read_bytes (r, integer, sizeof integer);
2634 return integer_get (r->integer_format, integer, sizeof integer);
2637 /* Reads a 64-bit floating-point number from R and returns its
2638 value in host format. */
2640 read_float (struct sfm_reader *r)
2643 read_bytes (r, number, sizeof number);
2644 return float_get_double (r->float_format, number);
2648 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2650 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2654 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2656 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2659 /* Reads exactly SIZE - 1 bytes into BUFFER
2660 and stores a null byte into BUFFER[SIZE - 1]. */
2662 read_string (struct sfm_reader *r, char *buffer, size_t size)
2665 read_bytes (r, buffer, size - 1);
2666 buffer[size - 1] = '\0';
2669 /* Skips BYTES bytes forward in R. */
2671 skip_bytes (struct sfm_reader *r, size_t bytes)
2676 size_t chunk = MIN (sizeof buffer, bytes);
2677 read_bytes (r, buffer, chunk);
2682 static const struct casereader_class sys_file_casereader_class =
2684 sys_file_casereader_read,
2685 sys_file_casereader_destroy,