1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-strtod.h"
54 #include "gl/c-ctype.h"
55 #include "gl/inttostr.h"
56 #include "gl/localcharset.h"
57 #include "gl/minmax.h"
58 #include "gl/unlocked-io.h"
59 #include "gl/xalloc.h"
63 #define _(msgid) gettext (msgid)
64 #define N_(msgid) (msgid)
68 /* subtypes 0-2 unknown */
69 EXT_INTEGER = 3, /* Machine integer info. */
70 EXT_FLOAT = 4, /* Machine floating-point info. */
71 EXT_VAR_SETS = 5, /* Variable sets. */
72 EXT_DATE = 6, /* DATE. */
73 EXT_MRSETS = 7, /* Multiple response sets. */
74 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
75 /* subtypes 9-10 unknown */
76 EXT_DISPLAY = 11, /* Variable display parameters. */
77 /* subtype 12 unknown */
78 EXT_LONG_NAMES = 13, /* Long variable names. */
79 EXT_LONG_STRINGS = 14, /* Long strings. */
80 /* subtype 15 unknown */
81 EXT_NCASES = 16, /* Extended number of cases. */
82 EXT_FILE_ATTRS = 17, /* Data file attributes. */
83 EXT_VAR_ATTRS = 18, /* Variable attributes. */
84 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
85 EXT_ENCODING = 20, /* Character encoding. */
86 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
89 /* Fields from the top-level header record. */
90 struct sfm_header_record
92 char magic[5]; /* First 4 bytes of file, then null. */
93 int weight_idx; /* 0 if unweighted, otherwise a var index. */
94 int nominal_case_size; /* Number of var positions. */
96 /* These correspond to the members of struct sfm_file_info or a dictionary
97 but in the system file's encoding rather than ASCII. */
98 char creation_date[10]; /* "dd mmm yy". */
99 char creation_time[9]; /* "hh:mm:ss". */
100 char eye_catcher[61]; /* Eye-catcher string, then product name. */
101 char file_label[65]; /* File label. */
104 struct sfm_var_record
111 int missing_value_code;
114 struct variable *var;
117 struct sfm_value_label
123 struct sfm_value_label_record
126 struct sfm_value_label *labels;
133 struct sfm_document_record
140 struct sfm_extension_record
142 off_t pos; /* Starting offset in file. */
143 size_t size; /* Size of data elements. */
144 size_t count; /* Number of data elements. */
145 void *data; /* Contents. */
148 /* System file reader. */
151 /* Resource tracking. */
152 struct pool *pool; /* All system file state. */
153 jmp_buf bail_out; /* longjmp() target for error handling. */
156 struct file_handle *fh; /* File handle. */
157 struct fh_lock *lock; /* Mutual exclusion for file handle. */
158 FILE *file; /* File stream. */
159 off_t pos; /* Position in file. */
160 bool error; /* I/O or corruption error? */
161 struct caseproto *proto; /* Format of output cases. */
164 enum integer_format integer_format; /* On-disk integer format. */
165 enum float_format float_format; /* On-disk floating point format. */
166 struct sfm_var *sfm_vars; /* Variables. */
167 size_t sfm_var_cnt; /* Number of variables. */
168 casenumber case_cnt; /* Number of cases */
169 const char *encoding; /* String encoding. */
172 bool compressed; /* File is compressed? */
173 double bias; /* Compression bias, usually 100.0. */
174 uint8_t opcodes[8]; /* Current block of opcodes. */
175 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
176 bool corruption_warning; /* Warned about possible corruption? */
179 static const struct casereader_class sys_file_casereader_class;
181 static bool close_reader (struct sfm_reader *);
183 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
184 const struct sfm_var_record *,
187 static void sys_msg (struct sfm_reader *r, off_t, int class,
188 const char *format, va_list args)
189 PRINTF_FORMAT (4, 0);
190 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
191 PRINTF_FORMAT (3, 4);
192 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
196 static void read_bytes (struct sfm_reader *, void *, size_t);
197 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
198 static int read_int (struct sfm_reader *);
199 static double read_float (struct sfm_reader *);
200 static void read_string (struct sfm_reader *, char *, size_t);
201 static void skip_bytes (struct sfm_reader *, size_t);
203 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
204 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
206 static void read_variable_record (struct sfm_reader *,
207 struct sfm_var_record *);
208 static void read_value_label_record (struct sfm_reader *,
209 struct sfm_value_label_record *,
211 static struct sfm_document_record *read_document_record (struct sfm_reader *);
212 static struct sfm_extension_record *read_extension_record (
213 struct sfm_reader *, int subtype);
214 static void skip_extension_record (struct sfm_reader *, int subtype);
216 static const char *choose_encoding (
218 const struct sfm_header_record *,
219 const struct sfm_extension_record *ext_integer,
220 const struct sfm_extension_record *ext_encoding);
222 static struct text_record *open_text_record (
223 struct sfm_reader *, const struct sfm_extension_record *,
224 bool recode_to_utf8);
225 static void close_text_record (struct sfm_reader *,
226 struct text_record *);
227 static bool read_variable_to_value_pair (struct sfm_reader *,
229 struct text_record *,
230 struct variable **var, char **value);
231 static void text_warn (struct sfm_reader *r, struct text_record *text,
232 const char *format, ...)
233 PRINTF_FORMAT (3, 4);
234 static char *text_get_token (struct text_record *,
235 struct substring delimiters, char *delimiter);
236 static bool text_match (struct text_record *, char c);
237 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
238 struct text_record *,
239 struct substring delimiters,
241 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
242 struct text_record *,
243 struct substring delimiters,
245 static const char *text_parse_counted_string (struct sfm_reader *,
246 struct text_record *);
247 static size_t text_pos (const struct text_record *);
249 static bool close_reader (struct sfm_reader *r);
251 /* Dictionary reader. */
259 static void read_header (struct sfm_reader *, struct sfm_read_info *,
260 struct sfm_header_record *);
261 static void parse_header (struct sfm_reader *,
262 const struct sfm_header_record *,
263 struct sfm_read_info *, struct dictionary *);
264 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
265 struct sfm_var_record *, size_t n);
266 static void parse_format_spec (struct sfm_reader *, off_t pos,
267 unsigned int format, enum which_format,
268 struct variable *, int *format_warning_cnt);
269 static void parse_document (struct dictionary *, struct sfm_document_record *);
270 static void parse_display_parameters (struct sfm_reader *,
271 const struct sfm_extension_record *,
272 struct dictionary *);
273 static void parse_machine_integer_info (struct sfm_reader *,
274 const struct sfm_extension_record *,
275 struct sfm_read_info *);
276 static void parse_machine_float_info (struct sfm_reader *,
277 const struct sfm_extension_record *);
278 static void parse_mrsets (struct sfm_reader *,
279 const struct sfm_extension_record *,
280 struct dictionary *);
281 static void parse_long_var_name_map (struct sfm_reader *,
282 const struct sfm_extension_record *,
283 struct dictionary *);
284 static void parse_long_string_map (struct sfm_reader *,
285 const struct sfm_extension_record *,
286 struct dictionary *);
287 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
288 const struct sfm_var_record *,
290 const struct sfm_value_label_record *);
291 static void parse_data_file_attributes (struct sfm_reader *,
292 const struct sfm_extension_record *,
293 struct dictionary *);
294 static void parse_variable_attributes (struct sfm_reader *,
295 const struct sfm_extension_record *,
296 struct dictionary *);
297 static void parse_long_string_value_labels (struct sfm_reader *,
298 const struct sfm_extension_record *,
299 struct dictionary *);
301 /* Frees the strings inside INFO. */
303 sfm_read_info_destroy (struct sfm_read_info *info)
307 free (info->creation_date);
308 free (info->creation_time);
309 free (info->product);
313 /* Opens the system file designated by file handle FH for reading. Reads the
314 system file's dictionary into *DICT.
316 Ordinarily the reader attempts to automatically detect the character
317 encoding based on the file's contents. This isn't always possible,
318 especially for files written by old versions of SPSS or PSPP, so specifying
319 a nonnull ENCODING overrides the choice of character encoding.
321 If INFO is non-null, then it receives additional info about the system file,
322 which the caller must eventually free with sfm_read_info_destroy() when it
323 is no longer needed. */
325 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
326 struct dictionary **dictp, struct sfm_read_info *infop)
328 struct sfm_reader *volatile r = NULL;
329 struct sfm_read_info *volatile info;
331 struct sfm_header_record header;
333 struct sfm_var_record *vars;
334 size_t n_vars, allocated_vars;
336 struct sfm_value_label_record *labels;
337 size_t n_labels, allocated_labels;
339 struct sfm_document_record *document;
341 struct sfm_extension_record *extensions[32];
343 struct dictionary *volatile dict = NULL;
346 /* Create and initialize reader. */
347 r = pool_create_container (struct sfm_reader, pool);
353 r->opcode_idx = sizeof r->opcodes;
354 r->corruption_warning = false;
356 info = infop ? infop : xmalloc (sizeof *info);
357 memset (info, 0, sizeof *info);
359 /* TRANSLATORS: this fragment will be interpolated into
360 messages in fh_lock() that identify types of files. */
361 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
365 r->file = fn_open (fh_get_file_name (fh), "rb");
368 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
369 fh_get_file_name (r->fh), strerror (errno));
373 if (setjmp (r->bail_out))
377 read_header (r, info, &header);
380 n_vars = allocated_vars = 0;
383 n_labels = allocated_labels = 0;
387 memset (extensions, 0, sizeof extensions);
397 read_int (r); /* Skip filler. */
404 if (n_vars >= allocated_vars)
405 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
407 read_variable_record (r, &vars[n_vars++]);
411 if (n_labels >= allocated_labels)
412 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
414 read_value_label_record (r, &labels[n_labels++], n_vars);
418 /* A Type 4 record is always immediately after a type 3 record,
419 so the code for type 3 records reads the type 4 record too. */
420 sys_error (r, r->pos, _("Misplaced type 4 record."));
423 if (document != NULL)
424 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
425 document = read_document_record (r);
429 subtype = read_int (r);
430 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
433 _("Unrecognized record type 7, subtype %d. Please "
434 "send a copy of this file, and the syntax which "
435 "created it to %s."),
436 subtype, PACKAGE_BUGREPORT);
437 skip_extension_record (r, subtype);
439 else if (extensions[subtype] != NULL)
442 _("Record type 7, subtype %d found here has the same "
443 "type as the record found near offset 0x%llx. "
444 "Please send a copy of this file, and the syntax "
445 "which created it to %s."),
446 subtype, (long long int) extensions[subtype]->pos,
448 skip_extension_record (r, subtype);
451 extensions[subtype] = read_extension_record (r, subtype);
455 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
460 /* Now actually parse what we read.
462 First, figure out the correct character encoding, because this determines
463 how the rest of the header data is to be interpreted. */
464 dict = dict_create (encoding
466 : choose_encoding (r, &header, extensions[EXT_INTEGER],
467 extensions[EXT_ENCODING]));
468 r->encoding = dict_get_encoding (dict);
470 /* These records don't use variables at all. */
471 if (document != NULL)
472 parse_document (dict, document);
474 if (extensions[EXT_INTEGER] != NULL)
475 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
477 if (extensions[EXT_FLOAT] != NULL)
478 parse_machine_float_info (r, extensions[EXT_FLOAT]);
480 if (extensions[EXT_FILE_ATTRS] != NULL)
481 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
483 parse_header (r, &header, info, dict);
485 /* Parse the variable records, the basis of almost everything else. */
486 parse_variable_records (r, dict, vars, n_vars);
488 /* Parse value labels and the weight variable immediately after the variable
489 records. These records use indexes into var_recs[], so we must parse them
490 before those indexes become invalidated by very long string variables. */
491 for (i = 0; i < n_labels; i++)
492 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
493 if (header.weight_idx != 0)
495 struct variable *weight_var;
497 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
499 if (var_is_numeric (weight_var))
500 dict_set_weight (dict, weight_var);
502 sys_error (r, -1, _("Weighting variable must be numeric "
503 "(not string variable `%s')."),
504 var_get_name (weight_var));
507 if (extensions[EXT_DISPLAY] != NULL)
508 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
510 /* The following records use short names, so they need to be parsed before
511 parse_long_var_name_map() changes short names to long names. */
512 if (extensions[EXT_MRSETS] != NULL)
513 parse_mrsets (r, extensions[EXT_MRSETS], dict);
515 if (extensions[EXT_MRSETS2] != NULL)
516 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
518 if (extensions[EXT_LONG_STRINGS] != NULL)
519 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
521 /* Now rename variables to their long names. */
522 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
524 /* The following records use long names, so they need to follow renaming. */
525 if (extensions[EXT_VAR_ATTRS] != NULL)
526 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
528 if (extensions[EXT_LONG_LABELS] != NULL)
529 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
531 /* Warn if the actual amount of data per case differs from the
532 amount that the header claims. SPSS version 13 gets this
533 wrong when very long strings are involved, so don't warn in
535 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
536 && info->version_major != 13)
537 sys_warn (r, -1, _("File header claims %d variable positions but "
538 "%zu were read from file."),
539 header.nominal_case_size, n_vars);
541 /* Create an index of dictionary variable widths for
542 sfm_read_case to use. We cannot use the `struct variable's
543 from the dictionary we created, because the caller owns the
544 dictionary and may destroy or modify its variables. */
545 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
546 pool_register (r->pool, free, r->sfm_vars);
547 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
552 sfm_read_info_destroy (info);
556 return casereader_create_sequential
558 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
559 &sys_file_casereader_class, r);
564 sfm_read_info_destroy (info);
574 /* Closes a system file after we're done with it.
575 Returns true if an I/O error has occurred on READER, false
578 close_reader (struct sfm_reader *r)
587 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
589 msg (ME, _("Error closing system file `%s': %s."),
590 fh_get_file_name (r->fh), strerror (errno));
600 pool_destroy (r->pool);
605 /* Destroys READER. */
607 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
609 struct sfm_reader *r = r_;
613 /* Returns true if FILE is an SPSS system file,
616 sfm_detect (FILE *file)
620 if (fread (magic, 4, 1, file) != 1)
624 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
627 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
628 except for the string fields in *INFO, which parse_header() will initialize
629 later once the file's encoding is known. */
631 read_header (struct sfm_reader *r, struct sfm_read_info *info,
632 struct sfm_header_record *header)
634 uint8_t raw_layout_code[4];
637 read_string (r, header->magic, sizeof header->magic);
638 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
640 if (strcmp (ASCII_MAGIC, header->magic)
641 && strcmp (EBCDIC_MAGIC, header->magic))
642 sys_error (r, 0, _("This is not an SPSS system file."));
644 /* Identify integer format. */
645 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
646 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
648 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
650 || (r->integer_format != INTEGER_MSB_FIRST
651 && r->integer_format != INTEGER_LSB_FIRST))
652 sys_error (r, 64, _("This is not an SPSS system file."));
654 header->nominal_case_size = read_int (r);
655 if (header->nominal_case_size < 0
656 || header->nominal_case_size > INT_MAX / 16)
657 header->nominal_case_size = -1;
659 r->compressed = read_int (r) != 0;
661 header->weight_idx = read_int (r);
663 r->case_cnt = read_int (r);
664 if ( r->case_cnt > INT_MAX / 2)
667 /* Identify floating-point format and obtain compression bias. */
668 read_bytes (r, raw_bias, sizeof raw_bias);
669 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
671 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
673 if (memcmp (raw_bias, zero_bias, 8))
674 sys_warn (r, r->pos - 8,
675 _("Compression bias is not the usual "
676 "value of 100, or system file uses unrecognized "
677 "floating-point format."));
680 /* Some software is known to write all-zeros to this
681 field. Such software also writes floating-point
682 numbers in the format that we expect by default
683 (it seems that all software most likely does, in
684 reality), so don't warn in this case. */
687 if (r->integer_format == INTEGER_MSB_FIRST)
688 r->float_format = FLOAT_IEEE_DOUBLE_BE;
690 r->float_format = FLOAT_IEEE_DOUBLE_LE;
692 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
694 read_string (r, header->creation_date, sizeof header->creation_date);
695 read_string (r, header->creation_time, sizeof header->creation_time);
696 read_string (r, header->file_label, sizeof header->file_label);
699 info->integer_format = r->integer_format;
700 info->float_format = r->float_format;
701 info->compressed = r->compressed;
702 info->case_cnt = r->case_cnt;
705 /* Reads a variable (type 2) record from R into RECORD. */
707 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
709 int has_variable_label;
711 memset (record, 0, sizeof *record);
713 record->pos = r->pos;
714 record->width = read_int (r);
715 has_variable_label = read_int (r);
716 record->missing_value_code = read_int (r);
717 record->print_format = read_int (r);
718 record->write_format = read_int (r);
719 read_bytes (r, record->name, sizeof record->name);
721 if (has_variable_label == 1)
723 enum { MAX_LABEL_LEN = 255 };
724 size_t len, read_len;
728 /* Read up to MAX_LABEL_LEN bytes of label. */
729 read_len = MIN (MAX_LABEL_LEN, len);
730 record->label = pool_malloc (r->pool, read_len + 1);
731 read_string (r, record->label, read_len + 1);
733 /* Skip unread label bytes. */
734 skip_bytes (r, len - read_len);
736 /* Skip label padding up to multiple of 4 bytes. */
737 skip_bytes (r, ROUND_UP (len, 4) - len);
739 else if (has_variable_label != 0)
740 sys_error (r, record->pos,
741 _("Variable label indicator field is not 0 or 1."));
743 /* Set missing values. */
744 if (record->missing_value_code != 0)
746 int code = record->missing_value_code;
747 if (record->width == 0)
749 if (code < -3 || code > 3 || code == -1)
750 sys_error (r, record->pos,
751 _("Numeric missing value indicator field is not "
752 "-3, -2, 0, 1, 2, or 3."));
756 if (code < 1 || code > 3)
757 sys_error (r, record->pos,
758 _("String missing value indicator field is not "
762 read_bytes (r, record->missing, 8 * abs (code));
766 /* Reads value labels from R into RECORD. */
768 read_value_label_record (struct sfm_reader *r,
769 struct sfm_value_label_record *record,
774 /* Read type 3 record. */
775 record->pos = r->pos;
776 record->n_labels = read_int (r);
777 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
778 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
780 record->labels = pool_nmalloc (r->pool, record->n_labels,
781 sizeof *record->labels);
782 for (i = 0; i < record->n_labels; i++)
784 struct sfm_value_label *label = &record->labels[i];
785 unsigned char label_len;
788 read_bytes (r, label->value, sizeof label->value);
790 /* Read label length. */
791 read_bytes (r, &label_len, sizeof label_len);
792 padded_len = ROUND_UP (label_len + 1, 8);
794 /* Read label, padding. */
795 label->label = pool_malloc (r->pool, padded_len + 1);
796 read_bytes (r, label->label, padded_len - 1);
797 label->label[label_len] = '\0';
800 /* Read record type of type 4 record. */
801 if (read_int (r) != 4)
802 sys_error (r, r->pos - 4,
803 _("Variable index record (type 4) does not immediately "
804 "follow value label record (type 3) as it should."));
806 /* Read number of variables associated with value label from type 4
808 record->n_vars = read_int (r);
809 if (record->n_vars < 1 || record->n_vars > n_vars)
810 sys_error (r, r->pos - 4,
811 _("Number of variables associated with a value label (%zu) "
812 "is not between 1 and the number of variables (%zu)."),
813 record->n_vars, n_vars);
814 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
815 for (i = 0; i < record->n_vars; i++)
816 record->vars[i] = read_int (r);
819 /* Reads a document record from R and returns it. */
820 static struct sfm_document_record *
821 read_document_record (struct sfm_reader *r)
823 struct sfm_document_record *record;
826 record = pool_malloc (r->pool, sizeof *record);
827 record->pos = r->pos;
829 n_lines = read_int (r);
830 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
831 sys_error (r, record->pos,
832 _("Number of document lines (%d) "
833 "must be greater than 0 and less than %d."),
834 n_lines, INT_MAX / DOC_LINE_LENGTH);
836 record->n_lines = n_lines;
837 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
838 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
844 read_extension_record_header (struct sfm_reader *r, int subtype,
845 struct sfm_extension_record *record)
847 record->pos = r->pos;
848 record->size = read_int (r);
849 record->count = read_int (r);
851 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
852 allows an extra byte for a null terminator, used by some
853 extension processing routines. */
854 if (record->size != 0
855 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
856 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
859 /* Reads an extension record from R into RECORD. */
860 static struct sfm_extension_record *
861 read_extension_record (struct sfm_reader *r, int subtype)
863 struct extension_record_type
870 static const struct extension_record_type types[] =
872 /* Implemented record types. */
873 { EXT_INTEGER, 4, 8 },
875 { EXT_MRSETS, 1, 0 },
876 { EXT_DISPLAY, 4, 0 },
877 { EXT_LONG_NAMES, 1, 0 },
878 { EXT_LONG_STRINGS, 1, 0 },
879 { EXT_NCASES, 8, 2 },
880 { EXT_FILE_ATTRS, 1, 0 },
881 { EXT_VAR_ATTRS, 1, 0 },
882 { EXT_MRSETS2, 1, 0 },
883 { EXT_ENCODING, 1, 0 },
884 { EXT_LONG_LABELS, 1, 0 },
886 /* Ignored record types. */
887 { EXT_VAR_SETS, 0, 0 },
889 { EXT_DATA_ENTRY, 0, 0 },
892 const struct extension_record_type *type;
893 struct sfm_extension_record *record;
896 record = pool_malloc (r->pool, sizeof *record);
897 read_extension_record_header (r, subtype, record);
898 n_bytes = record->count * record->size;
900 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
901 if (subtype == type->subtype)
903 if (type->size > 0 && record->size != type->size)
904 sys_warn (r, record->pos,
905 _("Record type 7, subtype %d has bad size %zu "
906 "(expected %d)."), subtype, record->size, type->size);
907 else if (type->count > 0 && record->count != type->count)
908 sys_warn (r, record->pos,
909 _("Record type 7, subtype %d has bad count %zu "
910 "(expected %d)."), subtype, record->count, type->count);
911 else if (type->count == 0 && type->size == 0)
913 /* Ignore this record. */
917 char *data = pool_malloc (r->pool, n_bytes + 1);
918 data[n_bytes] = '\0';
921 read_bytes (r, record->data, n_bytes);
928 sys_warn (r, record->pos,
929 _("Unrecognized record type 7, subtype %d. Please send a "
930 "copy of this file, and the syntax which created it to %s."),
931 subtype, PACKAGE_BUGREPORT);
934 skip_bytes (r, n_bytes);
939 skip_extension_record (struct sfm_reader *r, int subtype)
941 struct sfm_extension_record record;
943 read_extension_record_header (r, subtype, &record);
944 skip_bytes (r, record.count * record.size);
948 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
949 struct sfm_read_info *info, struct dictionary *dict)
951 const char *dict_encoding = dict_get_encoding (dict);
952 struct substring product;
953 struct substring label;
955 /* Convert file label to UTF-8 and put it into DICT. */
956 label = recode_substring_pool ("UTF-8", dict_encoding,
957 ss_cstr (header->file_label), r->pool);
958 ss_trim (&label, ss_cstr (" "));
959 label.string[label.length] = '\0';
960 dict_set_label (dict, label.string);
962 /* Put creation date and time in UTF-8 into INFO. */
963 info->creation_date = recode_string ("UTF-8", dict_encoding,
964 header->creation_date, -1);
965 info->creation_time = recode_string ("UTF-8", dict_encoding,
966 header->creation_time, -1);
968 /* Put product name into INFO, dropping eye-catcher string if present. */
969 product = recode_substring_pool ("UTF-8", dict_encoding,
970 ss_cstr (header->eye_catcher), r->pool);
971 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
972 ss_trim (&product, ss_cstr (" "));
973 info->product = ss_xstrdup (product);
976 /* Reads a variable (type 2) record from R and adds the
977 corresponding variable to DICT.
978 Also skips past additional variable records for long string
981 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
982 struct sfm_var_record *var_recs, size_t n_var_recs)
984 const char *dict_encoding = dict_get_encoding (dict);
985 struct sfm_var_record *rec;
988 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
990 struct variable *var;
995 name = recode_string_pool ("UTF-8", dict_encoding,
996 rec->name, 8, r->pool);
997 name[strcspn (name, " ")] = '\0';
999 if (!dict_id_is_valid (dict, name, false)
1000 || name[0] == '$' || name[0] == '#')
1001 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1003 if (rec->width < 0 || rec->width > 255)
1004 sys_error (r, rec->pos,
1005 _("Bad width %d for variable %s."), rec->width, name);
1007 var = rec->var = dict_create_var (dict, name, rec->width);
1009 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1011 /* Set the short name the same as the long name. */
1012 var_set_short_name (var, 0, name);
1014 /* Get variable label, if any. */
1019 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1020 rec->label, -1, r->pool);
1021 var_set_label (var, utf8_label, false);
1024 /* Set missing values. */
1025 if (rec->missing_value_code != 0)
1027 int width = var_get_width (var);
1028 struct missing_values mv;
1030 mv_init_pool (r->pool, &mv, width);
1031 if (var_is_numeric (var))
1033 bool has_range = rec->missing_value_code < 0;
1034 int n_discrete = (has_range
1035 ? rec->missing_value_code == -3
1036 : rec->missing_value_code);
1041 double low = parse_float (r, rec->missing, 0);
1042 double high = parse_float (r, rec->missing, 8);
1044 /* Deal with SPSS 21 change in representation. */
1048 mv_add_range (&mv, low, high);
1052 for (i = 0; i < n_discrete; i++)
1054 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1062 value_init_pool (r->pool, &value, width);
1063 value_set_missing (&value, width);
1064 for (i = 0; i < rec->missing_value_code; i++)
1066 uint8_t *s = value_str_rw (&value, width);
1067 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1068 mv_add_str (&mv, s);
1071 var_set_missing_values (var, &mv);
1075 parse_format_spec (r, rec->pos + 12, rec->print_format,
1076 PRINT_FORMAT, var, &n_warnings);
1077 parse_format_spec (r, rec->pos + 16, rec->write_format,
1078 WRITE_FORMAT, var, &n_warnings);
1080 /* Account for values.
1081 Skip long string continuation records, if any. */
1082 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1083 for (i = 1; i < n_values; i++)
1084 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1085 sys_error (r, rec->pos, _("Missing string continuation record."));
1090 /* Translates the format spec from sysfile format to internal
1093 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1094 enum which_format which, struct variable *v,
1097 const int max_warnings = 8;
1098 uint8_t raw_type = format >> 16;
1099 uint8_t w = format >> 8;
1108 ok = (fmt_from_io (raw_type, &f.type)
1109 && fmt_check_output (&f)
1110 && fmt_check_width_compat (&f, var_get_width (v)));
1115 if (which == PRINT_FORMAT)
1116 var_set_print_format (v, &f);
1118 var_set_write_format (v, &f);
1120 else if (format == 0)
1122 /* Actually observed in the wild. No point in warning about it. */
1124 else if (++*n_warnings <= max_warnings)
1126 if (which == PRINT_FORMAT)
1127 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1129 var_get_name (v), var_get_width (v), format);
1131 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1133 var_get_name (v), var_get_width (v), format);
1135 if (*n_warnings == max_warnings)
1136 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1141 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1145 for (p = record->documents;
1146 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1147 p += DOC_LINE_LENGTH)
1149 struct substring line;
1151 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1152 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1153 ss_rtrim (&line, ss_cstr (" "));
1154 line.string[line.length] = '\0';
1156 dict_add_document_line (dict, line.string, false);
1162 /* Parses record type 7, subtype 3. */
1164 parse_machine_integer_info (struct sfm_reader *r,
1165 const struct sfm_extension_record *record,
1166 struct sfm_read_info *info)
1168 int float_representation, expected_float_format;
1169 int integer_representation, expected_integer_format;
1171 /* Save version info. */
1172 info->version_major = parse_int (r, record->data, 0);
1173 info->version_minor = parse_int (r, record->data, 4);
1174 info->version_revision = parse_int (r, record->data, 8);
1176 /* Check floating point format. */
1177 float_representation = parse_int (r, record->data, 16);
1178 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1179 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1180 expected_float_format = 1;
1181 else if (r->float_format == FLOAT_Z_LONG)
1182 expected_float_format = 2;
1183 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1184 expected_float_format = 3;
1187 if (float_representation != expected_float_format)
1188 sys_error (r, record->pos, _("Floating-point representation indicated by "
1189 "system file (%d) differs from expected (%d)."),
1190 float_representation, expected_float_format);
1192 /* Check integer format. */
1193 integer_representation = parse_int (r, record->data, 24);
1194 if (r->integer_format == INTEGER_MSB_FIRST)
1195 expected_integer_format = 1;
1196 else if (r->integer_format == INTEGER_LSB_FIRST)
1197 expected_integer_format = 2;
1200 if (integer_representation != expected_integer_format)
1201 sys_warn (r, record->pos,
1202 _("Integer format indicated by system file (%d) "
1203 "differs from expected (%d)."),
1204 integer_representation, expected_integer_format);
1209 choose_encoding (struct sfm_reader *r,
1210 const struct sfm_header_record *header,
1211 const struct sfm_extension_record *ext_integer,
1212 const struct sfm_extension_record *ext_encoding)
1214 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1217 return ext_encoding->data;
1219 /* But EXT_INTEGER is better than nothing as a fallback. */
1222 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1223 const char *encoding;
1232 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1233 respectively. However, there are known to be many files in the wild
1234 with character code 2, yet have data which are clearly not ASCII.
1235 Therefore we ignore these values. */
1242 encoding = sys_get_encoding_from_codepage (codepage);
1243 if (encoding != NULL)
1249 /* If the file magic number is EBCDIC then its character data is too. */
1250 if (!strcmp (header->magic, EBCDIC_MAGIC))
1253 return locale_charset ();
1256 /* Parses record type 7, subtype 4. */
1258 parse_machine_float_info (struct sfm_reader *r,
1259 const struct sfm_extension_record *record)
1261 double sysmis = parse_float (r, record->data, 0);
1262 double highest = parse_float (r, record->data, 8);
1263 double lowest = parse_float (r, record->data, 16);
1265 if (sysmis != SYSMIS)
1266 sys_warn (r, record->pos,
1267 _("File specifies unexpected value %g (%a) as %s, "
1268 "instead of %g (%a)."),
1269 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1271 if (highest != HIGHEST)
1272 sys_warn (r, record->pos,
1273 _("File specifies unexpected value %g (%a) as %s, "
1274 "instead of %g (%a)."),
1275 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1277 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1278 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1279 appears in a context (missing values) where SYSMIS cannot. */
1280 if (lowest != LOWEST && lowest != SYSMIS)
1281 sys_warn (r, record->pos,
1282 _("File specifies unexpected value %g (%a) as %s, "
1283 "instead of %g (%a) or %g (%a)."),
1284 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1287 /* Parses record type 7, subtype 7 or 19. */
1289 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1290 struct dictionary *dict)
1292 struct text_record *text;
1293 struct mrset *mrset;
1295 text = open_text_record (r, record, false);
1298 const char *counted = NULL;
1301 struct stringi_set var_names;
1302 size_t allocated_vars;
1306 mrset = xzalloc (sizeof *mrset);
1308 name = text_get_token (text, ss_cstr ("="), NULL);
1311 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1313 if (mrset->name[0] != '$')
1315 sys_warn (r, record->pos,
1316 _("`%s' does not begin with `$' at offset %zu "
1317 "in MRSETS record."), mrset->name, text_pos (text));
1321 if (text_match (text, 'C'))
1323 mrset->type = MRSET_MC;
1324 if (!text_match (text, ' '))
1326 sys_warn (r, record->pos,
1327 _("Missing space following `%c' at offset %zu "
1328 "in MRSETS record."), 'C', text_pos (text));
1332 else if (text_match (text, 'D'))
1334 mrset->type = MRSET_MD;
1335 mrset->cat_source = MRSET_VARLABELS;
1337 else if (text_match (text, 'E'))
1341 mrset->type = MRSET_MD;
1342 mrset->cat_source = MRSET_COUNTEDVALUES;
1343 if (!text_match (text, ' '))
1345 sys_warn (r, record->pos,
1346 _("Missing space following `%c' at offset %zu "
1347 "in MRSETS record."), 'E', text_pos (text));
1351 number = text_get_token (text, ss_cstr (" "), NULL);
1352 if (!strcmp (number, "11"))
1353 mrset->label_from_var_label = true;
1354 else if (strcmp (number, "1"))
1355 sys_warn (r, record->pos,
1356 _("Unexpected label source value `%s' following `E' "
1357 "at offset %zu in MRSETS record."),
1358 number, text_pos (text));
1362 sys_warn (r, record->pos,
1363 _("Missing `C', `D', or `E' at offset %zu "
1364 "in MRSETS record."),
1369 if (mrset->type == MRSET_MD)
1371 counted = text_parse_counted_string (r, text);
1372 if (counted == NULL)
1376 label = text_parse_counted_string (r, text);
1379 if (label[0] != '\0')
1380 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1382 stringi_set_init (&var_names);
1387 const char *raw_var_name;
1388 struct variable *var;
1391 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1392 if (raw_var_name == NULL)
1394 sys_warn (r, record->pos,
1395 _("Missing new-line parsing variable names "
1396 "at offset %zu in MRSETS record."),
1400 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1402 var = dict_lookup_var (dict, var_name);
1408 if (!stringi_set_insert (&var_names, var_name))
1410 sys_warn (r, record->pos,
1411 _("Duplicate variable name %s "
1412 "at offset %zu in MRSETS record."),
1413 var_name, text_pos (text));
1419 if (mrset->label == NULL && mrset->label_from_var_label
1420 && var_has_label (var))
1421 mrset->label = xstrdup (var_get_label (var));
1424 && var_get_type (var) != var_get_type (mrset->vars[0]))
1426 sys_warn (r, record->pos,
1427 _("MRSET %s contains both string and "
1428 "numeric variables."), name);
1431 width = MIN (width, var_get_width (var));
1433 if (mrset->n_vars >= allocated_vars)
1434 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1435 sizeof *mrset->vars);
1436 mrset->vars[mrset->n_vars++] = var;
1438 while (delimiter != '\n');
1440 if (mrset->n_vars < 2)
1442 sys_warn (r, record->pos,
1443 _("MRSET %s has only %zu variables."), mrset->name,
1445 mrset_destroy (mrset);
1446 stringi_set_destroy (&var_names);
1450 if (mrset->type == MRSET_MD)
1452 mrset->width = width;
1453 value_init (&mrset->counted, width);
1455 mrset->counted.f = c_strtod (counted, NULL);
1457 value_copy_str_rpad (&mrset->counted, width,
1458 (const uint8_t *) counted, ' ');
1461 dict_add_mrset (dict, mrset);
1463 stringi_set_destroy (&var_names);
1465 mrset_destroy (mrset);
1466 close_text_record (r, text);
1469 /* Read record type 7, subtype 11, which specifies how variables
1470 should be displayed in GUI environments. */
1472 parse_display_parameters (struct sfm_reader *r,
1473 const struct sfm_extension_record *record,
1474 struct dictionary *dict)
1476 bool includes_width;
1477 bool warned = false;
1482 n_vars = dict_get_var_cnt (dict);
1483 if (record->count == 3 * n_vars)
1484 includes_width = true;
1485 else if (record->count == 2 * n_vars)
1486 includes_width = false;
1489 sys_warn (r, record->pos,
1490 _("Extension 11 has bad count %zu (for %zu variables)."),
1491 record->count, n_vars);
1496 for (i = 0; i < n_vars; ++i)
1498 struct variable *v = dict_get_var (dict, i);
1499 int measure, width, align;
1501 measure = parse_int (r, record->data, ofs);
1506 width = parse_int (r, record->data, ofs);
1512 align = parse_int (r, record->data, ofs);
1515 /* SPSS 14 sometimes seems to set string variables' measure
1517 if (0 == measure && var_is_alpha (v))
1520 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1523 sys_warn (r, record->pos,
1524 _("Invalid variable display parameters for variable "
1525 "%zu (%s). Default parameters substituted."),
1526 i, var_get_name (v));
1531 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1532 : measure == 2 ? MEASURE_ORDINAL
1534 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1535 : align == 1 ? ALIGN_RIGHT
1538 /* Older versions (SPSS 9.0) sometimes set the display
1539 width to zero. This causes confusion in the GUI, so
1540 only set the width if it is nonzero. */
1542 var_set_display_width (v, width);
1547 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1548 const char *new_name)
1550 size_t n_short_names;
1554 /* Renaming a variable may clear its short names, but we
1555 want to retain them, so we save them and re-set them
1557 n_short_names = var_get_short_name_cnt (var);
1558 short_names = xnmalloc (n_short_names, sizeof *short_names);
1559 for (i = 0; i < n_short_names; i++)
1561 const char *s = var_get_short_name (var, i);
1562 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1565 /* Set long name. */
1566 dict_rename_var (dict, var, new_name);
1568 /* Restore short names. */
1569 for (i = 0; i < n_short_names; i++)
1571 var_set_short_name (var, i, short_names[i]);
1572 free (short_names[i]);
1577 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1578 to each short name. Modifies variable names in DICT accordingly. */
1580 parse_long_var_name_map (struct sfm_reader *r,
1581 const struct sfm_extension_record *record,
1582 struct dictionary *dict)
1584 struct text_record *text;
1585 struct variable *var;
1590 /* There are no long variable names. Use the short variable names,
1591 converted to lowercase, as the long variable names. */
1594 for (i = 0; i < dict_get_var_cnt (dict); i++)
1596 struct variable *var = dict_get_var (dict, i);
1599 new_name = utf8_to_lower (var_get_name (var));
1600 rename_var_and_save_short_names (dict, var, new_name);
1607 /* Rename each of the variables, one by one. (In a correctly constructed
1608 system file, this cannot create any intermediate duplicate variable names,
1609 because all of the new variable names are longer than any of the old
1610 variable names and thus there cannot be any overlaps.) */
1611 text = open_text_record (r, record, true);
1612 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1614 /* Validate long name. */
1615 if (!dict_id_is_valid (dict, long_name, false))
1617 sys_warn (r, record->pos,
1618 _("Long variable mapping from %s to invalid "
1619 "variable name `%s'."),
1620 var_get_name (var), long_name);
1624 /* Identify any duplicates. */
1625 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1626 && dict_lookup_var (dict, long_name) != NULL)
1628 sys_warn (r, record->pos,
1629 _("Duplicate long variable name `%s'."), long_name);
1633 rename_var_and_save_short_names (dict, var, long_name);
1635 close_text_record (r, text);
1638 /* Reads record type 7, subtype 14, which gives the real length
1639 of each very long string. Rearranges DICT accordingly. */
1641 parse_long_string_map (struct sfm_reader *r,
1642 const struct sfm_extension_record *record,
1643 struct dictionary *dict)
1645 struct text_record *text;
1646 struct variable *var;
1649 text = open_text_record (r, record, true);
1650 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1652 size_t idx = var_get_dict_index (var);
1658 length = strtol (length_s, NULL, 10);
1659 if (length < 1 || length > MAX_STRING)
1661 sys_warn (r, record->pos,
1662 _("%s listed as string of invalid length %s "
1663 "in very long string record."),
1664 var_get_name (var), length_s);
1668 /* Check segments. */
1669 segment_cnt = sfm_width_to_segments (length);
1670 if (segment_cnt == 1)
1672 sys_warn (r, record->pos,
1673 _("%s listed in very long string record with width %s, "
1674 "which requires only one segment."),
1675 var_get_name (var), length_s);
1678 if (idx + segment_cnt > dict_get_var_cnt (dict))
1679 sys_error (r, record->pos,
1680 _("Very long string %s overflows dictionary."),
1681 var_get_name (var));
1683 /* Get the short names from the segments and check their
1685 for (i = 0; i < segment_cnt; i++)
1687 struct variable *seg = dict_get_var (dict, idx + i);
1688 int alloc_width = sfm_segment_alloc_width (length, i);
1689 int width = var_get_width (seg);
1692 var_set_short_name (var, i, var_get_short_name (seg, 0));
1693 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1694 sys_error (r, record->pos,
1695 _("Very long string with width %ld has segment %d "
1696 "of width %d (expected %d)."),
1697 length, i, width, alloc_width);
1699 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1700 var_set_width (var, length);
1702 close_text_record (r, text);
1703 dict_compact_values (dict);
1707 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1708 const struct sfm_var_record *var_recs, size_t n_var_recs,
1709 const struct sfm_value_label_record *record)
1711 struct variable **vars;
1715 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1716 for (i = 0; i < record->n_labels; i++)
1717 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1718 record->labels[i].label, -1,
1721 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1722 for (i = 0; i < record->n_vars; i++)
1723 vars[i] = lookup_var_by_index (r, record->pos,
1724 var_recs, n_var_recs, record->vars[i]);
1726 for (i = 1; i < record->n_vars; i++)
1727 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1728 sys_error (r, record->pos,
1729 _("Variables associated with value label are not all of "
1730 "identical type. Variable %s is %s, but variable "
1732 var_get_name (vars[0]),
1733 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1734 var_get_name (vars[i]),
1735 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1737 for (i = 0; i < record->n_vars; i++)
1739 struct variable *var = vars[i];
1743 width = var_get_width (var);
1745 sys_error (r, record->pos,
1746 _("Value labels may not be added to long string "
1747 "variables (e.g. %s) using records types 3 and 4."),
1748 var_get_name (var));
1750 for (j = 0; j < record->n_labels; j++)
1752 struct sfm_value_label *label = &record->labels[j];
1755 value_init (&value, width);
1757 value.f = parse_float (r, label->value, 0);
1759 memcpy (value_str_rw (&value, width), label->value, width);
1761 if (!var_add_value_label (var, &value, utf8_labels[j]))
1763 if (var_is_numeric (var))
1764 sys_warn (r, record->pos,
1765 _("Duplicate value label for %g on %s."),
1766 value.f, var_get_name (var));
1768 sys_warn (r, record->pos,
1769 _("Duplicate value label for `%.*s' on %s."),
1770 width, value_str (&value, width),
1771 var_get_name (var));
1774 value_destroy (&value, width);
1778 pool_free (r->pool, vars);
1779 for (i = 0; i < record->n_labels; i++)
1780 pool_free (r->pool, utf8_labels[i]);
1781 pool_free (r->pool, utf8_labels);
1784 static struct variable *
1785 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1786 const struct sfm_var_record *var_recs, size_t n_var_recs,
1789 const struct sfm_var_record *rec;
1791 if (idx < 1 || idx > n_var_recs)
1793 sys_error (r, offset,
1794 _("Variable index %d not in valid range 1...%zu."),
1799 rec = &var_recs[idx - 1];
1800 if (rec->var == NULL)
1802 sys_error (r, offset,
1803 _("Variable index %d refers to long string continuation."),
1811 /* Parses a set of custom attributes from TEXT into ATTRS.
1812 ATTRS may be a null pointer, in which case the attributes are
1813 read but discarded. */
1815 parse_attributes (struct sfm_reader *r, struct text_record *text,
1816 struct attrset *attrs)
1820 struct attribute *attr;
1824 /* Parse the key. */
1825 key = text_get_token (text, ss_cstr ("("), NULL);
1829 attr = attribute_create (key);
1830 for (index = 1; ; index++)
1832 /* Parse the value. */
1836 value = text_get_token (text, ss_cstr ("\n"), NULL);
1839 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1844 length = strlen (value);
1845 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1847 value[length - 1] = '\0';
1848 attribute_add_value (attr, value + 1);
1853 _("Attribute value %s[%d] is not quoted: %s."),
1855 attribute_add_value (attr, value);
1858 /* Was this the last value for this attribute? */
1859 if (text_match (text, ')'))
1863 attrset_add (attrs, attr);
1865 attribute_destroy (attr);
1867 while (!text_match (text, '/'));
1870 /* Reads record type 7, subtype 17, which lists custom
1871 attributes on the data file. */
1873 parse_data_file_attributes (struct sfm_reader *r,
1874 const struct sfm_extension_record *record,
1875 struct dictionary *dict)
1877 struct text_record *text = open_text_record (r, record, true);
1878 parse_attributes (r, text, dict_get_attributes (dict));
1879 close_text_record (r, text);
1882 /* Parses record type 7, subtype 18, which lists custom
1883 attributes on individual variables. */
1885 parse_variable_attributes (struct sfm_reader *r,
1886 const struct sfm_extension_record *record,
1887 struct dictionary *dict)
1889 struct text_record *text;
1890 struct variable *var;
1892 text = open_text_record (r, record, true);
1893 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1894 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1895 close_text_record (r, text);
1899 check_overflow (struct sfm_reader *r,
1900 const struct sfm_extension_record *record,
1901 size_t ofs, size_t length)
1903 size_t end = record->size * record->count;
1904 if (length >= end || ofs + length > end)
1905 sys_error (r, record->pos + end,
1906 _("Long string value label record ends unexpectedly."));
1910 parse_long_string_value_labels (struct sfm_reader *r,
1911 const struct sfm_extension_record *record,
1912 struct dictionary *dict)
1914 const char *dict_encoding = dict_get_encoding (dict);
1915 size_t end = record->size * record->count;
1922 struct variable *var;
1927 /* Parse variable name length. */
1928 check_overflow (r, record, ofs, 4);
1929 var_name_len = parse_int (r, record->data, ofs);
1932 /* Parse variable name, width, and number of labels. */
1933 check_overflow (r, record, ofs, var_name_len + 8);
1934 var_name = recode_string_pool ("UTF-8", dict_encoding,
1935 (const char *) record->data + ofs,
1936 var_name_len, r->pool);
1937 width = parse_int (r, record->data, ofs + var_name_len);
1938 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1939 ofs += var_name_len + 8;
1941 /* Look up 'var' and validate. */
1942 var = dict_lookup_var (dict, var_name);
1944 sys_warn (r, record->pos + ofs,
1945 _("Ignoring long string value record for "
1946 "unknown variable %s."), var_name);
1947 else if (var_is_numeric (var))
1949 sys_warn (r, record->pos + ofs,
1950 _("Ignoring long string value record for "
1951 "numeric variable %s."), var_name);
1954 else if (width != var_get_width (var))
1956 sys_warn (r, record->pos + ofs,
1957 _("Ignoring long string value record for variable %s "
1958 "because the record's width (%d) does not match the "
1959 "variable's width (%d)."),
1960 var_name, width, var_get_width (var));
1965 value_init_pool (r->pool, &value, width);
1966 for (i = 0; i < n_labels; i++)
1968 size_t value_length, label_length;
1969 bool skip = var == NULL;
1971 /* Parse value length. */
1972 check_overflow (r, record, ofs, 4);
1973 value_length = parse_int (r, record->data, ofs);
1977 check_overflow (r, record, ofs, value_length);
1980 if (value_length == width)
1981 memcpy (value_str_rw (&value, width),
1982 (const uint8_t *) record->data + ofs, width);
1985 sys_warn (r, record->pos + ofs,
1986 _("Ignoring long string value %zu for variable "
1987 "%s, with width %d, that has bad value "
1989 i, var_get_name (var), width, value_length);
1993 ofs += value_length;
1995 /* Parse label length. */
1996 check_overflow (r, record, ofs, 4);
1997 label_length = parse_int (r, record->data, ofs);
2001 check_overflow (r, record, ofs, label_length);
2006 label = recode_string_pool ("UTF-8", dict_encoding,
2007 (const char *) record->data + ofs,
2008 label_length, r->pool);
2009 if (!var_add_value_label (var, &value, label))
2010 sys_warn (r, record->pos + ofs,
2011 _("Duplicate value label for `%.*s' on %s."),
2012 width, value_str (&value, width),
2013 var_get_name (var));
2014 pool_free (r->pool, label);
2016 ofs += label_length;
2023 static void partial_record (struct sfm_reader *r)
2026 static void read_error (struct casereader *, const struct sfm_reader *);
2028 static bool read_case_number (struct sfm_reader *, double *);
2029 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2030 static int read_opcode (struct sfm_reader *);
2031 static bool read_compressed_number (struct sfm_reader *, double *);
2032 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2033 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2034 static bool skip_whole_strings (struct sfm_reader *, size_t);
2036 /* Reads and returns one case from READER's file. Returns a null
2037 pointer if not successful. */
2038 static struct ccase *
2039 sys_file_casereader_read (struct casereader *reader, void *r_)
2041 struct sfm_reader *r = r_;
2042 struct ccase *volatile c;
2048 c = case_create (r->proto);
2049 if (setjmp (r->bail_out))
2051 casereader_force_error (reader);
2056 for (i = 0; i < r->sfm_var_cnt; i++)
2058 struct sfm_var *sv = &r->sfm_vars[i];
2059 union value *v = case_data_rw_idx (c, sv->case_index);
2061 if (sv->var_width == 0)
2063 if (!read_case_number (r, &v->f))
2068 uint8_t *s = value_str_rw (v, sv->var_width);
2069 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2071 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2080 if (r->case_cnt != -1)
2081 read_error (reader, r);
2086 /* Issues an error that R ends in a partial record. */
2088 partial_record (struct sfm_reader *r)
2090 sys_error (r, r->pos, _("File ends in partial case."));
2093 /* Issues an error that an unspecified error occurred SFM, and
2096 read_error (struct casereader *r, const struct sfm_reader *sfm)
2098 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2099 casereader_force_error (r);
2102 /* Reads a number from R and stores its value in *D.
2103 If R is compressed, reads a compressed number;
2104 otherwise, reads a number in the regular way.
2105 Returns true if successful, false if end of file is
2106 reached immediately. */
2108 read_case_number (struct sfm_reader *r, double *d)
2113 if (!try_read_bytes (r, number, sizeof number))
2115 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2119 return read_compressed_number (r, d);
2122 /* Reads LENGTH string bytes from R into S.
2123 Always reads a multiple of 8 bytes; if LENGTH is not a
2124 multiple of 8, then extra bytes are read and discarded without
2126 Reads compressed strings if S is compressed.
2127 Returns true if successful, false if end of file is
2128 reached immediately. */
2130 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2132 size_t whole = ROUND_DOWN (length, 8);
2133 size_t partial = length % 8;
2137 if (!read_whole_strings (r, s, whole))
2144 if (!read_whole_strings (r, bounce, sizeof bounce))
2150 memcpy (s + whole, bounce, partial);
2156 /* Reads and returns the next compression opcode from R. */
2158 read_opcode (struct sfm_reader *r)
2160 assert (r->compressed);
2164 if (r->opcode_idx >= sizeof r->opcodes)
2166 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2170 opcode = r->opcodes[r->opcode_idx++];
2177 /* Reads a compressed number from R and stores its value in D.
2178 Returns true if successful, false if end of file is
2179 reached immediately. */
2181 read_compressed_number (struct sfm_reader *r, double *d)
2183 int opcode = read_opcode (r);
2191 *d = read_float (r);
2195 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2196 if (!r->corruption_warning)
2198 r->corruption_warning = true;
2199 sys_warn (r, r->pos,
2200 _("Possible compressed data corruption: "
2201 "compressed spaces appear in numeric field."));
2210 *d = opcode - r->bias;
2217 /* Reads a compressed 8-byte string segment from R and stores it
2219 Returns true if successful, false if end of file is
2220 reached immediately. */
2222 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2224 int opcode = read_opcode (r);
2232 read_bytes (r, dst, 8);
2236 memset (dst, ' ', 8);
2241 double value = opcode - r->bias;
2242 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2245 /* This has actually been seen "in the wild". The submitter of the
2246 file that showed that the contents decoded as spaces, but they
2247 were at the end of the field so it's possible that the null
2248 bytes just acted as null terminators. */
2250 else if (!r->corruption_warning)
2252 r->corruption_warning = true;
2253 sys_warn (r, r->pos,
2254 _("Possible compressed data corruption: "
2255 "string contains compressed integer (opcode %d)."),
2265 /* Reads LENGTH string bytes from R into S.
2266 LENGTH must be a multiple of 8.
2267 Reads compressed strings if S is compressed.
2268 Returns true if successful, false if end of file is
2269 reached immediately. */
2271 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2273 assert (length % 8 == 0);
2275 return try_read_bytes (r, s, length);
2279 for (ofs = 0; ofs < length; ofs += 8)
2280 if (!read_compressed_string (r, s + ofs))
2290 /* Skips LENGTH string bytes from R.
2291 LENGTH must be a multiple of 8.
2292 (LENGTH is also limited to 1024, but that's only because the
2293 current caller never needs more than that many bytes.)
2294 Returns true if successful, false if end of file is
2295 reached immediately. */
2297 skip_whole_strings (struct sfm_reader *r, size_t length)
2299 uint8_t buffer[1024];
2300 assert (length < sizeof buffer);
2301 return read_whole_strings (r, buffer, length);
2304 /* Helpers for reading records that contain structured text
2307 /* Maximum number of warnings to issue for a single text
2309 #define MAX_TEXT_WARNINGS 5
2314 struct substring buffer; /* Record contents. */
2315 off_t start; /* Starting offset in file. */
2316 size_t pos; /* Current position in buffer. */
2317 int n_warnings; /* Number of warnings issued or suppressed. */
2318 bool recoded; /* Recoded into UTF-8? */
2321 static struct text_record *
2322 open_text_record (struct sfm_reader *r,
2323 const struct sfm_extension_record *record,
2324 bool recode_to_utf8)
2326 struct text_record *text;
2327 struct substring raw;
2329 text = pool_alloc (r->pool, sizeof *text);
2330 raw = ss_buffer (record->data, record->size * record->count);
2331 text->start = record->pos;
2332 text->buffer = (recode_to_utf8
2333 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2336 text->n_warnings = 0;
2337 text->recoded = recode_to_utf8;
2342 /* Closes TEXT, frees its storage, and issues a final warning
2343 about suppressed warnings if necesary. */
2345 close_text_record (struct sfm_reader *r, struct text_record *text)
2347 if (text->n_warnings > MAX_TEXT_WARNINGS)
2348 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2349 text->n_warnings - MAX_TEXT_WARNINGS);
2351 pool_free (r->pool, ss_data (text->buffer));
2354 /* Reads a variable=value pair from TEXT.
2355 Looks up the variable in DICT and stores it into *VAR.
2356 Stores a null-terminated value into *VALUE. */
2358 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2359 struct text_record *text,
2360 struct variable **var, char **value)
2364 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2367 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2371 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2372 ss_buffer ("\t\0", 2));
2380 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2381 struct text_record *text, struct substring delimiters,
2382 struct variable **var)
2386 name = text_get_token (text, delimiters, NULL);
2390 *var = dict_lookup_var (dict, name);
2394 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2401 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2402 struct text_record *text, struct substring delimiters,
2403 struct variable **var)
2405 char *short_name = text_get_token (text, delimiters, NULL);
2406 if (short_name == NULL)
2409 *var = dict_lookup_var (dict, short_name);
2411 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2416 /* Displays a warning for the current file position, limiting the
2417 number to MAX_TEXT_WARNINGS for TEXT. */
2419 text_warn (struct sfm_reader *r, struct text_record *text,
2420 const char *format, ...)
2422 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2426 va_start (args, format);
2427 sys_msg (r, text->start + text->pos, MW, format, args);
2433 text_get_token (struct text_record *text, struct substring delimiters,
2436 struct substring token;
2439 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2442 end = &ss_data (token)[ss_length (token)];
2443 if (delimiter != NULL)
2446 return ss_data (token);
2449 /* Reads a integer value expressed in decimal, then a space, then a string that
2450 consists of exactly as many bytes as specified by the integer, then a space,
2451 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2452 buffer (so the caller should not free the string). */
2454 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2462 while (text->pos < text->buffer.length)
2464 int c = text->buffer.string[text->pos];
2465 if (c < '0' || c > '9')
2467 n = (n * 10) + (c - '0');
2470 if (text->pos >= text->buffer.length || start == text->pos)
2472 sys_warn (r, text->start,
2473 _("Expecting digit at offset %zu in MRSETS record."),
2478 if (!text_match (text, ' '))
2480 sys_warn (r, text->start,
2481 _("Expecting space at offset %zu in MRSETS record."),
2486 if (text->pos + n > text->buffer.length)
2488 sys_warn (r, text->start,
2489 _("%zu-byte string starting at offset %zu "
2490 "exceeds record length %zu."),
2491 n, text->pos, text->buffer.length);
2495 s = &text->buffer.string[text->pos];
2498 sys_warn (r, text->start,
2499 _("Expecting space at offset %zu following %zu-byte string."),
2509 text_match (struct text_record *text, char c)
2511 if (text->buffer.string[text->pos] == c)
2520 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2521 inside the TEXT's string. */
2523 text_pos (const struct text_record *text)
2530 /* Displays a corruption message. */
2532 sys_msg (struct sfm_reader *r, off_t offset,
2533 int class, const char *format, va_list args)
2538 ds_init_empty (&text);
2540 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2541 fh_get_file_name (r->fh), (long long int) offset);
2543 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2544 ds_put_vformat (&text, format, args);
2546 m.category = msg_class_to_category (class);
2547 m.severity = msg_class_to_severity (class);
2553 m.text = ds_cstr (&text);
2558 /* Displays a warning for offset OFFSET in the file. */
2560 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2564 va_start (args, format);
2565 sys_msg (r, offset, MW, format, args);
2569 /* Displays an error for the current file position,
2570 marks it as in an error state,
2571 and aborts reading it using longjmp. */
2573 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2577 va_start (args, format);
2578 sys_msg (r, offset, ME, format, args);
2582 longjmp (r->bail_out, 1);
2585 /* Reads BYTE_CNT bytes into BUF.
2586 Returns true if exactly BYTE_CNT bytes are successfully read.
2587 Aborts if an I/O error or a partial read occurs.
2588 If EOF_IS_OK, then an immediate end-of-file causes false to be
2589 returned; otherwise, immediate end-of-file causes an abort
2592 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2593 void *buf, size_t byte_cnt)
2595 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2596 r->pos += bytes_read;
2597 if (bytes_read == byte_cnt)
2599 else if (ferror (r->file))
2600 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2601 else if (!eof_is_ok || bytes_read != 0)
2602 sys_error (r, r->pos, _("Unexpected end of file."));
2607 /* Reads BYTE_CNT into BUF.
2608 Aborts upon I/O error or if end-of-file is encountered. */
2610 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2612 read_bytes_internal (r, false, buf, byte_cnt);
2615 /* Reads BYTE_CNT bytes into BUF.
2616 Returns true if exactly BYTE_CNT bytes are successfully read.
2617 Returns false if an immediate end-of-file is encountered.
2618 Aborts if an I/O error or a partial read occurs. */
2620 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2622 return read_bytes_internal (r, true, buf, byte_cnt);
2625 /* Reads a 32-bit signed integer from R and returns its value in
2628 read_int (struct sfm_reader *r)
2631 read_bytes (r, integer, sizeof integer);
2632 return integer_get (r->integer_format, integer, sizeof integer);
2635 /* Reads a 64-bit floating-point number from R and returns its
2636 value in host format. */
2638 read_float (struct sfm_reader *r)
2641 read_bytes (r, number, sizeof number);
2642 return float_get_double (r->float_format, number);
2646 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2648 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2652 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2654 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2657 /* Reads exactly SIZE - 1 bytes into BUFFER
2658 and stores a null byte into BUFFER[SIZE - 1]. */
2660 read_string (struct sfm_reader *r, char *buffer, size_t size)
2663 read_bytes (r, buffer, size - 1);
2664 buffer[size - 1] = '\0';
2667 /* Skips BYTES bytes forward in R. */
2669 skip_bytes (struct sfm_reader *r, size_t bytes)
2674 size_t chunk = MIN (sizeof buffer, bytes);
2675 read_bytes (r, buffer, chunk);
2680 static const struct casereader_class sys_file_casereader_class =
2682 sys_file_casereader_read,
2683 sys_file_casereader_destroy,