1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
88 /* Fields from the top-level header record. */
89 struct sfm_header_record
91 char magic[5]; /* First 4 bytes of file, then null. */
92 int weight_idx; /* 0 if unweighted, otherwise a var index. */
93 int nominal_case_size; /* Number of var positions. */
95 /* These correspond to the members of struct sfm_file_info or a dictionary
96 but in the system file's encoding rather than ASCII. */
97 char creation_date[10]; /* "dd mmm yy". */
98 char creation_time[9]; /* "hh:mm:ss". */
99 char eye_catcher[61]; /* Eye-catcher string, then product name. */
100 char file_label[65]; /* File label. */
103 struct sfm_var_record
110 int missing_value_code;
113 struct variable *var;
116 struct sfm_value_label
122 struct sfm_value_label_record
125 struct sfm_value_label *labels;
132 struct sfm_document_record
139 struct sfm_extension_record
141 off_t pos; /* Starting offset in file. */
142 size_t size; /* Size of data elements. */
143 size_t count; /* Number of data elements. */
144 void *data; /* Contents. */
147 /* System file reader. */
150 /* Resource tracking. */
151 struct pool *pool; /* All system file state. */
152 jmp_buf bail_out; /* longjmp() target for error handling. */
155 struct file_handle *fh; /* File handle. */
156 struct fh_lock *lock; /* Mutual exclusion for file handle. */
157 FILE *file; /* File stream. */
158 off_t pos; /* Position in file. */
159 bool error; /* I/O or corruption error? */
160 struct caseproto *proto; /* Format of output cases. */
163 enum integer_format integer_format; /* On-disk integer format. */
164 enum float_format float_format; /* On-disk floating point format. */
165 struct sfm_var *sfm_vars; /* Variables. */
166 size_t sfm_var_cnt; /* Number of variables. */
167 casenumber case_cnt; /* Number of cases */
168 const char *encoding; /* String encoding. */
171 bool compressed; /* File is compressed? */
172 double bias; /* Compression bias, usually 100.0. */
173 uint8_t opcodes[8]; /* Current block of opcodes. */
174 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
175 bool corruption_warning; /* Warned about possible corruption? */
178 static const struct casereader_class sys_file_casereader_class;
180 static bool close_reader (struct sfm_reader *);
182 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
183 const struct sfm_var_record *,
186 static void sys_msg (struct sfm_reader *r, off_t, int class,
187 const char *format, va_list args)
188 PRINTF_FORMAT (4, 0);
189 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
190 PRINTF_FORMAT (3, 4);
191 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
195 static void read_bytes (struct sfm_reader *, void *, size_t);
196 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
197 static int read_int (struct sfm_reader *);
198 static double read_float (struct sfm_reader *);
199 static void read_string (struct sfm_reader *, char *, size_t);
200 static void skip_bytes (struct sfm_reader *, size_t);
202 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
203 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
205 static void read_variable_record (struct sfm_reader *,
206 struct sfm_var_record *);
207 static void read_value_label_record (struct sfm_reader *,
208 struct sfm_value_label_record *,
210 static struct sfm_document_record *read_document_record (struct sfm_reader *);
211 static struct sfm_extension_record *read_extension_record (
212 struct sfm_reader *, int subtype);
213 static void skip_extension_record (struct sfm_reader *, int subtype);
215 static const char *choose_encoding (
217 const struct sfm_header_record *,
218 const struct sfm_extension_record *ext_integer,
219 const struct sfm_extension_record *ext_encoding);
221 static struct text_record *open_text_record (
222 struct sfm_reader *, const struct sfm_extension_record *,
223 bool recode_to_utf8);
224 static void close_text_record (struct sfm_reader *,
225 struct text_record *);
226 static bool read_variable_to_value_pair (struct sfm_reader *,
228 struct text_record *,
229 struct variable **var, char **value);
230 static void text_warn (struct sfm_reader *r, struct text_record *text,
231 const char *format, ...)
232 PRINTF_FORMAT (3, 4);
233 static char *text_get_token (struct text_record *,
234 struct substring delimiters, char *delimiter);
235 static bool text_match (struct text_record *, char c);
236 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
237 struct text_record *,
238 struct substring delimiters,
240 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
241 struct text_record *,
242 struct substring delimiters,
244 static const char *text_parse_counted_string (struct sfm_reader *,
245 struct text_record *);
246 static size_t text_pos (const struct text_record *);
248 static bool close_reader (struct sfm_reader *r);
250 /* Dictionary reader. */
258 static void read_header (struct sfm_reader *, struct sfm_read_info *,
259 struct sfm_header_record *);
260 static void parse_header (struct sfm_reader *,
261 const struct sfm_header_record *,
262 struct sfm_read_info *, struct dictionary *);
263 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
264 struct sfm_var_record *, size_t n);
265 static void parse_format_spec (struct sfm_reader *, off_t pos,
266 unsigned int format, enum which_format,
267 struct variable *, int *format_warning_cnt);
268 static void parse_document (struct dictionary *, struct sfm_document_record *);
269 static void parse_display_parameters (struct sfm_reader *,
270 const struct sfm_extension_record *,
271 struct dictionary *);
272 static void parse_machine_integer_info (struct sfm_reader *,
273 const struct sfm_extension_record *,
274 struct sfm_read_info *);
275 static void parse_machine_float_info (struct sfm_reader *,
276 const struct sfm_extension_record *);
277 static void parse_mrsets (struct sfm_reader *,
278 const struct sfm_extension_record *,
279 struct dictionary *);
280 static void parse_long_var_name_map (struct sfm_reader *,
281 const struct sfm_extension_record *,
282 struct dictionary *);
283 static void parse_long_string_map (struct sfm_reader *,
284 const struct sfm_extension_record *,
285 struct dictionary *);
286 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
287 const struct sfm_var_record *,
289 const struct sfm_value_label_record *);
290 static void parse_data_file_attributes (struct sfm_reader *,
291 const struct sfm_extension_record *,
292 struct dictionary *);
293 static void parse_variable_attributes (struct sfm_reader *,
294 const struct sfm_extension_record *,
295 struct dictionary *);
296 static void parse_long_string_value_labels (struct sfm_reader *,
297 const struct sfm_extension_record *,
298 struct dictionary *);
300 /* Frees the strings inside INFO. */
302 sfm_read_info_destroy (struct sfm_read_info *info)
306 free (info->creation_date);
307 free (info->creation_time);
308 free (info->product);
312 /* Opens the system file designated by file handle FH for reading. Reads the
313 system file's dictionary into *DICT.
315 If INFO is non-null, then it receives additional info about the system file,
316 which the caller must eventually free with sfm_read_info_destroy() when it
317 is no longer needed. */
319 sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
320 struct sfm_read_info *infop)
322 struct sfm_reader *volatile r = NULL;
323 struct sfm_read_info info;
325 struct sfm_header_record header;
327 struct sfm_var_record *vars;
328 size_t n_vars, allocated_vars;
330 struct sfm_value_label_record *labels;
331 size_t n_labels, allocated_labels;
333 struct sfm_document_record *document;
335 struct sfm_extension_record *extensions[32];
337 struct dictionary *dict = NULL;
340 /* Create and initialize reader. */
341 r = pool_create_container (struct sfm_reader, pool);
347 r->opcode_idx = sizeof r->opcodes;
348 r->corruption_warning = false;
350 memset (&info, 0, sizeof info);
352 /* TRANSLATORS: this fragment will be interpolated into
353 messages in fh_lock() that identify types of files. */
354 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
358 r->file = fn_open (fh_get_file_name (fh), "rb");
361 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
362 fh_get_file_name (r->fh), strerror (errno));
366 if (setjmp (r->bail_out))
370 read_header (r, &info, &header);
373 n_vars = allocated_vars = 0;
376 n_labels = allocated_labels = 0;
380 memset (extensions, 0, sizeof extensions);
390 read_int (r); /* Skip filler. */
397 if (n_vars >= allocated_vars)
398 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
400 read_variable_record (r, &vars[n_vars++]);
404 if (n_labels >= allocated_labels)
405 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
407 read_value_label_record (r, &labels[n_labels++], n_vars);
411 /* A Type 4 record is always immediately after a type 3 record,
412 so the code for type 3 records reads the type 4 record too. */
413 sys_error (r, r->pos, _("Misplaced type 4 record."));
416 if (document != NULL)
417 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
418 document = read_document_record (r);
422 subtype = read_int (r);
423 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
426 _("Unrecognized record type 7, subtype %d. Please "
427 "send a copy of this file, and the syntax which "
428 "created it to %s."),
429 subtype, PACKAGE_BUGREPORT);
430 skip_extension_record (r, subtype);
432 else if (extensions[subtype] != NULL)
435 _("Record type 7, subtype %d found here has the same "
436 "type as the record found near offset 0x%llx. "
437 "Please send a copy of this file, and the syntax "
438 "which created it to %s."),
439 subtype, (long long int) extensions[subtype]->pos,
441 skip_extension_record (r, subtype);
444 extensions[subtype] = read_extension_record (r, subtype);
448 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
453 /* Now actually parse what we read.
455 First, figure out the correct character encoding, because this determines
456 how the rest of the header data is to be interpreted. */
457 dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER],
458 extensions[EXT_ENCODING]));
459 r->encoding = dict_get_encoding (dict);
461 /* These records don't use variables at all. */
462 if (document != NULL)
463 parse_document (dict, document);
465 if (extensions[EXT_INTEGER] != NULL)
466 parse_machine_integer_info (r, extensions[EXT_INTEGER], &info);
468 if (extensions[EXT_FLOAT] != NULL)
469 parse_machine_float_info (r, extensions[EXT_FLOAT]);
471 if (extensions[EXT_FILE_ATTRS] != NULL)
472 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
474 parse_header (r, &header, &info, dict);
476 /* Parse the variable records, the basis of almost everything else. */
477 parse_variable_records (r, dict, vars, n_vars);
479 /* Parse value labels and the weight variable immediately after the variable
480 records. These records use indexes into var_recs[], so we must parse them
481 before those indexes become invalidated by very long string variables. */
482 for (i = 0; i < n_labels; i++)
483 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
484 if (header.weight_idx != 0)
486 struct variable *weight_var;
488 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
490 if (var_is_numeric (weight_var))
491 dict_set_weight (dict, weight_var);
493 sys_error (r, -1, _("Weighting variable must be numeric "
494 "(not string variable `%s')."),
495 var_get_name (weight_var));
498 if (extensions[EXT_DISPLAY] != NULL)
499 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
501 /* The following records use short names, so they need to be parsed before
502 parse_long_var_name_map() changes short names to long names. */
503 if (extensions[EXT_MRSETS] != NULL)
504 parse_mrsets (r, extensions[EXT_MRSETS], dict);
506 if (extensions[EXT_MRSETS2] != NULL)
507 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
509 if (extensions[EXT_LONG_STRINGS] != NULL)
510 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
512 /* Now rename variables to their long names. */
513 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
515 /* The following records use long names, so they need to follow renaming. */
516 if (extensions[EXT_VAR_ATTRS] != NULL)
517 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
519 if (extensions[EXT_LONG_LABELS] != NULL)
520 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
522 /* Warn if the actual amount of data per case differs from the
523 amount that the header claims. SPSS version 13 gets this
524 wrong when very long strings are involved, so don't warn in
526 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
527 && info.version_major != 13)
528 sys_warn (r, -1, _("File header claims %d variable positions but "
529 "%zu were read from file."),
530 header.nominal_case_size, n_vars);
532 /* Create an index of dictionary variable widths for
533 sfm_read_case to use. We cannot use the `struct variable's
534 from the dictionary we created, because the caller owns the
535 dictionary and may destroy or modify its variables. */
536 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
537 pool_register (r->pool, free, r->sfm_vars);
538 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
544 sfm_read_info_destroy (&info);
546 return casereader_create_sequential
548 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
549 &sys_file_casereader_class, r);
552 sfm_read_info_destroy (&info);
559 /* Closes a system file after we're done with it.
560 Returns true if an I/O error has occurred on READER, false
563 close_reader (struct sfm_reader *r)
572 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
574 msg (ME, _("Error closing system file `%s': %s."),
575 fh_get_file_name (r->fh), strerror (errno));
585 pool_destroy (r->pool);
590 /* Destroys READER. */
592 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
594 struct sfm_reader *r = r_;
598 /* Returns true if FILE is an SPSS system file,
601 sfm_detect (FILE *file)
605 if (fread (magic, 4, 1, file) != 1)
609 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
612 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
613 except for the string fields in *INFO, which parse_header() will initialize
614 later once the file's encoding is known. */
616 read_header (struct sfm_reader *r, struct sfm_read_info *info,
617 struct sfm_header_record *header)
619 uint8_t raw_layout_code[4];
622 read_string (r, header->magic, sizeof header->magic);
623 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
625 if (strcmp (ASCII_MAGIC, header->magic)
626 && strcmp (EBCDIC_MAGIC, header->magic))
627 sys_error (r, 0, _("This is not an SPSS system file."));
629 /* Identify integer format. */
630 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
631 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
633 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
635 || (r->integer_format != INTEGER_MSB_FIRST
636 && r->integer_format != INTEGER_LSB_FIRST))
637 sys_error (r, 64, _("This is not an SPSS system file."));
639 header->nominal_case_size = read_int (r);
640 if (header->nominal_case_size < 0
641 || header->nominal_case_size > INT_MAX / 16)
642 header->nominal_case_size = -1;
644 r->compressed = read_int (r) != 0;
646 header->weight_idx = read_int (r);
648 r->case_cnt = read_int (r);
649 if ( r->case_cnt > INT_MAX / 2)
652 /* Identify floating-point format and obtain compression bias. */
653 read_bytes (r, raw_bias, sizeof raw_bias);
654 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
656 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
658 if (memcmp (raw_bias, zero_bias, 8))
659 sys_warn (r, r->pos - 8,
660 _("Compression bias is not the usual "
661 "value of 100, or system file uses unrecognized "
662 "floating-point format."));
665 /* Some software is known to write all-zeros to this
666 field. Such software also writes floating-point
667 numbers in the format that we expect by default
668 (it seems that all software most likely does, in
669 reality), so don't warn in this case. */
672 if (r->integer_format == INTEGER_MSB_FIRST)
673 r->float_format = FLOAT_IEEE_DOUBLE_BE;
675 r->float_format = FLOAT_IEEE_DOUBLE_LE;
677 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
679 read_string (r, header->creation_date, sizeof header->creation_date);
680 read_string (r, header->creation_time, sizeof header->creation_time);
681 read_string (r, header->file_label, sizeof header->file_label);
684 info->integer_format = r->integer_format;
685 info->float_format = r->float_format;
686 info->compressed = r->compressed;
687 info->case_cnt = r->case_cnt;
690 /* Reads a variable (type 2) record from R into RECORD. */
692 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
694 int has_variable_label;
696 memset (record, 0, sizeof *record);
698 record->pos = r->pos;
699 record->width = read_int (r);
700 has_variable_label = read_int (r);
701 record->missing_value_code = read_int (r);
702 record->print_format = read_int (r);
703 record->write_format = read_int (r);
704 read_bytes (r, record->name, sizeof record->name);
706 if (has_variable_label == 1)
708 enum { MAX_LABEL_LEN = 255 };
709 size_t len, read_len;
713 /* Read up to MAX_LABEL_LEN bytes of label. */
714 read_len = MIN (MAX_LABEL_LEN, len);
715 record->label = pool_malloc (r->pool, read_len + 1);
716 read_string (r, record->label, read_len + 1);
718 /* Skip unread label bytes. */
719 skip_bytes (r, len - read_len);
721 /* Skip label padding up to multiple of 4 bytes. */
722 skip_bytes (r, ROUND_UP (len, 4) - len);
724 else if (has_variable_label != 0)
725 sys_error (r, record->pos,
726 _("Variable label indicator field is not 0 or 1."));
728 /* Set missing values. */
729 if (record->missing_value_code != 0)
731 int code = record->missing_value_code;
732 if (record->width == 0)
734 if (code < -3 || code > 3 || code == -1)
735 sys_error (r, record->pos,
736 _("Numeric missing value indicator field is not "
737 "-3, -2, 0, 1, 2, or 3."));
741 if (code < 1 || code > 3)
742 sys_error (r, record->pos,
743 _("String missing value indicator field is not "
747 read_bytes (r, record->missing, 8 * abs (code));
751 /* Reads value labels from R into RECORD. */
753 read_value_label_record (struct sfm_reader *r,
754 struct sfm_value_label_record *record,
759 /* Read type 3 record. */
760 record->pos = r->pos;
761 record->n_labels = read_int (r);
762 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
763 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
765 record->labels = pool_nmalloc (r->pool, record->n_labels,
766 sizeof *record->labels);
767 for (i = 0; i < record->n_labels; i++)
769 struct sfm_value_label *label = &record->labels[i];
770 unsigned char label_len;
773 read_bytes (r, label->value, sizeof label->value);
775 /* Read label length. */
776 read_bytes (r, &label_len, sizeof label_len);
777 padded_len = ROUND_UP (label_len + 1, 8);
779 /* Read label, padding. */
780 label->label = pool_malloc (r->pool, padded_len + 1);
781 read_bytes (r, label->label, padded_len - 1);
782 label->label[label_len] = '\0';
785 /* Read record type of type 4 record. */
786 if (read_int (r) != 4)
787 sys_error (r, r->pos - 4,
788 _("Variable index record (type 4) does not immediately "
789 "follow value label record (type 3) as it should."));
791 /* Read number of variables associated with value label from type 4
793 record->n_vars = read_int (r);
794 if (record->n_vars < 1 || record->n_vars > n_vars)
795 sys_error (r, r->pos - 4,
796 _("Number of variables associated with a value label (%zu) "
797 "is not between 1 and the number of variables (%zu)."),
798 record->n_vars, n_vars);
799 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
800 for (i = 0; i < record->n_vars; i++)
801 record->vars[i] = read_int (r);
804 /* Reads a document record from R and returns it. */
805 static struct sfm_document_record *
806 read_document_record (struct sfm_reader *r)
808 struct sfm_document_record *record;
811 record = pool_malloc (r->pool, sizeof *record);
812 record->pos = r->pos;
814 n_lines = read_int (r);
815 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
816 sys_error (r, record->pos,
817 _("Number of document lines (%d) "
818 "must be greater than 0 and less than %d."),
819 n_lines, INT_MAX / DOC_LINE_LENGTH);
821 record->n_lines = n_lines;
822 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
823 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
829 read_extension_record_header (struct sfm_reader *r, int subtype,
830 struct sfm_extension_record *record)
832 record->pos = r->pos;
833 record->size = read_int (r);
834 record->count = read_int (r);
836 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
837 allows an extra byte for a null terminator, used by some
838 extension processing routines. */
839 if (record->size != 0
840 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
841 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
844 /* Reads an extension record from R into RECORD. */
845 static struct sfm_extension_record *
846 read_extension_record (struct sfm_reader *r, int subtype)
848 struct extension_record_type
855 static const struct extension_record_type types[] =
857 /* Implemented record types. */
858 { EXT_INTEGER, 4, 8 },
860 { EXT_MRSETS, 1, 0 },
861 { EXT_DISPLAY, 4, 0 },
862 { EXT_LONG_NAMES, 1, 0 },
863 { EXT_LONG_STRINGS, 1, 0 },
864 { EXT_NCASES, 8, 2 },
865 { EXT_FILE_ATTRS, 1, 0 },
866 { EXT_VAR_ATTRS, 1, 0 },
867 { EXT_MRSETS2, 1, 0 },
868 { EXT_ENCODING, 1, 0 },
869 { EXT_LONG_LABELS, 1, 0 },
871 /* Ignored record types. */
872 { EXT_VAR_SETS, 0, 0 },
874 { EXT_DATA_ENTRY, 0, 0 },
877 const struct extension_record_type *type;
878 struct sfm_extension_record *record;
881 record = pool_malloc (r->pool, sizeof *record);
882 read_extension_record_header (r, subtype, record);
883 n_bytes = record->count * record->size;
885 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
886 if (subtype == type->subtype)
888 if (type->size > 0 && record->size != type->size)
889 sys_warn (r, record->pos,
890 _("Record type 7, subtype %d has bad size %zu "
891 "(expected %d)."), subtype, record->size, type->size);
892 else if (type->count > 0 && record->count != type->count)
893 sys_warn (r, record->pos,
894 _("Record type 7, subtype %d has bad count %zu "
895 "(expected %d)."), subtype, record->count, type->count);
896 else if (type->count == 0 && type->size == 0)
898 /* Ignore this record. */
902 char *data = pool_malloc (r->pool, n_bytes + 1);
903 data[n_bytes] = '\0';
906 read_bytes (r, record->data, n_bytes);
913 sys_warn (r, record->pos,
914 _("Unrecognized record type 7, subtype %d. Please send a "
915 "copy of this file, and the syntax which created it to %s."),
916 subtype, PACKAGE_BUGREPORT);
919 skip_bytes (r, n_bytes);
924 skip_extension_record (struct sfm_reader *r, int subtype)
926 struct sfm_extension_record record;
928 read_extension_record_header (r, subtype, &record);
929 skip_bytes (r, record.count * record.size);
933 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
934 struct sfm_read_info *info, struct dictionary *dict)
936 const char *dict_encoding = dict_get_encoding (dict);
937 struct substring product;
938 struct substring label;
940 /* Convert file label to UTF-8 and put it into DICT. */
941 label = recode_substring_pool ("UTF-8", dict_encoding,
942 ss_cstr (header->file_label), r->pool);
943 ss_trim (&label, ss_cstr (" "));
944 label.string[label.length] = '\0';
945 dict_set_label (dict, label.string);
947 /* Put creation date and time in UTF-8 into INFO. */
948 info->creation_date = recode_string ("UTF-8", dict_encoding,
949 header->creation_date, -1);
950 info->creation_time = recode_string ("UTF-8", dict_encoding,
951 header->creation_time, -1);
953 /* Put product name into INFO, dropping eye-catcher string if present. */
954 product = recode_substring_pool ("UTF-8", dict_encoding,
955 ss_cstr (header->eye_catcher), r->pool);
956 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
957 ss_trim (&product, ss_cstr (" "));
958 info->product = ss_xstrdup (product);
961 /* Reads a variable (type 2) record from R and adds the
962 corresponding variable to DICT.
963 Also skips past additional variable records for long string
966 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
967 struct sfm_var_record *var_recs, size_t n_var_recs)
969 const char *dict_encoding = dict_get_encoding (dict);
970 struct sfm_var_record *rec;
973 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
975 struct variable *var;
980 name = recode_string_pool ("UTF-8", dict_encoding,
981 rec->name, 8, r->pool);
982 name[strcspn (name, " ")] = '\0';
984 if (!dict_id_is_valid (dict, name, false)
985 || name[0] == '$' || name[0] == '#')
986 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
988 if (rec->width < 0 || rec->width > 255)
989 sys_error (r, rec->pos,
990 _("Bad width %d for variable %s."), rec->width, name);
992 var = rec->var = dict_create_var (dict, name, rec->width);
994 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
996 /* Set the short name the same as the long name. */
997 var_set_short_name (var, 0, name);
999 /* Get variable label, if any. */
1004 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1005 rec->label, -1, r->pool);
1006 var_set_label (var, utf8_label, false);
1009 /* Set missing values. */
1010 if (rec->missing_value_code != 0)
1012 int width = var_get_width (var);
1013 struct missing_values mv;
1015 mv_init_pool (r->pool, &mv, width);
1016 if (var_is_numeric (var))
1018 bool has_range = rec->missing_value_code < 0;
1019 int n_discrete = (has_range
1020 ? rec->missing_value_code == -3
1021 : rec->missing_value_code);
1026 double low = parse_float (r, rec->missing, 0);
1027 double high = parse_float (r, rec->missing, 8);
1028 mv_add_range (&mv, low, high);
1032 for (i = 0; i < n_discrete; i++)
1034 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1042 value_init_pool (r->pool, &value, width);
1043 value_set_missing (&value, width);
1044 for (i = 0; i < rec->missing_value_code; i++)
1046 uint8_t *s = value_str_rw (&value, width);
1047 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1048 mv_add_str (&mv, s);
1051 var_set_missing_values (var, &mv);
1055 parse_format_spec (r, rec->pos + 12, rec->print_format,
1056 PRINT_FORMAT, var, &n_warnings);
1057 parse_format_spec (r, rec->pos + 16, rec->write_format,
1058 WRITE_FORMAT, var, &n_warnings);
1060 /* Account for values.
1061 Skip long string continuation records, if any. */
1062 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1063 for (i = 1; i < n_values; i++)
1064 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1065 sys_error (r, rec->pos, _("Missing string continuation record."));
1070 /* Translates the format spec from sysfile format to internal
1073 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1074 enum which_format which, struct variable *v,
1077 const int max_warnings = 8;
1078 uint8_t raw_type = format >> 16;
1079 uint8_t w = format >> 8;
1088 ok = (fmt_from_io (raw_type, &f.type)
1089 && fmt_check_output (&f)
1090 && fmt_check_width_compat (&f, var_get_width (v)));
1095 if (which == PRINT_FORMAT)
1096 var_set_print_format (v, &f);
1098 var_set_write_format (v, &f);
1100 else if (format == 0)
1102 /* Actually observed in the wild. No point in warning about it. */
1104 else if (++*n_warnings <= max_warnings)
1106 if (which == PRINT_FORMAT)
1107 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1109 var_get_name (v), var_get_width (v), format);
1111 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1113 var_get_name (v), var_get_width (v), format);
1115 if (*n_warnings == max_warnings)
1116 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1121 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1125 for (p = record->documents;
1126 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1127 p += DOC_LINE_LENGTH)
1129 struct substring line;
1131 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1132 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1133 ss_rtrim (&line, ss_cstr (" "));
1134 line.string[line.length] = '\0';
1136 dict_add_document_line (dict, line.string, false);
1142 /* Parses record type 7, subtype 3. */
1144 parse_machine_integer_info (struct sfm_reader *r,
1145 const struct sfm_extension_record *record,
1146 struct sfm_read_info *info)
1148 int float_representation, expected_float_format;
1149 int integer_representation, expected_integer_format;
1151 /* Save version info. */
1152 info->version_major = parse_int (r, record->data, 0);
1153 info->version_minor = parse_int (r, record->data, 4);
1154 info->version_revision = parse_int (r, record->data, 8);
1156 /* Check floating point format. */
1157 float_representation = parse_int (r, record->data, 16);
1158 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1159 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1160 expected_float_format = 1;
1161 else if (r->float_format == FLOAT_Z_LONG)
1162 expected_float_format = 2;
1163 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1164 expected_float_format = 3;
1167 if (float_representation != expected_float_format)
1168 sys_error (r, record->pos, _("Floating-point representation indicated by "
1169 "system file (%d) differs from expected (%d)."),
1170 float_representation, expected_float_format);
1172 /* Check integer format. */
1173 integer_representation = parse_int (r, record->data, 24);
1174 if (r->integer_format == INTEGER_MSB_FIRST)
1175 expected_integer_format = 1;
1176 else if (r->integer_format == INTEGER_LSB_FIRST)
1177 expected_integer_format = 2;
1180 if (integer_representation != expected_integer_format)
1181 sys_warn (r, record->pos,
1182 _("Integer format indicated by system file (%d) "
1183 "differs from expected (%d)."),
1184 integer_representation, expected_integer_format);
1189 choose_encoding (struct sfm_reader *r,
1190 const struct sfm_header_record *header,
1191 const struct sfm_extension_record *ext_integer,
1192 const struct sfm_extension_record *ext_encoding)
1194 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1197 return ext_encoding->data;
1199 /* But EXT_INTEGER is better than nothing as a fallback. */
1202 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1203 const char *encoding;
1212 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1213 respectively. However, there are known to be many files in the wild
1214 with character code 2, yet have data which are clearly not ASCII.
1215 Therefore we ignore these values. */
1222 encoding = sys_get_encoding_from_codepage (codepage);
1223 if (encoding != NULL)
1229 /* If the file magic number is EBCDIC then its character data is too. */
1230 if (!strcmp (header->magic, EBCDIC_MAGIC))
1233 return locale_charset ();
1236 /* Parses record type 7, subtype 4. */
1238 parse_machine_float_info (struct sfm_reader *r,
1239 const struct sfm_extension_record *record)
1241 double sysmis = parse_float (r, record->data, 0);
1242 double highest = parse_float (r, record->data, 8);
1243 double lowest = parse_float (r, record->data, 16);
1245 if (sysmis != SYSMIS)
1246 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1249 if (highest != HIGHEST)
1250 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1251 highest, "HIGHEST");
1253 if (lowest != LOWEST)
1254 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1258 /* Parses record type 7, subtype 7 or 19. */
1260 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1261 struct dictionary *dict)
1263 struct text_record *text;
1264 struct mrset *mrset;
1266 text = open_text_record (r, record, false);
1269 const char *counted = NULL;
1272 struct stringi_set var_names;
1273 size_t allocated_vars;
1277 mrset = xzalloc (sizeof *mrset);
1279 name = text_get_token (text, ss_cstr ("="), NULL);
1282 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1284 if (mrset->name[0] != '$')
1286 sys_warn (r, record->pos,
1287 _("`%s' does not begin with `$' at offset %zu "
1288 "in MRSETS record."), mrset->name, text_pos (text));
1292 if (text_match (text, 'C'))
1294 mrset->type = MRSET_MC;
1295 if (!text_match (text, ' '))
1297 sys_warn (r, record->pos,
1298 _("Missing space following `%c' at offset %zu "
1299 "in MRSETS record."), 'C', text_pos (text));
1303 else if (text_match (text, 'D'))
1305 mrset->type = MRSET_MD;
1306 mrset->cat_source = MRSET_VARLABELS;
1308 else if (text_match (text, 'E'))
1312 mrset->type = MRSET_MD;
1313 mrset->cat_source = MRSET_COUNTEDVALUES;
1314 if (!text_match (text, ' '))
1316 sys_warn (r, record->pos,
1317 _("Missing space following `%c' at offset %zu "
1318 "in MRSETS record."), 'E', text_pos (text));
1322 number = text_get_token (text, ss_cstr (" "), NULL);
1323 if (!strcmp (number, "11"))
1324 mrset->label_from_var_label = true;
1325 else if (strcmp (number, "1"))
1326 sys_warn (r, record->pos,
1327 _("Unexpected label source value `%s' following `E' "
1328 "at offset %zu in MRSETS record."),
1329 number, text_pos (text));
1333 sys_warn (r, record->pos,
1334 _("Missing `C', `D', or `E' at offset %zu "
1335 "in MRSETS record."),
1340 if (mrset->type == MRSET_MD)
1342 counted = text_parse_counted_string (r, text);
1343 if (counted == NULL)
1347 label = text_parse_counted_string (r, text);
1350 if (label[0] != '\0')
1351 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1353 stringi_set_init (&var_names);
1358 const char *raw_var_name;
1359 struct variable *var;
1362 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1363 if (raw_var_name == NULL)
1365 sys_warn (r, record->pos,
1366 _("Missing new-line parsing variable names "
1367 "at offset %zu in MRSETS record."),
1371 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1373 var = dict_lookup_var (dict, var_name);
1379 if (!stringi_set_insert (&var_names, var_name))
1381 sys_warn (r, record->pos,
1382 _("Duplicate variable name %s "
1383 "at offset %zu in MRSETS record."),
1384 var_name, text_pos (text));
1390 if (mrset->label == NULL && mrset->label_from_var_label
1391 && var_has_label (var))
1392 mrset->label = xstrdup (var_get_label (var));
1395 && var_get_type (var) != var_get_type (mrset->vars[0]))
1397 sys_warn (r, record->pos,
1398 _("MRSET %s contains both string and "
1399 "numeric variables."), name);
1402 width = MIN (width, var_get_width (var));
1404 if (mrset->n_vars >= allocated_vars)
1405 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1406 sizeof *mrset->vars);
1407 mrset->vars[mrset->n_vars++] = var;
1409 while (delimiter != '\n');
1411 if (mrset->n_vars < 2)
1413 sys_warn (r, record->pos,
1414 _("MRSET %s has only %zu variables."), mrset->name,
1416 mrset_destroy (mrset);
1420 if (mrset->type == MRSET_MD)
1422 mrset->width = width;
1423 value_init (&mrset->counted, width);
1425 mrset->counted.f = strtod (counted, NULL);
1427 value_copy_str_rpad (&mrset->counted, width,
1428 (const uint8_t *) counted, ' ');
1431 dict_add_mrset (dict, mrset);
1433 stringi_set_destroy (&var_names);
1435 mrset_destroy (mrset);
1436 close_text_record (r, text);
1439 /* Read record type 7, subtype 11, which specifies how variables
1440 should be displayed in GUI environments. */
1442 parse_display_parameters (struct sfm_reader *r,
1443 const struct sfm_extension_record *record,
1444 struct dictionary *dict)
1446 bool includes_width;
1447 bool warned = false;
1452 n_vars = dict_get_var_cnt (dict);
1453 if (record->count == 3 * n_vars)
1454 includes_width = true;
1455 else if (record->count == 2 * n_vars)
1456 includes_width = false;
1459 sys_warn (r, record->pos,
1460 _("Extension 11 has bad count %zu (for %zu variables)."),
1461 record->count, n_vars);
1466 for (i = 0; i < n_vars; ++i)
1468 struct variable *v = dict_get_var (dict, i);
1469 int measure, width, align;
1471 measure = parse_int (r, record->data, ofs);
1476 width = parse_int (r, record->data, ofs);
1482 align = parse_int (r, record->data, ofs);
1485 /* SPSS 14 sometimes seems to set string variables' measure
1487 if (0 == measure && var_is_alpha (v))
1490 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1493 sys_warn (r, record->pos,
1494 _("Invalid variable display parameters for variable "
1495 "%zu (%s). Default parameters substituted."),
1496 i, var_get_name (v));
1501 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1502 : measure == 2 ? MEASURE_ORDINAL
1504 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1505 : align == 1 ? ALIGN_RIGHT
1508 /* Older versions (SPSS 9.0) sometimes set the display
1509 width to zero. This causes confusion in the GUI, so
1510 only set the width if it is nonzero. */
1512 var_set_display_width (v, width);
1517 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1518 const char *new_name)
1520 size_t n_short_names;
1524 /* Renaming a variable may clear its short names, but we
1525 want to retain them, so we save them and re-set them
1527 n_short_names = var_get_short_name_cnt (var);
1528 short_names = xnmalloc (n_short_names, sizeof *short_names);
1529 for (i = 0; i < n_short_names; i++)
1531 const char *s = var_get_short_name (var, i);
1532 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1535 /* Set long name. */
1536 dict_rename_var (dict, var, new_name);
1538 /* Restore short names. */
1539 for (i = 0; i < n_short_names; i++)
1541 var_set_short_name (var, i, short_names[i]);
1542 free (short_names[i]);
1547 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1548 to each short name. Modifies variable names in DICT accordingly. */
1550 parse_long_var_name_map (struct sfm_reader *r,
1551 const struct sfm_extension_record *record,
1552 struct dictionary *dict)
1554 struct text_record *text;
1555 struct variable *var;
1560 /* Convert variable names to lowercase. */
1563 for (i = 0; i < dict_get_var_cnt (dict); i++)
1565 struct variable *var = dict_get_var (dict, i);
1568 new_name = xstrdup (var_get_name (var));
1569 str_lowercase (new_name);
1571 rename_var_and_save_short_names (dict, var, new_name);
1579 /* Rename each of the variables, one by one. (In a correctly constructed
1580 system file, this cannot create any intermediate duplicate variable names,
1581 because all of the new variable names are longer than any of the old
1582 variable names and thus there cannot be any overlaps.) */
1583 text = open_text_record (r, record, true);
1584 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1586 /* Validate long name. */
1587 /* XXX need to reencode name to UTF-8 */
1588 if (!dict_id_is_valid (dict, long_name, false))
1590 sys_warn (r, record->pos,
1591 _("Long variable mapping from %s to invalid "
1592 "variable name `%s'."),
1593 var_get_name (var), long_name);
1597 /* Identify any duplicates. */
1598 if (strcasecmp (var_get_short_name (var, 0), long_name)
1599 && dict_lookup_var (dict, long_name) != NULL)
1601 sys_warn (r, record->pos,
1602 _("Duplicate long variable name `%s'."), long_name);
1606 rename_var_and_save_short_names (dict, var, long_name);
1608 close_text_record (r, text);
1611 /* Reads record type 7, subtype 14, which gives the real length
1612 of each very long string. Rearranges DICT accordingly. */
1614 parse_long_string_map (struct sfm_reader *r,
1615 const struct sfm_extension_record *record,
1616 struct dictionary *dict)
1618 struct text_record *text;
1619 struct variable *var;
1622 text = open_text_record (r, record, true);
1623 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1625 size_t idx = var_get_dict_index (var);
1631 length = strtol (length_s, NULL, 10);
1632 if (length < 1 || length > MAX_STRING)
1634 sys_warn (r, record->pos,
1635 _("%s listed as string of invalid length %s "
1636 "in very long string record."),
1637 var_get_name (var), length_s);
1641 /* Check segments. */
1642 segment_cnt = sfm_width_to_segments (length);
1643 if (segment_cnt == 1)
1645 sys_warn (r, record->pos,
1646 _("%s listed in very long string record with width %s, "
1647 "which requires only one segment."),
1648 var_get_name (var), length_s);
1651 if (idx + segment_cnt > dict_get_var_cnt (dict))
1652 sys_error (r, record->pos,
1653 _("Very long string %s overflows dictionary."),
1654 var_get_name (var));
1656 /* Get the short names from the segments and check their
1658 for (i = 0; i < segment_cnt; i++)
1660 struct variable *seg = dict_get_var (dict, idx + i);
1661 int alloc_width = sfm_segment_alloc_width (length, i);
1662 int width = var_get_width (seg);
1665 var_set_short_name (var, i, var_get_short_name (seg, 0));
1666 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1667 sys_error (r, record->pos,
1668 _("Very long string with width %ld has segment %d "
1669 "of width %d (expected %d)."),
1670 length, i, width, alloc_width);
1672 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1673 var_set_width (var, length);
1675 close_text_record (r, text);
1676 dict_compact_values (dict);
1680 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1681 const struct sfm_var_record *var_recs, size_t n_var_recs,
1682 const struct sfm_value_label_record *record)
1684 struct variable **vars;
1688 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1689 for (i = 0; i < record->n_labels; i++)
1690 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1691 record->labels[i].label, -1,
1694 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1695 for (i = 0; i < record->n_vars; i++)
1696 vars[i] = lookup_var_by_index (r, record->pos,
1697 var_recs, n_var_recs, record->vars[i]);
1699 for (i = 1; i < record->n_vars; i++)
1700 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1701 sys_error (r, record->pos,
1702 _("Variables associated with value label are not all of "
1703 "identical type. Variable %s is %s, but variable "
1705 var_get_name (vars[0]),
1706 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1707 var_get_name (vars[i]),
1708 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1710 for (i = 0; i < record->n_vars; i++)
1712 struct variable *var = vars[i];
1716 width = var_get_width (var);
1718 sys_error (r, record->pos,
1719 _("Value labels may not be added to long string "
1720 "variables (e.g. %s) using records types 3 and 4."),
1721 var_get_name (var));
1723 for (j = 0; j < record->n_labels; j++)
1725 struct sfm_value_label *label = &record->labels[j];
1728 value_init (&value, width);
1730 value.f = parse_float (r, label->value, 0);
1732 memcpy (value_str_rw (&value, width), label->value, width);
1734 if (!var_add_value_label (var, &value, utf8_labels[j]))
1736 if (var_is_numeric (var))
1737 sys_warn (r, record->pos,
1738 _("Duplicate value label for %g on %s."),
1739 value.f, var_get_name (var));
1741 sys_warn (r, record->pos,
1742 _("Duplicate value label for `%.*s' on %s."),
1743 width, value_str (&value, width),
1744 var_get_name (var));
1747 value_destroy (&value, width);
1751 pool_free (r->pool, vars);
1752 for (i = 0; i < record->n_labels; i++)
1753 pool_free (r->pool, utf8_labels[i]);
1754 pool_free (r->pool, utf8_labels);
1757 static struct variable *
1758 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1759 const struct sfm_var_record *var_recs, size_t n_var_recs,
1762 const struct sfm_var_record *rec;
1764 if (idx < 1 || idx > n_var_recs)
1766 sys_error (r, offset,
1767 _("Variable index %d not in valid range 1...%zu."),
1772 rec = &var_recs[idx - 1];
1773 if (rec->var == NULL)
1775 sys_error (r, offset,
1776 _("Variable index %d refers to long string continuation."),
1784 /* Parses a set of custom attributes from TEXT into ATTRS.
1785 ATTRS may be a null pointer, in which case the attributes are
1786 read but discarded. */
1788 parse_attributes (struct sfm_reader *r, struct text_record *text,
1789 struct attrset *attrs)
1793 struct attribute *attr;
1797 /* Parse the key. */
1798 key = text_get_token (text, ss_cstr ("("), NULL);
1802 attr = attribute_create (key);
1803 for (index = 1; ; index++)
1805 /* Parse the value. */
1809 value = text_get_token (text, ss_cstr ("\n"), NULL);
1812 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1817 length = strlen (value);
1818 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1820 value[length - 1] = '\0';
1821 attribute_add_value (attr, value + 1);
1826 _("Attribute value %s[%d] is not quoted: %s."),
1828 attribute_add_value (attr, value);
1831 /* Was this the last value for this attribute? */
1832 if (text_match (text, ')'))
1836 attrset_add (attrs, attr);
1838 attribute_destroy (attr);
1840 while (!text_match (text, '/'));
1843 /* Reads record type 7, subtype 17, which lists custom
1844 attributes on the data file. */
1846 parse_data_file_attributes (struct sfm_reader *r,
1847 const struct sfm_extension_record *record,
1848 struct dictionary *dict)
1850 struct text_record *text = open_text_record (r, record, true);
1851 parse_attributes (r, text, dict_get_attributes (dict));
1852 close_text_record (r, text);
1855 /* Parses record type 7, subtype 18, which lists custom
1856 attributes on individual variables. */
1858 parse_variable_attributes (struct sfm_reader *r,
1859 const struct sfm_extension_record *record,
1860 struct dictionary *dict)
1862 struct text_record *text;
1863 struct variable *var;
1865 text = open_text_record (r, record, true);
1866 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1867 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1868 close_text_record (r, text);
1872 check_overflow (struct sfm_reader *r,
1873 const struct sfm_extension_record *record,
1874 size_t ofs, size_t length)
1876 size_t end = record->size * record->count;
1877 if (length >= end || ofs + length > end)
1878 sys_error (r, record->pos + end,
1879 _("Long string value label record ends unexpectedly."));
1883 parse_long_string_value_labels (struct sfm_reader *r,
1884 const struct sfm_extension_record *record,
1885 struct dictionary *dict)
1887 const char *dict_encoding = dict_get_encoding (dict);
1888 size_t end = record->size * record->count;
1895 struct variable *var;
1900 /* Parse variable name length. */
1901 check_overflow (r, record, ofs, 4);
1902 var_name_len = parse_int (r, record->data, ofs);
1905 /* Parse variable name, width, and number of labels. */
1906 check_overflow (r, record, ofs, var_name_len + 8);
1907 var_name = recode_string_pool ("UTF-8", dict_encoding,
1908 (const char *) record->data + ofs,
1909 var_name_len, r->pool);
1910 width = parse_int (r, record->data, ofs + var_name_len);
1911 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1912 ofs += var_name_len + 8;
1914 /* Look up 'var' and validate. */
1915 var = dict_lookup_var (dict, var_name);
1917 sys_warn (r, record->pos + ofs,
1918 _("Ignoring long string value record for "
1919 "unknown variable %s."), var_name);
1920 else if (var_is_numeric (var))
1922 sys_warn (r, record->pos + ofs,
1923 _("Ignoring long string value record for "
1924 "numeric variable %s."), var_name);
1927 else if (width != var_get_width (var))
1929 sys_warn (r, record->pos + ofs,
1930 _("Ignoring long string value record for variable %s "
1931 "because the record's width (%d) does not match the "
1932 "variable's width (%d)."),
1933 var_name, width, var_get_width (var));
1938 value_init_pool (r->pool, &value, width);
1939 for (i = 0; i < n_labels; i++)
1941 size_t value_length, label_length;
1942 bool skip = var == NULL;
1944 /* Parse value length. */
1945 check_overflow (r, record, ofs, 4);
1946 value_length = parse_int (r, record->data, ofs);
1950 check_overflow (r, record, ofs, value_length);
1953 if (value_length == width)
1954 memcpy (value_str_rw (&value, width),
1955 (const uint8_t *) record->data + ofs, width);
1958 sys_warn (r, record->pos + ofs,
1959 _("Ignoring long string value %zu for variable "
1960 "%s, with width %d, that has bad value "
1962 i, var_get_name (var), width, value_length);
1966 ofs += value_length;
1968 /* Parse label length. */
1969 check_overflow (r, record, ofs, 4);
1970 label_length = parse_int (r, record->data, ofs);
1974 check_overflow (r, record, ofs, label_length);
1979 label = recode_string_pool ("UTF-8", dict_encoding,
1980 (const char *) record->data + ofs,
1981 label_length, r->pool);
1982 if (!var_add_value_label (var, &value, label))
1983 sys_warn (r, record->pos + ofs,
1984 _("Duplicate value label for `%.*s' on %s."),
1985 width, value_str (&value, width),
1986 var_get_name (var));
1987 pool_free (r->pool, label);
1989 ofs += label_length;
1996 static void partial_record (struct sfm_reader *r)
1999 static void read_error (struct casereader *, const struct sfm_reader *);
2001 static bool read_case_number (struct sfm_reader *, double *);
2002 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2003 static int read_opcode (struct sfm_reader *);
2004 static bool read_compressed_number (struct sfm_reader *, double *);
2005 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2006 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2007 static bool skip_whole_strings (struct sfm_reader *, size_t);
2009 /* Reads and returns one case from READER's file. Returns a null
2010 pointer if not successful. */
2011 static struct ccase *
2012 sys_file_casereader_read (struct casereader *reader, void *r_)
2014 struct sfm_reader *r = r_;
2015 struct ccase *volatile c;
2021 c = case_create (r->proto);
2022 if (setjmp (r->bail_out))
2024 casereader_force_error (reader);
2029 for (i = 0; i < r->sfm_var_cnt; i++)
2031 struct sfm_var *sv = &r->sfm_vars[i];
2032 union value *v = case_data_rw_idx (c, sv->case_index);
2034 if (sv->var_width == 0)
2036 if (!read_case_number (r, &v->f))
2041 uint8_t *s = value_str_rw (v, sv->var_width);
2042 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2044 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2053 if (r->case_cnt != -1)
2054 read_error (reader, r);
2059 /* Issues an error that R ends in a partial record. */
2061 partial_record (struct sfm_reader *r)
2063 sys_error (r, r->pos, _("File ends in partial case."));
2066 /* Issues an error that an unspecified error occurred SFM, and
2069 read_error (struct casereader *r, const struct sfm_reader *sfm)
2071 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2072 casereader_force_error (r);
2075 /* Reads a number from R and stores its value in *D.
2076 If R is compressed, reads a compressed number;
2077 otherwise, reads a number in the regular way.
2078 Returns true if successful, false if end of file is
2079 reached immediately. */
2081 read_case_number (struct sfm_reader *r, double *d)
2086 if (!try_read_bytes (r, number, sizeof number))
2088 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2092 return read_compressed_number (r, d);
2095 /* Reads LENGTH string bytes from R into S.
2096 Always reads a multiple of 8 bytes; if LENGTH is not a
2097 multiple of 8, then extra bytes are read and discarded without
2099 Reads compressed strings if S is compressed.
2100 Returns true if successful, false if end of file is
2101 reached immediately. */
2103 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2105 size_t whole = ROUND_DOWN (length, 8);
2106 size_t partial = length % 8;
2110 if (!read_whole_strings (r, s, whole))
2117 if (!read_whole_strings (r, bounce, sizeof bounce))
2123 memcpy (s + whole, bounce, partial);
2129 /* Reads and returns the next compression opcode from R. */
2131 read_opcode (struct sfm_reader *r)
2133 assert (r->compressed);
2137 if (r->opcode_idx >= sizeof r->opcodes)
2139 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2143 opcode = r->opcodes[r->opcode_idx++];
2150 /* Reads a compressed number from R and stores its value in D.
2151 Returns true if successful, false if end of file is
2152 reached immediately. */
2154 read_compressed_number (struct sfm_reader *r, double *d)
2156 int opcode = read_opcode (r);
2164 *d = read_float (r);
2168 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2169 if (!r->corruption_warning)
2171 r->corruption_warning = true;
2172 sys_warn (r, r->pos,
2173 _("Possible compressed data corruption: "
2174 "compressed spaces appear in numeric field."));
2183 *d = opcode - r->bias;
2190 /* Reads a compressed 8-byte string segment from R and stores it
2192 Returns true if successful, false if end of file is
2193 reached immediately. */
2195 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2197 int opcode = read_opcode (r);
2205 read_bytes (r, dst, 8);
2209 memset (dst, ' ', 8);
2214 double value = opcode - r->bias;
2215 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2218 /* This has actually been seen "in the wild". The submitter of the
2219 file that showed that the contents decoded as spaces, but they
2220 were at the end of the field so it's possible that the null
2221 bytes just acted as null terminators. */
2223 else if (!r->corruption_warning)
2225 r->corruption_warning = true;
2226 sys_warn (r, r->pos,
2227 _("Possible compressed data corruption: "
2228 "string contains compressed integer (opcode %d)."),
2238 /* Reads LENGTH string bytes from R into S.
2239 LENGTH must be a multiple of 8.
2240 Reads compressed strings if S is compressed.
2241 Returns true if successful, false if end of file is
2242 reached immediately. */
2244 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2246 assert (length % 8 == 0);
2248 return try_read_bytes (r, s, length);
2252 for (ofs = 0; ofs < length; ofs += 8)
2253 if (!read_compressed_string (r, s + ofs))
2263 /* Skips LENGTH string bytes from R.
2264 LENGTH must be a multiple of 8.
2265 (LENGTH is also limited to 1024, but that's only because the
2266 current caller never needs more than that many bytes.)
2267 Returns true if successful, false if end of file is
2268 reached immediately. */
2270 skip_whole_strings (struct sfm_reader *r, size_t length)
2272 uint8_t buffer[1024];
2273 assert (length < sizeof buffer);
2274 return read_whole_strings (r, buffer, length);
2277 /* Helpers for reading records that contain structured text
2280 /* Maximum number of warnings to issue for a single text
2282 #define MAX_TEXT_WARNINGS 5
2287 struct substring buffer; /* Record contents. */
2288 off_t start; /* Starting offset in file. */
2289 size_t pos; /* Current position in buffer. */
2290 int n_warnings; /* Number of warnings issued or suppressed. */
2291 bool recoded; /* Recoded into UTF-8? */
2294 static struct text_record *
2295 open_text_record (struct sfm_reader *r,
2296 const struct sfm_extension_record *record,
2297 bool recode_to_utf8)
2299 struct text_record *text;
2300 struct substring raw;
2302 text = pool_alloc (r->pool, sizeof *text);
2303 raw = ss_buffer (record->data, record->size * record->count);
2304 text->start = record->pos;
2305 text->buffer = (recode_to_utf8
2306 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2309 text->n_warnings = 0;
2310 text->recoded = recode_to_utf8;
2315 /* Closes TEXT, frees its storage, and issues a final warning
2316 about suppressed warnings if necesary. */
2318 close_text_record (struct sfm_reader *r, struct text_record *text)
2320 if (text->n_warnings > MAX_TEXT_WARNINGS)
2321 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2322 text->n_warnings - MAX_TEXT_WARNINGS);
2324 pool_free (r->pool, ss_data (text->buffer));
2327 /* Reads a variable=value pair from TEXT.
2328 Looks up the variable in DICT and stores it into *VAR.
2329 Stores a null-terminated value into *VALUE. */
2331 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2332 struct text_record *text,
2333 struct variable **var, char **value)
2337 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2340 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2344 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2345 ss_buffer ("\t\0", 2));
2353 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2354 struct text_record *text, struct substring delimiters,
2355 struct variable **var)
2359 name = text_get_token (text, delimiters, NULL);
2363 *var = dict_lookup_var (dict, name);
2367 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2374 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2375 struct text_record *text, struct substring delimiters,
2376 struct variable **var)
2378 char *short_name = text_get_token (text, delimiters, NULL);
2379 if (short_name == NULL)
2382 *var = dict_lookup_var (dict, short_name);
2384 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2389 /* Displays a warning for the current file position, limiting the
2390 number to MAX_TEXT_WARNINGS for TEXT. */
2392 text_warn (struct sfm_reader *r, struct text_record *text,
2393 const char *format, ...)
2395 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2399 va_start (args, format);
2400 sys_msg (r, text->start + text->pos, MW, format, args);
2406 text_get_token (struct text_record *text, struct substring delimiters,
2409 struct substring token;
2412 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2415 end = &ss_data (token)[ss_length (token)];
2416 if (delimiter != NULL)
2419 return ss_data (token);
2422 /* Reads a integer value expressed in decimal, then a space, then a string that
2423 consists of exactly as many bytes as specified by the integer, then a space,
2424 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2425 buffer (so the caller should not free the string). */
2427 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2437 int c = text->buffer.string[text->pos];
2438 if (c < '0' || c > '9')
2440 n = (n * 10) + (c - '0');
2443 if (start == text->pos)
2445 sys_warn (r, text->start,
2446 _("Expecting digit at offset %zu in MRSETS record."),
2451 if (!text_match (text, ' '))
2453 sys_warn (r, text->start,
2454 _("Expecting space at offset %zu in MRSETS record."),
2459 if (text->pos + n > text->buffer.length)
2461 sys_warn (r, text->start,
2462 _("%zu-byte string starting at offset %zu "
2463 "exceeds record length %zu."),
2464 n, text->pos, text->buffer.length);
2468 s = &text->buffer.string[text->pos];
2471 sys_warn (r, text->start,
2472 _("Expecting space at offset %zu following %zu-byte string."),
2482 text_match (struct text_record *text, char c)
2484 if (text->buffer.string[text->pos] == c)
2493 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2494 inside the TEXT's string. */
2496 text_pos (const struct text_record *text)
2503 /* Displays a corruption message. */
2505 sys_msg (struct sfm_reader *r, off_t offset,
2506 int class, const char *format, va_list args)
2511 ds_init_empty (&text);
2513 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2514 fh_get_file_name (r->fh), (long long int) offset);
2516 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2517 ds_put_vformat (&text, format, args);
2519 m.category = msg_class_to_category (class);
2520 m.severity = msg_class_to_severity (class);
2526 m.text = ds_cstr (&text);
2531 /* Displays a warning for offset OFFSET in the file. */
2533 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2537 va_start (args, format);
2538 sys_msg (r, offset, MW, format, args);
2542 /* Displays an error for the current file position,
2543 marks it as in an error state,
2544 and aborts reading it using longjmp. */
2546 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2550 va_start (args, format);
2551 sys_msg (r, offset, ME, format, args);
2555 longjmp (r->bail_out, 1);
2558 /* Reads BYTE_CNT bytes into BUF.
2559 Returns true if exactly BYTE_CNT bytes are successfully read.
2560 Aborts if an I/O error or a partial read occurs.
2561 If EOF_IS_OK, then an immediate end-of-file causes false to be
2562 returned; otherwise, immediate end-of-file causes an abort
2565 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2566 void *buf, size_t byte_cnt)
2568 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2569 r->pos += bytes_read;
2570 if (bytes_read == byte_cnt)
2572 else if (ferror (r->file))
2573 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2574 else if (!eof_is_ok || bytes_read != 0)
2575 sys_error (r, r->pos, _("Unexpected end of file."));
2580 /* Reads BYTE_CNT into BUF.
2581 Aborts upon I/O error or if end-of-file is encountered. */
2583 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2585 read_bytes_internal (r, false, buf, byte_cnt);
2588 /* Reads BYTE_CNT bytes into BUF.
2589 Returns true if exactly BYTE_CNT bytes are successfully read.
2590 Returns false if an immediate end-of-file is encountered.
2591 Aborts if an I/O error or a partial read occurs. */
2593 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2595 return read_bytes_internal (r, true, buf, byte_cnt);
2598 /* Reads a 32-bit signed integer from R and returns its value in
2601 read_int (struct sfm_reader *r)
2604 read_bytes (r, integer, sizeof integer);
2605 return integer_get (r->integer_format, integer, sizeof integer);
2608 /* Reads a 64-bit floating-point number from R and returns its
2609 value in host format. */
2611 read_float (struct sfm_reader *r)
2614 read_bytes (r, number, sizeof number);
2615 return float_get_double (r->float_format, number);
2619 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2621 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2625 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2627 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2630 /* Reads exactly SIZE - 1 bytes into BUFFER
2631 and stores a null byte into BUFFER[SIZE - 1]. */
2633 read_string (struct sfm_reader *r, char *buffer, size_t size)
2636 read_bytes (r, buffer, size - 1);
2637 buffer[size - 1] = '\0';
2640 /* Skips BYTES bytes forward in R. */
2642 skip_bytes (struct sfm_reader *r, size_t bytes)
2647 size_t chunk = MIN (sizeof buffer, bytes);
2648 read_bytes (r, buffer, chunk);
2653 static const struct casereader_class sys_file_casereader_class =
2655 sys_file_casereader_read,
2656 sys_file_casereader_destroy,