1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-strtod.h"
54 #include "gl/c-ctype.h"
55 #include "gl/inttostr.h"
56 #include "gl/localcharset.h"
57 #include "gl/minmax.h"
58 #include "gl/unlocked-io.h"
59 #include "gl/xalloc.h"
63 #define _(msgid) gettext (msgid)
64 #define N_(msgid) (msgid)
68 /* subtypes 0-2 unknown */
69 EXT_INTEGER = 3, /* Machine integer info. */
70 EXT_FLOAT = 4, /* Machine floating-point info. */
71 EXT_VAR_SETS = 5, /* Variable sets. */
72 EXT_DATE = 6, /* DATE. */
73 EXT_MRSETS = 7, /* Multiple response sets. */
74 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
75 /* subtypes 9-10 unknown */
76 EXT_DISPLAY = 11, /* Variable display parameters. */
77 /* subtype 12 unknown */
78 EXT_LONG_NAMES = 13, /* Long variable names. */
79 EXT_LONG_STRINGS = 14, /* Long strings. */
80 /* subtype 15 unknown */
81 EXT_NCASES = 16, /* Extended number of cases. */
82 EXT_FILE_ATTRS = 17, /* Data file attributes. */
83 EXT_VAR_ATTRS = 18, /* Variable attributes. */
84 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
85 EXT_ENCODING = 20, /* Character encoding. */
86 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
87 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
90 /* Fields from the top-level header record. */
91 struct sfm_header_record
93 char magic[5]; /* First 4 bytes of file, then null. */
94 int weight_idx; /* 0 if unweighted, otherwise a var index. */
95 int nominal_case_size; /* Number of var positions. */
97 /* These correspond to the members of struct sfm_file_info or a dictionary
98 but in the system file's encoding rather than ASCII. */
99 char creation_date[10]; /* "dd mmm yy". */
100 char creation_time[9]; /* "hh:mm:ss". */
101 char eye_catcher[61]; /* Eye-catcher string, then product name. */
102 char file_label[65]; /* File label. */
105 struct sfm_var_record
112 int missing_value_code;
115 struct variable *var;
118 struct sfm_value_label
124 struct sfm_value_label_record
127 struct sfm_value_label *labels;
134 struct sfm_document_record
141 struct sfm_extension_record
143 off_t pos; /* Starting offset in file. */
144 size_t size; /* Size of data elements. */
145 size_t count; /* Number of data elements. */
146 void *data; /* Contents. */
149 /* System file reader. */
152 /* Resource tracking. */
153 struct pool *pool; /* All system file state. */
154 jmp_buf bail_out; /* longjmp() target for error handling. */
157 struct file_handle *fh; /* File handle. */
158 struct fh_lock *lock; /* Mutual exclusion for file handle. */
159 FILE *file; /* File stream. */
160 off_t pos; /* Position in file. */
161 bool error; /* I/O or corruption error? */
162 struct caseproto *proto; /* Format of output cases. */
165 enum integer_format integer_format; /* On-disk integer format. */
166 enum float_format float_format; /* On-disk floating point format. */
167 struct sfm_var *sfm_vars; /* Variables. */
168 size_t sfm_var_cnt; /* Number of variables. */
169 casenumber case_cnt; /* Number of cases */
170 const char *encoding; /* String encoding. */
173 bool compressed; /* File is compressed? */
174 double bias; /* Compression bias, usually 100.0. */
175 uint8_t opcodes[8]; /* Current block of opcodes. */
176 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
177 bool corruption_warning; /* Warned about possible corruption? */
180 static const struct casereader_class sys_file_casereader_class;
182 static bool close_reader (struct sfm_reader *);
184 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
185 const struct sfm_var_record *,
188 static void sys_msg (struct sfm_reader *r, off_t, int class,
189 const char *format, va_list args)
190 PRINTF_FORMAT (4, 0);
191 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
192 PRINTF_FORMAT (3, 4);
193 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
197 static void read_bytes (struct sfm_reader *, void *, size_t);
198 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
199 static int read_int (struct sfm_reader *);
200 static double read_float (struct sfm_reader *);
201 static void read_string (struct sfm_reader *, char *, size_t);
202 static void skip_bytes (struct sfm_reader *, size_t);
204 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
205 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
207 static void read_variable_record (struct sfm_reader *,
208 struct sfm_var_record *);
209 static void read_value_label_record (struct sfm_reader *,
210 struct sfm_value_label_record *,
212 static struct sfm_document_record *read_document_record (struct sfm_reader *);
213 static struct sfm_extension_record *read_extension_record (
214 struct sfm_reader *, int subtype);
215 static void skip_extension_record (struct sfm_reader *, int subtype);
217 static const char *choose_encoding (
219 const struct sfm_header_record *,
220 const struct sfm_extension_record *ext_integer,
221 const struct sfm_extension_record *ext_encoding);
223 static struct text_record *open_text_record (
224 struct sfm_reader *, const struct sfm_extension_record *,
225 bool recode_to_utf8);
226 static void close_text_record (struct sfm_reader *,
227 struct text_record *);
228 static bool read_variable_to_value_pair (struct sfm_reader *,
230 struct text_record *,
231 struct variable **var, char **value);
232 static void text_warn (struct sfm_reader *r, struct text_record *text,
233 const char *format, ...)
234 PRINTF_FORMAT (3, 4);
235 static char *text_get_token (struct text_record *,
236 struct substring delimiters, char *delimiter);
237 static bool text_match (struct text_record *, char c);
238 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
239 struct text_record *,
240 struct substring delimiters,
242 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
243 struct text_record *,
244 struct substring delimiters,
246 static const char *text_parse_counted_string (struct sfm_reader *,
247 struct text_record *);
248 static size_t text_pos (const struct text_record *);
250 static bool close_reader (struct sfm_reader *r);
252 /* Dictionary reader. */
260 static void read_header (struct sfm_reader *, struct sfm_read_info *,
261 struct sfm_header_record *);
262 static void parse_header (struct sfm_reader *,
263 const struct sfm_header_record *,
264 struct sfm_read_info *, struct dictionary *);
265 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
266 struct sfm_var_record *, size_t n);
267 static void parse_format_spec (struct sfm_reader *, off_t pos,
268 unsigned int format, enum which_format,
269 struct variable *, int *format_warning_cnt);
270 static void parse_document (struct dictionary *, struct sfm_document_record *);
271 static void parse_display_parameters (struct sfm_reader *,
272 const struct sfm_extension_record *,
273 struct dictionary *);
274 static void parse_machine_integer_info (struct sfm_reader *,
275 const struct sfm_extension_record *,
276 struct sfm_read_info *);
277 static void parse_machine_float_info (struct sfm_reader *,
278 const struct sfm_extension_record *);
279 static void parse_mrsets (struct sfm_reader *,
280 const struct sfm_extension_record *,
281 struct dictionary *);
282 static void parse_long_var_name_map (struct sfm_reader *,
283 const struct sfm_extension_record *,
284 struct dictionary *);
285 static void parse_long_string_map (struct sfm_reader *,
286 const struct sfm_extension_record *,
287 struct dictionary *);
288 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
289 const struct sfm_var_record *,
291 const struct sfm_value_label_record *);
292 static void parse_data_file_attributes (struct sfm_reader *,
293 const struct sfm_extension_record *,
294 struct dictionary *);
295 static void parse_variable_attributes (struct sfm_reader *,
296 const struct sfm_extension_record *,
297 struct dictionary *);
298 static void parse_long_string_value_labels (struct sfm_reader *,
299 const struct sfm_extension_record *,
300 struct dictionary *);
302 /* Frees the strings inside INFO. */
304 sfm_read_info_destroy (struct sfm_read_info *info)
308 free (info->creation_date);
309 free (info->creation_time);
310 free (info->product);
314 /* Opens the system file designated by file handle FH for reading. Reads the
315 system file's dictionary into *DICT.
317 Ordinarily the reader attempts to automatically detect the character
318 encoding based on the file's contents. This isn't always possible,
319 especially for files written by old versions of SPSS or PSPP, so specifying
320 a nonnull ENCODING overrides the choice of character encoding.
322 If INFO is non-null, then it receives additional info about the system file,
323 which the caller must eventually free with sfm_read_info_destroy() when it
324 is no longer needed. */
326 sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
327 struct dictionary **dictp, struct sfm_read_info *infop)
329 struct sfm_reader *volatile r = NULL;
330 struct sfm_read_info *volatile info;
332 struct sfm_header_record header;
334 struct sfm_var_record *vars;
335 size_t n_vars, allocated_vars;
337 struct sfm_value_label_record *labels;
338 size_t n_labels, allocated_labels;
340 struct sfm_document_record *document;
342 struct sfm_extension_record *extensions[32];
344 struct dictionary *volatile dict = NULL;
347 /* Create and initialize reader. */
348 r = pool_create_container (struct sfm_reader, pool);
354 r->opcode_idx = sizeof r->opcodes;
355 r->corruption_warning = false;
357 info = infop ? infop : xmalloc (sizeof *info);
358 memset (info, 0, sizeof *info);
360 /* TRANSLATORS: this fragment will be interpolated into
361 messages in fh_lock() that identify types of files. */
362 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
366 r->file = fn_open (fh_get_file_name (fh), "rb");
369 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
370 fh_get_file_name (r->fh), strerror (errno));
374 if (setjmp (r->bail_out))
378 read_header (r, info, &header);
381 n_vars = allocated_vars = 0;
384 n_labels = allocated_labels = 0;
388 memset (extensions, 0, sizeof extensions);
398 read_int (r); /* Skip filler. */
405 if (n_vars >= allocated_vars)
406 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
408 read_variable_record (r, &vars[n_vars++]);
412 if (n_labels >= allocated_labels)
413 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
415 read_value_label_record (r, &labels[n_labels++], n_vars);
419 /* A Type 4 record is always immediately after a type 3 record,
420 so the code for type 3 records reads the type 4 record too. */
421 sys_error (r, r->pos, _("Misplaced type 4 record."));
424 if (document != NULL)
425 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
426 document = read_document_record (r);
430 subtype = read_int (r);
431 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
434 _("Unrecognized record type 7, subtype %d. Please "
435 "send a copy of this file, and the syntax which "
436 "created it to %s."),
437 subtype, PACKAGE_BUGREPORT);
438 skip_extension_record (r, subtype);
440 else if (extensions[subtype] != NULL)
443 _("Record type 7, subtype %d found here has the same "
444 "type as the record found near offset 0x%llx. "
445 "Please send a copy of this file, and the syntax "
446 "which created it to %s."),
447 subtype, (long long int) extensions[subtype]->pos,
449 skip_extension_record (r, subtype);
452 extensions[subtype] = read_extension_record (r, subtype);
456 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
461 /* Now actually parse what we read.
463 First, figure out the correct character encoding, because this determines
464 how the rest of the header data is to be interpreted. */
465 dict = dict_create (encoding
467 : choose_encoding (r, &header, extensions[EXT_INTEGER],
468 extensions[EXT_ENCODING]));
469 r->encoding = dict_get_encoding (dict);
471 /* These records don't use variables at all. */
472 if (document != NULL)
473 parse_document (dict, document);
475 if (extensions[EXT_INTEGER] != NULL)
476 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
478 if (extensions[EXT_FLOAT] != NULL)
479 parse_machine_float_info (r, extensions[EXT_FLOAT]);
481 if (extensions[EXT_FILE_ATTRS] != NULL)
482 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
484 parse_header (r, &header, info, dict);
486 /* Parse the variable records, the basis of almost everything else. */
487 parse_variable_records (r, dict, vars, n_vars);
489 /* Parse value labels and the weight variable immediately after the variable
490 records. These records use indexes into var_recs[], so we must parse them
491 before those indexes become invalidated by very long string variables. */
492 for (i = 0; i < n_labels; i++)
493 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
494 if (header.weight_idx != 0)
496 struct variable *weight_var;
498 weight_var = lookup_var_by_index (r, 76, vars, n_vars,
500 if (var_is_numeric (weight_var))
501 dict_set_weight (dict, weight_var);
503 sys_error (r, -1, _("Weighting variable must be numeric "
504 "(not string variable `%s')."),
505 var_get_name (weight_var));
508 if (extensions[EXT_DISPLAY] != NULL)
509 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
511 /* The following records use short names, so they need to be parsed before
512 parse_long_var_name_map() changes short names to long names. */
513 if (extensions[EXT_MRSETS] != NULL)
514 parse_mrsets (r, extensions[EXT_MRSETS], dict);
516 if (extensions[EXT_MRSETS2] != NULL)
517 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
519 if (extensions[EXT_LONG_STRINGS] != NULL)
520 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
522 /* Now rename variables to their long names. */
523 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
525 /* The following records use long names, so they need to follow renaming. */
526 if (extensions[EXT_VAR_ATTRS] != NULL)
527 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
529 if (extensions[EXT_LONG_LABELS] != NULL)
530 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
532 /* Warn if the actual amount of data per case differs from the
533 amount that the header claims. SPSS version 13 gets this
534 wrong when very long strings are involved, so don't warn in
536 if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
537 && info->version_major != 13)
538 sys_warn (r, -1, _("File header claims %d variable positions but "
539 "%zu were read from file."),
540 header.nominal_case_size, n_vars);
542 /* Create an index of dictionary variable widths for
543 sfm_read_case to use. We cannot use the `struct variable's
544 from the dictionary we created, because the caller owns the
545 dictionary and may destroy or modify its variables. */
546 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
547 pool_register (r->pool, free, r->sfm_vars);
548 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
553 sfm_read_info_destroy (info);
557 return casereader_create_sequential
559 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
560 &sys_file_casereader_class, r);
565 sfm_read_info_destroy (info);
575 /* Closes a system file after we're done with it.
576 Returns true if an I/O error has occurred on READER, false
579 close_reader (struct sfm_reader *r)
588 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
590 msg (ME, _("Error closing system file `%s': %s."),
591 fh_get_file_name (r->fh), strerror (errno));
601 pool_destroy (r->pool);
606 /* Destroys READER. */
608 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
610 struct sfm_reader *r = r_;
614 /* Returns true if FILE is an SPSS system file,
617 sfm_detect (FILE *file)
621 if (fread (magic, 4, 1, file) != 1)
625 return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
628 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
629 except for the string fields in *INFO, which parse_header() will initialize
630 later once the file's encoding is known. */
632 read_header (struct sfm_reader *r, struct sfm_read_info *info,
633 struct sfm_header_record *header)
635 uint8_t raw_layout_code[4];
638 read_string (r, header->magic, sizeof header->magic);
639 read_string (r, header->eye_catcher, sizeof header->eye_catcher);
641 if (strcmp (ASCII_MAGIC, header->magic)
642 && strcmp (EBCDIC_MAGIC, header->magic))
643 sys_error (r, 0, _("This is not an SPSS system file."));
645 /* Identify integer format. */
646 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
647 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
649 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
651 || (r->integer_format != INTEGER_MSB_FIRST
652 && r->integer_format != INTEGER_LSB_FIRST))
653 sys_error (r, 64, _("This is not an SPSS system file."));
655 header->nominal_case_size = read_int (r);
656 if (header->nominal_case_size < 0
657 || header->nominal_case_size > INT_MAX / 16)
658 header->nominal_case_size = -1;
660 r->compressed = read_int (r) != 0;
662 header->weight_idx = read_int (r);
664 r->case_cnt = read_int (r);
665 if ( r->case_cnt > INT_MAX / 2)
668 /* Identify floating-point format and obtain compression bias. */
669 read_bytes (r, raw_bias, sizeof raw_bias);
670 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
672 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
674 if (memcmp (raw_bias, zero_bias, 8))
675 sys_warn (r, r->pos - 8,
676 _("Compression bias is not the usual "
677 "value of 100, or system file uses unrecognized "
678 "floating-point format."));
681 /* Some software is known to write all-zeros to this
682 field. Such software also writes floating-point
683 numbers in the format that we expect by default
684 (it seems that all software most likely does, in
685 reality), so don't warn in this case. */
688 if (r->integer_format == INTEGER_MSB_FIRST)
689 r->float_format = FLOAT_IEEE_DOUBLE_BE;
691 r->float_format = FLOAT_IEEE_DOUBLE_LE;
693 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
695 read_string (r, header->creation_date, sizeof header->creation_date);
696 read_string (r, header->creation_time, sizeof header->creation_time);
697 read_string (r, header->file_label, sizeof header->file_label);
700 info->integer_format = r->integer_format;
701 info->float_format = r->float_format;
702 info->compressed = r->compressed;
703 info->case_cnt = r->case_cnt;
706 /* Reads a variable (type 2) record from R into RECORD. */
708 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
710 int has_variable_label;
712 memset (record, 0, sizeof *record);
714 record->pos = r->pos;
715 record->width = read_int (r);
716 has_variable_label = read_int (r);
717 record->missing_value_code = read_int (r);
718 record->print_format = read_int (r);
719 record->write_format = read_int (r);
720 read_bytes (r, record->name, sizeof record->name);
722 if (has_variable_label == 1)
724 enum { MAX_LABEL_LEN = 255 };
725 size_t len, read_len;
729 /* Read up to MAX_LABEL_LEN bytes of label. */
730 read_len = MIN (MAX_LABEL_LEN, len);
731 record->label = pool_malloc (r->pool, read_len + 1);
732 read_string (r, record->label, read_len + 1);
734 /* Skip unread label bytes. */
735 skip_bytes (r, len - read_len);
737 /* Skip label padding up to multiple of 4 bytes. */
738 skip_bytes (r, ROUND_UP (len, 4) - len);
740 else if (has_variable_label != 0)
741 sys_error (r, record->pos,
742 _("Variable label indicator field is not 0 or 1."));
744 /* Set missing values. */
745 if (record->missing_value_code != 0)
747 int code = record->missing_value_code;
748 if (record->width == 0)
750 if (code < -3 || code > 3 || code == -1)
751 sys_error (r, record->pos,
752 _("Numeric missing value indicator field is not "
753 "-3, -2, 0, 1, 2, or 3."));
757 if (code < 1 || code > 3)
758 sys_error (r, record->pos,
759 _("String missing value indicator field is not "
763 read_bytes (r, record->missing, 8 * abs (code));
767 /* Reads value labels from R into RECORD. */
769 read_value_label_record (struct sfm_reader *r,
770 struct sfm_value_label_record *record,
775 /* Read type 3 record. */
776 record->pos = r->pos;
777 record->n_labels = read_int (r);
778 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
779 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
781 record->labels = pool_nmalloc (r->pool, record->n_labels,
782 sizeof *record->labels);
783 for (i = 0; i < record->n_labels; i++)
785 struct sfm_value_label *label = &record->labels[i];
786 unsigned char label_len;
789 read_bytes (r, label->value, sizeof label->value);
791 /* Read label length. */
792 read_bytes (r, &label_len, sizeof label_len);
793 padded_len = ROUND_UP (label_len + 1, 8);
795 /* Read label, padding. */
796 label->label = pool_malloc (r->pool, padded_len + 1);
797 read_bytes (r, label->label, padded_len - 1);
798 label->label[label_len] = '\0';
801 /* Read record type of type 4 record. */
802 if (read_int (r) != 4)
803 sys_error (r, r->pos - 4,
804 _("Variable index record (type 4) does not immediately "
805 "follow value label record (type 3) as it should."));
807 /* Read number of variables associated with value label from type 4
809 record->n_vars = read_int (r);
810 if (record->n_vars < 1 || record->n_vars > n_vars)
811 sys_error (r, r->pos - 4,
812 _("Number of variables associated with a value label (%zu) "
813 "is not between 1 and the number of variables (%zu)."),
814 record->n_vars, n_vars);
815 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
816 for (i = 0; i < record->n_vars; i++)
817 record->vars[i] = read_int (r);
820 /* Reads a document record from R and returns it. */
821 static struct sfm_document_record *
822 read_document_record (struct sfm_reader *r)
824 struct sfm_document_record *record;
827 record = pool_malloc (r->pool, sizeof *record);
828 record->pos = r->pos;
830 n_lines = read_int (r);
831 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
832 sys_error (r, record->pos,
833 _("Number of document lines (%d) "
834 "must be greater than 0 and less than %d."),
835 n_lines, INT_MAX / DOC_LINE_LENGTH);
837 record->n_lines = n_lines;
838 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
839 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
845 read_extension_record_header (struct sfm_reader *r, int subtype,
846 struct sfm_extension_record *record)
848 record->pos = r->pos;
849 record->size = read_int (r);
850 record->count = read_int (r);
852 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
853 allows an extra byte for a null terminator, used by some
854 extension processing routines. */
855 if (record->size != 0
856 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
857 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
860 /* Reads an extension record from R into RECORD. */
861 static struct sfm_extension_record *
862 read_extension_record (struct sfm_reader *r, int subtype)
864 struct extension_record_type
871 static const struct extension_record_type types[] =
873 /* Implemented record types. */
874 { EXT_INTEGER, 4, 8 },
876 { EXT_MRSETS, 1, 0 },
877 { EXT_DISPLAY, 4, 0 },
878 { EXT_LONG_NAMES, 1, 0 },
879 { EXT_LONG_STRINGS, 1, 0 },
880 { EXT_NCASES, 8, 2 },
881 { EXT_FILE_ATTRS, 1, 0 },
882 { EXT_VAR_ATTRS, 1, 0 },
883 { EXT_MRSETS2, 1, 0 },
884 { EXT_ENCODING, 1, 0 },
885 { EXT_LONG_LABELS, 1, 0 },
887 /* Ignored record types. */
888 { EXT_VAR_SETS, 0, 0 },
890 { EXT_DATA_ENTRY, 0, 0 },
891 { EXT_DATAVIEW, 0, 0 },
894 const struct extension_record_type *type;
895 struct sfm_extension_record *record;
898 record = pool_malloc (r->pool, sizeof *record);
899 read_extension_record_header (r, subtype, record);
900 n_bytes = record->count * record->size;
902 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
903 if (subtype == type->subtype)
905 if (type->size > 0 && record->size != type->size)
906 sys_warn (r, record->pos,
907 _("Record type 7, subtype %d has bad size %zu "
908 "(expected %d)."), subtype, record->size, type->size);
909 else if (type->count > 0 && record->count != type->count)
910 sys_warn (r, record->pos,
911 _("Record type 7, subtype %d has bad count %zu "
912 "(expected %d)."), subtype, record->count, type->count);
913 else if (type->count == 0 && type->size == 0)
915 /* Ignore this record. */
919 char *data = pool_malloc (r->pool, n_bytes + 1);
920 data[n_bytes] = '\0';
923 read_bytes (r, record->data, n_bytes);
930 sys_warn (r, record->pos,
931 _("Unrecognized record type 7, subtype %d. Please send a "
932 "copy of this file, and the syntax which created it to %s."),
933 subtype, PACKAGE_BUGREPORT);
936 skip_bytes (r, n_bytes);
941 skip_extension_record (struct sfm_reader *r, int subtype)
943 struct sfm_extension_record record;
945 read_extension_record_header (r, subtype, &record);
946 skip_bytes (r, record.count * record.size);
950 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
951 struct sfm_read_info *info, struct dictionary *dict)
953 const char *dict_encoding = dict_get_encoding (dict);
954 struct substring product;
955 struct substring label;
957 /* Convert file label to UTF-8 and put it into DICT. */
958 label = recode_substring_pool ("UTF-8", dict_encoding,
959 ss_cstr (header->file_label), r->pool);
960 ss_trim (&label, ss_cstr (" "));
961 label.string[label.length] = '\0';
962 dict_set_label (dict, label.string);
964 /* Put creation date and time in UTF-8 into INFO. */
965 info->creation_date = recode_string ("UTF-8", dict_encoding,
966 header->creation_date, -1);
967 info->creation_time = recode_string ("UTF-8", dict_encoding,
968 header->creation_time, -1);
970 /* Put product name into INFO, dropping eye-catcher string if present. */
971 product = recode_substring_pool ("UTF-8", dict_encoding,
972 ss_cstr (header->eye_catcher), r->pool);
973 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
974 ss_trim (&product, ss_cstr (" "));
975 info->product = ss_xstrdup (product);
978 /* Reads a variable (type 2) record from R and adds the
979 corresponding variable to DICT.
980 Also skips past additional variable records for long string
983 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
984 struct sfm_var_record *var_recs, size_t n_var_recs)
986 const char *dict_encoding = dict_get_encoding (dict);
987 struct sfm_var_record *rec;
990 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
992 struct variable *var;
997 name = recode_string_pool ("UTF-8", dict_encoding,
998 rec->name, 8, r->pool);
999 name[strcspn (name, " ")] = '\0';
1001 if (!dict_id_is_valid (dict, name, false)
1002 || name[0] == '$' || name[0] == '#')
1003 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1005 if (rec->width < 0 || rec->width > 255)
1006 sys_error (r, rec->pos,
1007 _("Bad width %d for variable %s."), rec->width, name);
1009 var = rec->var = dict_create_var (dict, name, rec->width);
1011 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
1013 /* Set the short name the same as the long name. */
1014 var_set_short_name (var, 0, name);
1016 /* Get variable label, if any. */
1021 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1022 rec->label, -1, r->pool);
1023 var_set_label (var, utf8_label, false);
1026 /* Set missing values. */
1027 if (rec->missing_value_code != 0)
1029 int width = var_get_width (var);
1030 struct missing_values mv;
1032 mv_init_pool (r->pool, &mv, width);
1033 if (var_is_numeric (var))
1035 bool has_range = rec->missing_value_code < 0;
1036 int n_discrete = (has_range
1037 ? rec->missing_value_code == -3
1038 : rec->missing_value_code);
1043 double low = parse_float (r, rec->missing, 0);
1044 double high = parse_float (r, rec->missing, 8);
1046 /* Deal with SPSS 21 change in representation. */
1050 mv_add_range (&mv, low, high);
1054 for (i = 0; i < n_discrete; i++)
1056 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1064 value_init_pool (r->pool, &value, width);
1065 value_set_missing (&value, width);
1066 for (i = 0; i < rec->missing_value_code; i++)
1068 uint8_t *s = value_str_rw (&value, width);
1069 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1070 mv_add_str (&mv, s);
1073 var_set_missing_values (var, &mv);
1077 parse_format_spec (r, rec->pos + 12, rec->print_format,
1078 PRINT_FORMAT, var, &n_warnings);
1079 parse_format_spec (r, rec->pos + 16, rec->write_format,
1080 WRITE_FORMAT, var, &n_warnings);
1082 /* Account for values.
1083 Skip long string continuation records, if any. */
1084 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1085 for (i = 1; i < n_values; i++)
1086 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1087 sys_error (r, rec->pos, _("Missing string continuation record."));
1092 /* Translates the format spec from sysfile format to internal
1095 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1096 enum which_format which, struct variable *v,
1099 const int max_warnings = 8;
1100 uint8_t raw_type = format >> 16;
1101 uint8_t w = format >> 8;
1110 ok = (fmt_from_io (raw_type, &f.type)
1111 && fmt_check_output (&f)
1112 && fmt_check_width_compat (&f, var_get_width (v)));
1117 if (which == PRINT_FORMAT)
1118 var_set_print_format (v, &f);
1120 var_set_write_format (v, &f);
1122 else if (format == 0)
1124 /* Actually observed in the wild. No point in warning about it. */
1126 else if (++*n_warnings <= max_warnings)
1128 if (which == PRINT_FORMAT)
1129 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1131 var_get_name (v), var_get_width (v), format);
1133 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1135 var_get_name (v), var_get_width (v), format);
1137 if (*n_warnings == max_warnings)
1138 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1143 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1147 for (p = record->documents;
1148 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1149 p += DOC_LINE_LENGTH)
1151 struct substring line;
1153 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1154 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1155 ss_rtrim (&line, ss_cstr (" "));
1156 line.string[line.length] = '\0';
1158 dict_add_document_line (dict, line.string, false);
1164 /* Parses record type 7, subtype 3. */
1166 parse_machine_integer_info (struct sfm_reader *r,
1167 const struct sfm_extension_record *record,
1168 struct sfm_read_info *info)
1170 int float_representation, expected_float_format;
1171 int integer_representation, expected_integer_format;
1173 /* Save version info. */
1174 info->version_major = parse_int (r, record->data, 0);
1175 info->version_minor = parse_int (r, record->data, 4);
1176 info->version_revision = parse_int (r, record->data, 8);
1178 /* Check floating point format. */
1179 float_representation = parse_int (r, record->data, 16);
1180 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1181 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1182 expected_float_format = 1;
1183 else if (r->float_format == FLOAT_Z_LONG)
1184 expected_float_format = 2;
1185 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1186 expected_float_format = 3;
1189 if (float_representation != expected_float_format)
1190 sys_error (r, record->pos, _("Floating-point representation indicated by "
1191 "system file (%d) differs from expected (%d)."),
1192 float_representation, expected_float_format);
1194 /* Check integer format. */
1195 integer_representation = parse_int (r, record->data, 24);
1196 if (r->integer_format == INTEGER_MSB_FIRST)
1197 expected_integer_format = 1;
1198 else if (r->integer_format == INTEGER_LSB_FIRST)
1199 expected_integer_format = 2;
1202 if (integer_representation != expected_integer_format)
1203 sys_warn (r, record->pos,
1204 _("Integer format indicated by system file (%d) "
1205 "differs from expected (%d)."),
1206 integer_representation, expected_integer_format);
1211 choose_encoding (struct sfm_reader *r,
1212 const struct sfm_header_record *header,
1213 const struct sfm_extension_record *ext_integer,
1214 const struct sfm_extension_record *ext_encoding)
1216 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1219 return ext_encoding->data;
1221 /* But EXT_INTEGER is better than nothing as a fallback. */
1224 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1225 const char *encoding;
1234 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1235 respectively. However, there are known to be many files in the wild
1236 with character code 2, yet have data which are clearly not ASCII.
1237 Therefore we ignore these values. */
1244 encoding = sys_get_encoding_from_codepage (codepage);
1245 if (encoding != NULL)
1251 /* If the file magic number is EBCDIC then its character data is too. */
1252 if (!strcmp (header->magic, EBCDIC_MAGIC))
1255 return locale_charset ();
1258 /* Parses record type 7, subtype 4. */
1260 parse_machine_float_info (struct sfm_reader *r,
1261 const struct sfm_extension_record *record)
1263 double sysmis = parse_float (r, record->data, 0);
1264 double highest = parse_float (r, record->data, 8);
1265 double lowest = parse_float (r, record->data, 16);
1267 if (sysmis != SYSMIS)
1268 sys_warn (r, record->pos,
1269 _("File specifies unexpected value %g (%a) as %s, "
1270 "instead of %g (%a)."),
1271 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1273 if (highest != HIGHEST)
1274 sys_warn (r, record->pos,
1275 _("File specifies unexpected value %g (%a) as %s, "
1276 "instead of %g (%a)."),
1277 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1279 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1280 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1281 appears in a context (missing values) where SYSMIS cannot. */
1282 if (lowest != LOWEST && lowest != SYSMIS)
1283 sys_warn (r, record->pos,
1284 _("File specifies unexpected value %g (%a) as %s, "
1285 "instead of %g (%a) or %g (%a)."),
1286 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1289 /* Parses record type 7, subtype 7 or 19. */
1291 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1292 struct dictionary *dict)
1294 struct text_record *text;
1295 struct mrset *mrset;
1297 text = open_text_record (r, record, false);
1300 const char *counted = NULL;
1303 struct stringi_set var_names;
1304 size_t allocated_vars;
1308 mrset = xzalloc (sizeof *mrset);
1310 name = text_get_token (text, ss_cstr ("="), NULL);
1313 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1315 if (mrset->name[0] != '$')
1317 sys_warn (r, record->pos,
1318 _("`%s' does not begin with `$' at offset %zu "
1319 "in MRSETS record."), mrset->name, text_pos (text));
1323 if (text_match (text, 'C'))
1325 mrset->type = MRSET_MC;
1326 if (!text_match (text, ' '))
1328 sys_warn (r, record->pos,
1329 _("Missing space following `%c' at offset %zu "
1330 "in MRSETS record."), 'C', text_pos (text));
1334 else if (text_match (text, 'D'))
1336 mrset->type = MRSET_MD;
1337 mrset->cat_source = MRSET_VARLABELS;
1339 else if (text_match (text, 'E'))
1343 mrset->type = MRSET_MD;
1344 mrset->cat_source = MRSET_COUNTEDVALUES;
1345 if (!text_match (text, ' '))
1347 sys_warn (r, record->pos,
1348 _("Missing space following `%c' at offset %zu "
1349 "in MRSETS record."), 'E', text_pos (text));
1353 number = text_get_token (text, ss_cstr (" "), NULL);
1354 if (!strcmp (number, "11"))
1355 mrset->label_from_var_label = true;
1356 else if (strcmp (number, "1"))
1357 sys_warn (r, record->pos,
1358 _("Unexpected label source value `%s' following `E' "
1359 "at offset %zu in MRSETS record."),
1360 number, text_pos (text));
1364 sys_warn (r, record->pos,
1365 _("Missing `C', `D', or `E' at offset %zu "
1366 "in MRSETS record."),
1371 if (mrset->type == MRSET_MD)
1373 counted = text_parse_counted_string (r, text);
1374 if (counted == NULL)
1378 label = text_parse_counted_string (r, text);
1381 if (label[0] != '\0')
1382 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1384 stringi_set_init (&var_names);
1389 const char *raw_var_name;
1390 struct variable *var;
1393 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1394 if (raw_var_name == NULL)
1396 sys_warn (r, record->pos,
1397 _("Missing new-line parsing variable names "
1398 "at offset %zu in MRSETS record."),
1402 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1404 var = dict_lookup_var (dict, var_name);
1410 if (!stringi_set_insert (&var_names, var_name))
1412 sys_warn (r, record->pos,
1413 _("Duplicate variable name %s "
1414 "at offset %zu in MRSETS record."),
1415 var_name, text_pos (text));
1421 if (mrset->label == NULL && mrset->label_from_var_label
1422 && var_has_label (var))
1423 mrset->label = xstrdup (var_get_label (var));
1426 && var_get_type (var) != var_get_type (mrset->vars[0]))
1428 sys_warn (r, record->pos,
1429 _("MRSET %s contains both string and "
1430 "numeric variables."), name);
1433 width = MIN (width, var_get_width (var));
1435 if (mrset->n_vars >= allocated_vars)
1436 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1437 sizeof *mrset->vars);
1438 mrset->vars[mrset->n_vars++] = var;
1440 while (delimiter != '\n');
1442 if (mrset->n_vars < 2)
1444 sys_warn (r, record->pos,
1445 _("MRSET %s has only %zu variables."), mrset->name,
1447 mrset_destroy (mrset);
1448 stringi_set_destroy (&var_names);
1452 if (mrset->type == MRSET_MD)
1454 mrset->width = width;
1455 value_init (&mrset->counted, width);
1457 mrset->counted.f = c_strtod (counted, NULL);
1459 value_copy_str_rpad (&mrset->counted, width,
1460 (const uint8_t *) counted, ' ');
1463 dict_add_mrset (dict, mrset);
1465 stringi_set_destroy (&var_names);
1467 mrset_destroy (mrset);
1468 close_text_record (r, text);
1471 /* Read record type 7, subtype 11, which specifies how variables
1472 should be displayed in GUI environments. */
1474 parse_display_parameters (struct sfm_reader *r,
1475 const struct sfm_extension_record *record,
1476 struct dictionary *dict)
1478 bool includes_width;
1479 bool warned = false;
1484 n_vars = dict_get_var_cnt (dict);
1485 if (record->count == 3 * n_vars)
1486 includes_width = true;
1487 else if (record->count == 2 * n_vars)
1488 includes_width = false;
1491 sys_warn (r, record->pos,
1492 _("Extension 11 has bad count %zu (for %zu variables)."),
1493 record->count, n_vars);
1498 for (i = 0; i < n_vars; ++i)
1500 struct variable *v = dict_get_var (dict, i);
1501 int measure, width, align;
1503 measure = parse_int (r, record->data, ofs);
1508 width = parse_int (r, record->data, ofs);
1514 align = parse_int (r, record->data, ofs);
1517 /* SPSS sometimes seems to set variables' measure to zero. */
1521 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1524 sys_warn (r, record->pos,
1525 _("Invalid variable display parameters for variable "
1526 "%zu (%s). Default parameters substituted."),
1527 i, var_get_name (v));
1532 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1533 : measure == 2 ? MEASURE_ORDINAL
1535 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1536 : align == 1 ? ALIGN_RIGHT
1539 /* Older versions (SPSS 9.0) sometimes set the display
1540 width to zero. This causes confusion in the GUI, so
1541 only set the width if it is nonzero. */
1543 var_set_display_width (v, width);
1548 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1549 const char *new_name)
1551 size_t n_short_names;
1555 /* Renaming a variable may clear its short names, but we
1556 want to retain them, so we save them and re-set them
1558 n_short_names = var_get_short_name_cnt (var);
1559 short_names = xnmalloc (n_short_names, sizeof *short_names);
1560 for (i = 0; i < n_short_names; i++)
1562 const char *s = var_get_short_name (var, i);
1563 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1566 /* Set long name. */
1567 dict_rename_var (dict, var, new_name);
1569 /* Restore short names. */
1570 for (i = 0; i < n_short_names; i++)
1572 var_set_short_name (var, i, short_names[i]);
1573 free (short_names[i]);
1578 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1579 to each short name. Modifies variable names in DICT accordingly. */
1581 parse_long_var_name_map (struct sfm_reader *r,
1582 const struct sfm_extension_record *record,
1583 struct dictionary *dict)
1585 struct text_record *text;
1586 struct variable *var;
1591 /* There are no long variable names. Use the short variable names,
1592 converted to lowercase, as the long variable names. */
1595 for (i = 0; i < dict_get_var_cnt (dict); i++)
1597 struct variable *var = dict_get_var (dict, i);
1600 new_name = utf8_to_lower (var_get_name (var));
1601 rename_var_and_save_short_names (dict, var, new_name);
1608 /* Rename each of the variables, one by one. (In a correctly constructed
1609 system file, this cannot create any intermediate duplicate variable names,
1610 because all of the new variable names are longer than any of the old
1611 variable names and thus there cannot be any overlaps.) */
1612 text = open_text_record (r, record, true);
1613 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1615 /* Validate long name. */
1616 if (!dict_id_is_valid (dict, long_name, false))
1618 sys_warn (r, record->pos,
1619 _("Long variable mapping from %s to invalid "
1620 "variable name `%s'."),
1621 var_get_name (var), long_name);
1625 /* Identify any duplicates. */
1626 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
1627 && dict_lookup_var (dict, long_name) != NULL)
1629 sys_warn (r, record->pos,
1630 _("Duplicate long variable name `%s'."), long_name);
1634 rename_var_and_save_short_names (dict, var, long_name);
1636 close_text_record (r, text);
1639 /* Reads record type 7, subtype 14, which gives the real length
1640 of each very long string. Rearranges DICT accordingly. */
1642 parse_long_string_map (struct sfm_reader *r,
1643 const struct sfm_extension_record *record,
1644 struct dictionary *dict)
1646 struct text_record *text;
1647 struct variable *var;
1650 text = open_text_record (r, record, true);
1651 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1653 size_t idx = var_get_dict_index (var);
1659 length = strtol (length_s, NULL, 10);
1660 if (length < 1 || length > MAX_STRING)
1662 sys_warn (r, record->pos,
1663 _("%s listed as string of invalid length %s "
1664 "in very long string record."),
1665 var_get_name (var), length_s);
1669 /* Check segments. */
1670 segment_cnt = sfm_width_to_segments (length);
1671 if (segment_cnt == 1)
1673 sys_warn (r, record->pos,
1674 _("%s listed in very long string record with width %s, "
1675 "which requires only one segment."),
1676 var_get_name (var), length_s);
1679 if (idx + segment_cnt > dict_get_var_cnt (dict))
1680 sys_error (r, record->pos,
1681 _("Very long string %s overflows dictionary."),
1682 var_get_name (var));
1684 /* Get the short names from the segments and check their
1686 for (i = 0; i < segment_cnt; i++)
1688 struct variable *seg = dict_get_var (dict, idx + i);
1689 int alloc_width = sfm_segment_alloc_width (length, i);
1690 int width = var_get_width (seg);
1693 var_set_short_name (var, i, var_get_short_name (seg, 0));
1694 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1695 sys_error (r, record->pos,
1696 _("Very long string with width %ld has segment %d "
1697 "of width %d (expected %d)."),
1698 length, i, width, alloc_width);
1700 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1701 var_set_width (var, length);
1703 close_text_record (r, text);
1704 dict_compact_values (dict);
1708 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1709 const struct sfm_var_record *var_recs, size_t n_var_recs,
1710 const struct sfm_value_label_record *record)
1712 struct variable **vars;
1716 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1717 for (i = 0; i < record->n_labels; i++)
1718 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1719 record->labels[i].label, -1,
1722 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1723 for (i = 0; i < record->n_vars; i++)
1724 vars[i] = lookup_var_by_index (r, record->pos,
1725 var_recs, n_var_recs, record->vars[i]);
1727 for (i = 1; i < record->n_vars; i++)
1728 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1729 sys_error (r, record->pos,
1730 _("Variables associated with value label are not all of "
1731 "identical type. Variable %s is %s, but variable "
1733 var_get_name (vars[0]),
1734 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1735 var_get_name (vars[i]),
1736 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1738 for (i = 0; i < record->n_vars; i++)
1740 struct variable *var = vars[i];
1744 width = var_get_width (var);
1746 sys_error (r, record->pos,
1747 _("Value labels may not be added to long string "
1748 "variables (e.g. %s) using records types 3 and 4."),
1749 var_get_name (var));
1751 for (j = 0; j < record->n_labels; j++)
1753 struct sfm_value_label *label = &record->labels[j];
1756 value_init (&value, width);
1758 value.f = parse_float (r, label->value, 0);
1760 memcpy (value_str_rw (&value, width), label->value, width);
1762 if (!var_add_value_label (var, &value, utf8_labels[j]))
1764 if (var_is_numeric (var))
1765 sys_warn (r, record->pos,
1766 _("Duplicate value label for %g on %s."),
1767 value.f, var_get_name (var));
1769 sys_warn (r, record->pos,
1770 _("Duplicate value label for `%.*s' on %s."),
1771 width, value_str (&value, width),
1772 var_get_name (var));
1775 value_destroy (&value, width);
1779 pool_free (r->pool, vars);
1780 for (i = 0; i < record->n_labels; i++)
1781 pool_free (r->pool, utf8_labels[i]);
1782 pool_free (r->pool, utf8_labels);
1785 static struct variable *
1786 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1787 const struct sfm_var_record *var_recs, size_t n_var_recs,
1790 const struct sfm_var_record *rec;
1792 if (idx < 1 || idx > n_var_recs)
1794 sys_error (r, offset,
1795 _("Variable index %d not in valid range 1...%zu."),
1800 rec = &var_recs[idx - 1];
1801 if (rec->var == NULL)
1803 sys_error (r, offset,
1804 _("Variable index %d refers to long string continuation."),
1812 /* Parses a set of custom attributes from TEXT into ATTRS.
1813 ATTRS may be a null pointer, in which case the attributes are
1814 read but discarded. */
1816 parse_attributes (struct sfm_reader *r, struct text_record *text,
1817 struct attrset *attrs)
1821 struct attribute *attr;
1825 /* Parse the key. */
1826 key = text_get_token (text, ss_cstr ("("), NULL);
1830 attr = attribute_create (key);
1831 for (index = 1; ; index++)
1833 /* Parse the value. */
1837 value = text_get_token (text, ss_cstr ("\n"), NULL);
1840 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1845 length = strlen (value);
1846 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1848 value[length - 1] = '\0';
1849 attribute_add_value (attr, value + 1);
1854 _("Attribute value %s[%d] is not quoted: %s."),
1856 attribute_add_value (attr, value);
1859 /* Was this the last value for this attribute? */
1860 if (text_match (text, ')'))
1864 attrset_add (attrs, attr);
1866 attribute_destroy (attr);
1868 while (!text_match (text, '/'));
1871 /* Reads record type 7, subtype 17, which lists custom
1872 attributes on the data file. */
1874 parse_data_file_attributes (struct sfm_reader *r,
1875 const struct sfm_extension_record *record,
1876 struct dictionary *dict)
1878 struct text_record *text = open_text_record (r, record, true);
1879 parse_attributes (r, text, dict_get_attributes (dict));
1880 close_text_record (r, text);
1883 /* Parses record type 7, subtype 18, which lists custom
1884 attributes on individual variables. */
1886 parse_variable_attributes (struct sfm_reader *r,
1887 const struct sfm_extension_record *record,
1888 struct dictionary *dict)
1890 struct text_record *text;
1891 struct variable *var;
1893 text = open_text_record (r, record, true);
1894 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1895 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1896 close_text_record (r, text);
1900 check_overflow (struct sfm_reader *r,
1901 const struct sfm_extension_record *record,
1902 size_t ofs, size_t length)
1904 size_t end = record->size * record->count;
1905 if (length >= end || ofs + length > end)
1906 sys_error (r, record->pos + end,
1907 _("Long string value label record ends unexpectedly."));
1911 parse_long_string_value_labels (struct sfm_reader *r,
1912 const struct sfm_extension_record *record,
1913 struct dictionary *dict)
1915 const char *dict_encoding = dict_get_encoding (dict);
1916 size_t end = record->size * record->count;
1923 struct variable *var;
1928 /* Parse variable name length. */
1929 check_overflow (r, record, ofs, 4);
1930 var_name_len = parse_int (r, record->data, ofs);
1933 /* Parse variable name, width, and number of labels. */
1934 check_overflow (r, record, ofs, var_name_len + 8);
1935 var_name = recode_string_pool ("UTF-8", dict_encoding,
1936 (const char *) record->data + ofs,
1937 var_name_len, r->pool);
1938 width = parse_int (r, record->data, ofs + var_name_len);
1939 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1940 ofs += var_name_len + 8;
1942 /* Look up 'var' and validate. */
1943 var = dict_lookup_var (dict, var_name);
1945 sys_warn (r, record->pos + ofs,
1946 _("Ignoring long string value record for "
1947 "unknown variable %s."), var_name);
1948 else if (var_is_numeric (var))
1950 sys_warn (r, record->pos + ofs,
1951 _("Ignoring long string value record for "
1952 "numeric variable %s."), var_name);
1955 else if (width != var_get_width (var))
1957 sys_warn (r, record->pos + ofs,
1958 _("Ignoring long string value record for variable %s "
1959 "because the record's width (%d) does not match the "
1960 "variable's width (%d)."),
1961 var_name, width, var_get_width (var));
1966 value_init_pool (r->pool, &value, width);
1967 for (i = 0; i < n_labels; i++)
1969 size_t value_length, label_length;
1970 bool skip = var == NULL;
1972 /* Parse value length. */
1973 check_overflow (r, record, ofs, 4);
1974 value_length = parse_int (r, record->data, ofs);
1978 check_overflow (r, record, ofs, value_length);
1981 if (value_length == width)
1982 memcpy (value_str_rw (&value, width),
1983 (const uint8_t *) record->data + ofs, width);
1986 sys_warn (r, record->pos + ofs,
1987 _("Ignoring long string value %zu for variable "
1988 "%s, with width %d, that has bad value "
1990 i, var_get_name (var), width, value_length);
1994 ofs += value_length;
1996 /* Parse label length. */
1997 check_overflow (r, record, ofs, 4);
1998 label_length = parse_int (r, record->data, ofs);
2002 check_overflow (r, record, ofs, label_length);
2007 label = recode_string_pool ("UTF-8", dict_encoding,
2008 (const char *) record->data + ofs,
2009 label_length, r->pool);
2010 if (!var_add_value_label (var, &value, label))
2011 sys_warn (r, record->pos + ofs,
2012 _("Duplicate value label for `%.*s' on %s."),
2013 width, value_str (&value, width),
2014 var_get_name (var));
2015 pool_free (r->pool, label);
2017 ofs += label_length;
2024 static void partial_record (struct sfm_reader *r)
2027 static void read_error (struct casereader *, const struct sfm_reader *);
2029 static bool read_case_number (struct sfm_reader *, double *);
2030 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
2031 static int read_opcode (struct sfm_reader *);
2032 static bool read_compressed_number (struct sfm_reader *, double *);
2033 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
2034 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2035 static bool skip_whole_strings (struct sfm_reader *, size_t);
2037 /* Reads and returns one case from READER's file. Returns a null
2038 pointer if not successful. */
2039 static struct ccase *
2040 sys_file_casereader_read (struct casereader *reader, void *r_)
2042 struct sfm_reader *r = r_;
2043 struct ccase *volatile c;
2049 c = case_create (r->proto);
2050 if (setjmp (r->bail_out))
2052 casereader_force_error (reader);
2057 for (i = 0; i < r->sfm_var_cnt; i++)
2059 struct sfm_var *sv = &r->sfm_vars[i];
2060 union value *v = case_data_rw_idx (c, sv->case_index);
2062 if (sv->var_width == 0)
2064 if (!read_case_number (r, &v->f))
2069 uint8_t *s = value_str_rw (v, sv->var_width);
2070 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2072 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2081 if (r->case_cnt != -1)
2082 read_error (reader, r);
2087 /* Issues an error that R ends in a partial record. */
2089 partial_record (struct sfm_reader *r)
2091 sys_error (r, r->pos, _("File ends in partial case."));
2094 /* Issues an error that an unspecified error occurred SFM, and
2097 read_error (struct casereader *r, const struct sfm_reader *sfm)
2099 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2100 casereader_force_error (r);
2103 /* Reads a number from R and stores its value in *D.
2104 If R is compressed, reads a compressed number;
2105 otherwise, reads a number in the regular way.
2106 Returns true if successful, false if end of file is
2107 reached immediately. */
2109 read_case_number (struct sfm_reader *r, double *d)
2114 if (!try_read_bytes (r, number, sizeof number))
2116 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2120 return read_compressed_number (r, d);
2123 /* Reads LENGTH string bytes from R into S.
2124 Always reads a multiple of 8 bytes; if LENGTH is not a
2125 multiple of 8, then extra bytes are read and discarded without
2127 Reads compressed strings if S is compressed.
2128 Returns true if successful, false if end of file is
2129 reached immediately. */
2131 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2133 size_t whole = ROUND_DOWN (length, 8);
2134 size_t partial = length % 8;
2138 if (!read_whole_strings (r, s, whole))
2145 if (!read_whole_strings (r, bounce, sizeof bounce))
2151 memcpy (s + whole, bounce, partial);
2157 /* Reads and returns the next compression opcode from R. */
2159 read_opcode (struct sfm_reader *r)
2161 assert (r->compressed);
2165 if (r->opcode_idx >= sizeof r->opcodes)
2167 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2171 opcode = r->opcodes[r->opcode_idx++];
2178 /* Reads a compressed number from R and stores its value in D.
2179 Returns true if successful, false if end of file is
2180 reached immediately. */
2182 read_compressed_number (struct sfm_reader *r, double *d)
2184 int opcode = read_opcode (r);
2192 *d = read_float (r);
2196 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2197 if (!r->corruption_warning)
2199 r->corruption_warning = true;
2200 sys_warn (r, r->pos,
2201 _("Possible compressed data corruption: "
2202 "compressed spaces appear in numeric field."));
2211 *d = opcode - r->bias;
2218 /* Reads a compressed 8-byte string segment from R and stores it
2220 Returns true if successful, false if end of file is
2221 reached immediately. */
2223 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2225 int opcode = read_opcode (r);
2233 read_bytes (r, dst, 8);
2237 memset (dst, ' ', 8);
2242 double value = opcode - r->bias;
2243 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2246 /* This has actually been seen "in the wild". The submitter of the
2247 file that showed that the contents decoded as spaces, but they
2248 were at the end of the field so it's possible that the null
2249 bytes just acted as null terminators. */
2251 else if (!r->corruption_warning)
2253 r->corruption_warning = true;
2254 sys_warn (r, r->pos,
2255 _("Possible compressed data corruption: "
2256 "string contains compressed integer (opcode %d)."),
2266 /* Reads LENGTH string bytes from R into S.
2267 LENGTH must be a multiple of 8.
2268 Reads compressed strings if S is compressed.
2269 Returns true if successful, false if end of file is
2270 reached immediately. */
2272 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2274 assert (length % 8 == 0);
2276 return try_read_bytes (r, s, length);
2280 for (ofs = 0; ofs < length; ofs += 8)
2281 if (!read_compressed_string (r, s + ofs))
2291 /* Skips LENGTH string bytes from R.
2292 LENGTH must be a multiple of 8.
2293 (LENGTH is also limited to 1024, but that's only because the
2294 current caller never needs more than that many bytes.)
2295 Returns true if successful, false if end of file is
2296 reached immediately. */
2298 skip_whole_strings (struct sfm_reader *r, size_t length)
2300 uint8_t buffer[1024];
2301 assert (length < sizeof buffer);
2302 return read_whole_strings (r, buffer, length);
2305 /* Helpers for reading records that contain structured text
2308 /* Maximum number of warnings to issue for a single text
2310 #define MAX_TEXT_WARNINGS 5
2315 struct substring buffer; /* Record contents. */
2316 off_t start; /* Starting offset in file. */
2317 size_t pos; /* Current position in buffer. */
2318 int n_warnings; /* Number of warnings issued or suppressed. */
2319 bool recoded; /* Recoded into UTF-8? */
2322 static struct text_record *
2323 open_text_record (struct sfm_reader *r,
2324 const struct sfm_extension_record *record,
2325 bool recode_to_utf8)
2327 struct text_record *text;
2328 struct substring raw;
2330 text = pool_alloc (r->pool, sizeof *text);
2331 raw = ss_buffer (record->data, record->size * record->count);
2332 text->start = record->pos;
2333 text->buffer = (recode_to_utf8
2334 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2337 text->n_warnings = 0;
2338 text->recoded = recode_to_utf8;
2343 /* Closes TEXT, frees its storage, and issues a final warning
2344 about suppressed warnings if necesary. */
2346 close_text_record (struct sfm_reader *r, struct text_record *text)
2348 if (text->n_warnings > MAX_TEXT_WARNINGS)
2349 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2350 text->n_warnings - MAX_TEXT_WARNINGS);
2352 pool_free (r->pool, ss_data (text->buffer));
2355 /* Reads a variable=value pair from TEXT.
2356 Looks up the variable in DICT and stores it into *VAR.
2357 Stores a null-terminated value into *VALUE. */
2359 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2360 struct text_record *text,
2361 struct variable **var, char **value)
2365 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2368 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2372 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2373 ss_buffer ("\t\0", 2));
2381 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2382 struct text_record *text, struct substring delimiters,
2383 struct variable **var)
2387 name = text_get_token (text, delimiters, NULL);
2391 *var = dict_lookup_var (dict, name);
2395 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2402 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2403 struct text_record *text, struct substring delimiters,
2404 struct variable **var)
2406 char *short_name = text_get_token (text, delimiters, NULL);
2407 if (short_name == NULL)
2410 *var = dict_lookup_var (dict, short_name);
2412 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2417 /* Displays a warning for the current file position, limiting the
2418 number to MAX_TEXT_WARNINGS for TEXT. */
2420 text_warn (struct sfm_reader *r, struct text_record *text,
2421 const char *format, ...)
2423 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2427 va_start (args, format);
2428 sys_msg (r, text->start + text->pos, MW, format, args);
2434 text_get_token (struct text_record *text, struct substring delimiters,
2437 struct substring token;
2440 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2443 end = &ss_data (token)[ss_length (token)];
2444 if (delimiter != NULL)
2447 return ss_data (token);
2450 /* Reads a integer value expressed in decimal, then a space, then a string that
2451 consists of exactly as many bytes as specified by the integer, then a space,
2452 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2453 buffer (so the caller should not free the string). */
2455 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2463 while (text->pos < text->buffer.length)
2465 int c = text->buffer.string[text->pos];
2466 if (c < '0' || c > '9')
2468 n = (n * 10) + (c - '0');
2471 if (text->pos >= text->buffer.length || start == text->pos)
2473 sys_warn (r, text->start,
2474 _("Expecting digit at offset %zu in MRSETS record."),
2479 if (!text_match (text, ' '))
2481 sys_warn (r, text->start,
2482 _("Expecting space at offset %zu in MRSETS record."),
2487 if (text->pos + n > text->buffer.length)
2489 sys_warn (r, text->start,
2490 _("%zu-byte string starting at offset %zu "
2491 "exceeds record length %zu."),
2492 n, text->pos, text->buffer.length);
2496 s = &text->buffer.string[text->pos];
2499 sys_warn (r, text->start,
2500 _("Expecting space at offset %zu following %zu-byte string."),
2510 text_match (struct text_record *text, char c)
2512 if (text->buffer.string[text->pos] == c)
2521 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2522 inside the TEXT's string. */
2524 text_pos (const struct text_record *text)
2531 /* Displays a corruption message. */
2533 sys_msg (struct sfm_reader *r, off_t offset,
2534 int class, const char *format, va_list args)
2539 ds_init_empty (&text);
2541 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2542 fh_get_file_name (r->fh), (long long int) offset);
2544 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2545 ds_put_vformat (&text, format, args);
2547 m.category = msg_class_to_category (class);
2548 m.severity = msg_class_to_severity (class);
2554 m.text = ds_cstr (&text);
2559 /* Displays a warning for offset OFFSET in the file. */
2561 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2565 va_start (args, format);
2566 sys_msg (r, offset, MW, format, args);
2570 /* Displays an error for the current file position,
2571 marks it as in an error state,
2572 and aborts reading it using longjmp. */
2574 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2578 va_start (args, format);
2579 sys_msg (r, offset, ME, format, args);
2583 longjmp (r->bail_out, 1);
2586 /* Reads BYTE_CNT bytes into BUF.
2587 Returns true if exactly BYTE_CNT bytes are successfully read.
2588 Aborts if an I/O error or a partial read occurs.
2589 If EOF_IS_OK, then an immediate end-of-file causes false to be
2590 returned; otherwise, immediate end-of-file causes an abort
2593 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2594 void *buf, size_t byte_cnt)
2596 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2597 r->pos += bytes_read;
2598 if (bytes_read == byte_cnt)
2600 else if (ferror (r->file))
2601 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2602 else if (!eof_is_ok || bytes_read != 0)
2603 sys_error (r, r->pos, _("Unexpected end of file."));
2608 /* Reads BYTE_CNT into BUF.
2609 Aborts upon I/O error or if end-of-file is encountered. */
2611 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2613 read_bytes_internal (r, false, buf, byte_cnt);
2616 /* Reads BYTE_CNT bytes into BUF.
2617 Returns true if exactly BYTE_CNT bytes are successfully read.
2618 Returns false if an immediate end-of-file is encountered.
2619 Aborts if an I/O error or a partial read occurs. */
2621 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2623 return read_bytes_internal (r, true, buf, byte_cnt);
2626 /* Reads a 32-bit signed integer from R and returns its value in
2629 read_int (struct sfm_reader *r)
2632 read_bytes (r, integer, sizeof integer);
2633 return integer_get (r->integer_format, integer, sizeof integer);
2636 /* Reads a 64-bit floating-point number from R and returns its
2637 value in host format. */
2639 read_float (struct sfm_reader *r)
2642 read_bytes (r, number, sizeof number);
2643 return float_get_double (r->float_format, number);
2647 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2649 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2653 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2655 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2658 /* Reads exactly SIZE - 1 bytes into BUFFER
2659 and stores a null byte into BUFFER[SIZE - 1]. */
2661 read_string (struct sfm_reader *r, char *buffer, size_t size)
2664 read_bytes (r, buffer, size - 1);
2665 buffer[size - 1] = '\0';
2668 /* Skips BYTES bytes forward in R. */
2670 skip_bytes (struct sfm_reader *r, size_t bytes)
2675 size_t chunk = MIN (sizeof buffer, bytes);
2676 read_bytes (r, buffer, chunk);
2681 static const struct casereader_class sys_file_casereader_class =
2683 sys_file_casereader_read,
2684 sys_file_casereader_destroy,