1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
95 int missing_value_code;
101 struct sfm_value_label
107 struct sfm_value_label_record
110 struct sfm_value_label *labels;
117 struct sfm_document_record
124 struct sfm_extension_record
126 off_t pos; /* Starting offset in file. */
127 size_t size; /* Size of data elements. */
128 size_t count; /* Number of data elements. */
129 void *data; /* Contents. */
132 /* System file reader. */
135 /* Resource tracking. */
136 struct pool *pool; /* All system file state. */
137 jmp_buf bail_out; /* longjmp() target for error handling. */
140 struct file_handle *fh; /* File handle. */
141 struct fh_lock *lock; /* Mutual exclusion for file handle. */
142 FILE *file; /* File stream. */
143 off_t pos; /* Position in file. */
144 bool error; /* I/O or corruption error? */
145 struct caseproto *proto; /* Format of output cases. */
148 enum integer_format integer_format; /* On-disk integer format. */
149 enum float_format float_format; /* On-disk floating point format. */
150 struct sfm_var *sfm_vars; /* Variables. */
151 size_t sfm_var_cnt; /* Number of variables. */
152 casenumber case_cnt; /* Number of cases */
153 const char *encoding; /* String encoding. */
156 bool compressed; /* File is compressed? */
157 double bias; /* Compression bias, usually 100.0. */
158 uint8_t opcodes[8]; /* Current block of opcodes. */
159 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
160 bool corruption_warning; /* Warned about possible corruption? */
163 static const struct casereader_class sys_file_casereader_class;
165 static bool close_reader (struct sfm_reader *);
167 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
168 const struct sfm_var_record *,
171 static void sys_msg (struct sfm_reader *r, off_t, int class,
172 const char *format, va_list args)
173 PRINTF_FORMAT (4, 0);
174 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
175 PRINTF_FORMAT (3, 4);
176 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
180 static void read_bytes (struct sfm_reader *, void *, size_t);
181 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
182 static int read_int (struct sfm_reader *);
183 static double read_float (struct sfm_reader *);
184 static void read_string (struct sfm_reader *, char *, size_t);
185 static void skip_bytes (struct sfm_reader *, size_t);
187 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
188 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
190 static void read_variable_record (struct sfm_reader *,
191 struct sfm_var_record *);
192 static void read_value_label_record (struct sfm_reader *,
193 struct sfm_value_label_record *,
195 static struct sfm_document_record *read_document_record (struct sfm_reader *);
196 static struct sfm_extension_record *read_extension_record (
197 struct sfm_reader *, int subtype);
198 static void skip_extension_record (struct sfm_reader *, int subtype);
200 static const char *choose_encoding (
202 const struct sfm_extension_record *ext_integer,
203 const struct sfm_extension_record *ext_encoding);
205 static struct text_record *open_text_record (
206 struct sfm_reader *, const struct sfm_extension_record *,
207 bool recode_to_utf8);
208 static void close_text_record (struct sfm_reader *,
209 struct text_record *);
210 static bool read_variable_to_value_pair (struct sfm_reader *,
212 struct text_record *,
213 struct variable **var, char **value);
214 static void text_warn (struct sfm_reader *r, struct text_record *text,
215 const char *format, ...)
216 PRINTF_FORMAT (3, 4);
217 static char *text_get_token (struct text_record *,
218 struct substring delimiters, char *delimiter);
219 static bool text_match (struct text_record *, char c);
220 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
221 struct text_record *,
222 struct substring delimiters,
224 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
225 struct text_record *,
226 struct substring delimiters,
228 static const char *text_parse_counted_string (struct sfm_reader *,
229 struct text_record *);
230 static size_t text_pos (const struct text_record *);
232 static bool close_reader (struct sfm_reader *r);
234 /* Dictionary reader. */
242 static void read_header (struct sfm_reader *, int *weight_idx,
243 int *claimed_oct_cnt, struct sfm_read_info *,
245 static void parse_file_label (struct sfm_reader *, const char *file_label,
246 struct dictionary *);
247 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
248 struct sfm_var_record *, size_t n);
249 static void parse_format_spec (struct sfm_reader *, off_t pos,
250 unsigned int format, enum which_format,
251 struct variable *, int *format_warning_cnt);
252 static void parse_document (struct dictionary *, struct sfm_document_record *);
253 static void parse_display_parameters (struct sfm_reader *,
254 const struct sfm_extension_record *,
255 struct dictionary *);
256 static void parse_machine_integer_info (struct sfm_reader *,
257 const struct sfm_extension_record *,
258 struct sfm_read_info *);
259 static void parse_machine_float_info (struct sfm_reader *,
260 const struct sfm_extension_record *);
261 static void parse_mrsets (struct sfm_reader *,
262 const struct sfm_extension_record *,
263 struct dictionary *);
264 static void parse_long_var_name_map (struct sfm_reader *,
265 const struct sfm_extension_record *,
266 struct dictionary *);
267 static void parse_long_string_map (struct sfm_reader *,
268 const struct sfm_extension_record *,
269 struct dictionary *);
270 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
271 const struct sfm_var_record *,
273 const struct sfm_value_label_record *);
274 static void parse_data_file_attributes (struct sfm_reader *,
275 const struct sfm_extension_record *,
276 struct dictionary *);
277 static void parse_variable_attributes (struct sfm_reader *,
278 const struct sfm_extension_record *,
279 struct dictionary *);
280 static void parse_long_string_value_labels (struct sfm_reader *,
281 const struct sfm_extension_record *,
282 struct dictionary *);
284 /* Opens the system file designated by file handle FH for
285 reading. Reads the system file's dictionary into *DICT.
286 If INFO is non-null, then it receives additional info about the
289 sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
290 struct sfm_read_info *volatile info)
292 struct sfm_reader *volatile r = NULL;
293 struct sfm_read_info local_info;
295 struct sfm_var_record *vars;
296 size_t n_vars, allocated_vars;
298 struct sfm_value_label_record *labels;
299 size_t n_labels, allocated_labels;
301 struct sfm_document_record *document;
303 struct sfm_extension_record *extensions[32];
309 struct dictionary *dict = NULL;
312 /* Create and initialize reader. */
313 r = pool_create_container (struct sfm_reader, pool);
319 r->opcode_idx = sizeof r->opcodes;
320 r->corruption_warning = false;
322 /* TRANSLATORS: this fragment will be interpolated into
323 messages in fh_lock() that identify types of files. */
324 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
328 r->file = fn_open (fh_get_file_name (fh), "rb");
331 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
332 fh_get_file_name (r->fh), strerror (errno));
336 /* Initialize info. */
339 memset (info, 0, sizeof *info);
341 if (setjmp (r->bail_out))
345 read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
348 n_vars = allocated_vars = 0;
351 n_labels = allocated_labels = 0;
355 memset (extensions, 0, sizeof extensions);
365 read_int (r); /* Skip filler. */
372 if (n_vars >= allocated_vars)
373 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
375 read_variable_record (r, &vars[n_vars++]);
379 if (n_labels >= allocated_labels)
380 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
382 read_value_label_record (r, &labels[n_labels++], n_vars);
386 /* A Type 4 record is always immediately after a type 3 record,
387 so the code for type 3 records reads the type 4 record too. */
388 sys_error (r, r->pos, _("Misplaced type 4 record."));
391 if (document != NULL)
392 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
393 document = read_document_record (r);
397 subtype = read_int (r);
398 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
401 _("Unrecognized record type 7, subtype %d. Please "
402 "send a copy of this file, and the syntax which "
403 "created it to %s."),
404 subtype, PACKAGE_BUGREPORT);
405 skip_extension_record (r, subtype);
407 else if (extensions[subtype] != NULL)
410 _("Record type 7, subtype %d found here has the same "
411 "type as the record found near offset 0x%llx. "
412 "Please send a copy of this file, and the syntax "
413 "which created it to %s."),
414 subtype, (long long int) extensions[subtype]->pos,
416 skip_extension_record (r, subtype);
419 extensions[subtype] = read_extension_record (r, subtype);
423 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
428 /* Now actually parse what we read.
430 First, figure out the correct character encoding, because this determines
431 how the rest of the header data is to be interpreted. */
432 dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
433 extensions[EXT_ENCODING]));
434 r->encoding = dict_get_encoding (dict);
436 /* These records don't use variables at all. */
437 if (document != NULL)
438 parse_document (dict, document);
440 if (extensions[EXT_INTEGER] != NULL)
441 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
443 if (extensions[EXT_FLOAT] != NULL)
444 parse_machine_float_info (r, extensions[EXT_FLOAT]);
446 if (extensions[EXT_FILE_ATTRS] != NULL)
447 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
449 parse_file_label (r, file_label, dict);
451 /* Parse the variable records, the basis of almost everything else. */
452 parse_variable_records (r, dict, vars, n_vars);
454 /* Parse value labels and the weight variable immediately after the variable
455 records. These records use indexes into var_recs[], so we must parse them
456 before those indexes become invalidated by very long string variables. */
457 for (i = 0; i < n_labels; i++)
458 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
461 struct variable *weight_var;
463 weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
464 if (var_is_numeric (weight_var))
465 dict_set_weight (dict, weight_var);
467 sys_error (r, -1, _("Weighting variable must be numeric "
468 "(not string variable `%s')."),
469 var_get_name (weight_var));
472 if (extensions[EXT_DISPLAY] != NULL)
473 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
475 /* The following records use short names, so they need to be parsed before
476 parse_long_var_name_map() changes short names to long names. */
477 if (extensions[EXT_MRSETS] != NULL)
478 parse_mrsets (r, extensions[EXT_MRSETS], dict);
480 if (extensions[EXT_MRSETS2] != NULL)
481 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
483 if (extensions[EXT_LONG_STRINGS] != NULL)
484 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
486 /* Now rename variables to their long names. */
487 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
489 /* The following records use long names, so they need to follow renaming. */
490 if (extensions[EXT_VAR_ATTRS] != NULL)
491 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
493 if (extensions[EXT_LONG_LABELS] != NULL)
494 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
496 /* Warn if the actual amount of data per case differs from the
497 amount that the header claims. SPSS version 13 gets this
498 wrong when very long strings are involved, so don't warn in
500 if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
501 && info->version_major != 13)
502 sys_warn (r, -1, _("File header claims %d variable positions but "
503 "%zu were read from file."),
504 claimed_oct_cnt, n_vars);
506 /* Create an index of dictionary variable widths for
507 sfm_read_case to use. We cannot use the `struct variable's
508 from the dictionary we created, because the caller owns the
509 dictionary and may destroy or modify its variables. */
510 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
511 pool_register (r->pool, free, r->sfm_vars);
512 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
515 return casereader_create_sequential
517 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
518 &sys_file_casereader_class, r);
527 /* Closes a system file after we're done with it.
528 Returns true if an I/O error has occurred on READER, false
531 close_reader (struct sfm_reader *r)
540 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
542 msg (ME, _("Error closing system file `%s': %s."),
543 fh_get_file_name (r->fh), strerror (errno));
553 pool_destroy (r->pool);
558 /* Destroys READER. */
560 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
562 struct sfm_reader *r = r_;
566 /* Returns true if FILE is an SPSS system file,
569 sfm_detect (FILE *file)
573 if (fread (rec_type, 4, 1, file) != 1)
577 return !strcmp ("$FL2", rec_type);
580 /* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
581 system file is unweighted, or to the value index of the weight variable
582 otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
583 case that the file claims to have (although it is not always correct).
584 Initializes INFO with header information. Stores the file label as a string
585 in dictionary encoding into *FILE_LABELP. */
587 read_header (struct sfm_reader *r, int *weight_idx,
588 int *claimed_oct_cnt, struct sfm_read_info *info,
592 char eye_catcher[61];
593 uint8_t raw_layout_code[4];
595 char creation_date[10];
596 char creation_time[9];
598 struct substring product;
600 read_string (r, rec_type, sizeof rec_type);
601 read_string (r, eye_catcher, sizeof eye_catcher);
603 if (strcmp ("$FL2", rec_type) != 0)
604 sys_error (r, 0, _("This is not an SPSS system file."));
606 /* Identify integer format. */
607 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
608 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
610 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
612 || (r->integer_format != INTEGER_MSB_FIRST
613 && r->integer_format != INTEGER_LSB_FIRST))
614 sys_error (r, 64, _("This is not an SPSS system file."));
616 *claimed_oct_cnt = read_int (r);
617 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
618 *claimed_oct_cnt = -1;
620 r->compressed = read_int (r) != 0;
622 *weight_idx = read_int (r);
624 r->case_cnt = read_int (r);
625 if ( r->case_cnt > INT_MAX / 2)
628 /* Identify floating-point format and obtain compression bias. */
629 read_bytes (r, raw_bias, sizeof raw_bias);
630 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
632 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
634 if (memcmp (raw_bias, zero_bias, 8))
635 sys_warn (r, r->pos - 8,
636 _("Compression bias is not the usual "
637 "value of 100, or system file uses unrecognized "
638 "floating-point format."));
641 /* Some software is known to write all-zeros to this
642 field. Such software also writes floating-point
643 numbers in the format that we expect by default
644 (it seems that all software most likely does, in
645 reality), so don't warn in this case. */
648 if (r->integer_format == INTEGER_MSB_FIRST)
649 r->float_format = FLOAT_IEEE_DOUBLE_BE;
651 r->float_format = FLOAT_IEEE_DOUBLE_LE;
653 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
655 read_string (r, creation_date, sizeof creation_date);
656 read_string (r, creation_time, sizeof creation_time);
657 read_string (r, file_label, sizeof file_label);
660 strcpy (info->creation_date, creation_date);
661 strcpy (info->creation_time, creation_time);
662 info->integer_format = r->integer_format;
663 info->float_format = r->float_format;
664 info->compressed = r->compressed;
665 info->case_cnt = r->case_cnt;
667 product = ss_cstr (eye_catcher);
668 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
669 ss_trim (&product, ss_cstr (" "));
670 str_copy_buf_trunc (info->product, sizeof info->product,
671 ss_data (product), ss_length (product));
673 *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
676 /* Reads a variable (type 2) record from R into RECORD. */
678 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
680 int has_variable_label;
682 memset (record, 0, sizeof *record);
684 record->pos = r->pos;
685 record->width = read_int (r);
686 has_variable_label = read_int (r);
687 record->missing_value_code = read_int (r);
688 record->print_format = read_int (r);
689 record->write_format = read_int (r);
690 read_bytes (r, record->name, sizeof record->name);
692 if (has_variable_label == 1)
694 enum { MAX_LABEL_LEN = 255 };
695 size_t len, read_len;
699 /* Read up to MAX_LABEL_LEN bytes of label. */
700 read_len = MIN (MAX_LABEL_LEN, len);
701 record->label = xmalloc (read_len + 1);
702 read_string (r, record->label, read_len + 1);
704 /* Skip unread label bytes. */
705 skip_bytes (r, len - read_len);
707 /* Skip label padding up to multiple of 4 bytes. */
708 skip_bytes (r, ROUND_UP (len, 4) - len);
710 else if (has_variable_label != 0)
711 sys_error (r, record->pos,
712 _("Variable label indicator field is not 0 or 1."));
714 /* Set missing values. */
715 if (record->missing_value_code != 0)
717 int code = record->missing_value_code;
718 if (record->width == 0)
720 if (code < -3 || code > 3 || code == -1)
721 sys_error (r, record->pos,
722 _("Numeric missing value indicator field is not "
723 "-3, -2, 0, 1, 2, or 3."));
727 if (code < 1 || code > 3)
728 sys_error (r, record->pos,
729 _("String missing value indicator field is not "
733 read_bytes (r, record->missing, 8 * abs (code));
737 /* Reads value labels from R into RECORD. */
739 read_value_label_record (struct sfm_reader *r,
740 struct sfm_value_label_record *record,
745 /* Read type 3 record. */
746 record->pos = r->pos;
747 record->n_labels = read_int (r);
748 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
749 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
751 record->labels = pool_nmalloc (r->pool, record->n_labels,
752 sizeof *record->labels);
753 for (i = 0; i < record->n_labels; i++)
755 struct sfm_value_label *label = &record->labels[i];
756 unsigned char label_len;
759 read_bytes (r, label->value, sizeof label->value);
761 /* Read label length. */
762 read_bytes (r, &label_len, sizeof label_len);
763 padded_len = ROUND_UP (label_len + 1, 8);
765 /* Read label, padding. */
766 label->label = pool_malloc (r->pool, padded_len + 1);
767 read_bytes (r, label->label, padded_len - 1);
768 label->label[label_len] = '\0';
771 /* Read record type of type 4 record. */
772 if (read_int (r) != 4)
773 sys_error (r, r->pos - 4,
774 _("Variable index record (type 4) does not immediately "
775 "follow value label record (type 3) as it should."));
777 /* Read number of variables associated with value label from type 4
779 record->n_vars = read_int (r);
780 if (record->n_vars < 1 || record->n_vars > n_vars)
781 sys_error (r, r->pos - 4,
782 _("Number of variables associated with a value label (%zu) "
783 "is not between 1 and the number of variables (%zu)."),
784 record->n_vars, n_vars);
785 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
786 for (i = 0; i < record->n_vars; i++)
787 record->vars[i] = read_int (r);
790 /* Reads a document record from R and returns it. */
791 static struct sfm_document_record *
792 read_document_record (struct sfm_reader *r)
794 struct sfm_document_record *record;
797 record = pool_malloc (r->pool, sizeof *record);
798 record->pos = r->pos;
800 n_lines = read_int (r);
801 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
802 sys_error (r, record->pos,
803 _("Number of document lines (%d) "
804 "must be greater than 0 and less than %d."),
805 n_lines, INT_MAX / DOC_LINE_LENGTH);
807 record->n_lines = n_lines;
808 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
809 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
815 read_extension_record_header (struct sfm_reader *r, int subtype,
816 struct sfm_extension_record *record)
818 record->pos = r->pos;
819 record->size = read_int (r);
820 record->count = read_int (r);
822 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
823 allows an extra byte for a null terminator, used by some
824 extension processing routines. */
825 if (record->size != 0
826 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
827 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
830 /* Reads an extension record from R into RECORD. */
831 static struct sfm_extension_record *
832 read_extension_record (struct sfm_reader *r, int subtype)
834 struct extension_record_type
841 static const struct extension_record_type types[] =
843 /* Implemented record types. */
844 { EXT_INTEGER, 4, 8 },
846 { EXT_MRSETS, 1, 0 },
847 { EXT_DISPLAY, 4, 0 },
848 { EXT_LONG_NAMES, 1, 0 },
849 { EXT_LONG_STRINGS, 1, 0 },
850 { EXT_NCASES, 8, 2 },
851 { EXT_FILE_ATTRS, 1, 0 },
852 { EXT_VAR_ATTRS, 1, 0 },
853 { EXT_MRSETS2, 1, 0 },
854 { EXT_ENCODING, 1, 0 },
855 { EXT_LONG_LABELS, 1, 0 },
857 /* Ignored record types. */
858 { EXT_VAR_SETS, 0, 0 },
860 { EXT_DATA_ENTRY, 0, 0 },
863 const struct extension_record_type *type;
864 struct sfm_extension_record *record;
867 record = pool_malloc (r->pool, sizeof *record);
868 read_extension_record_header (r, subtype, record);
869 n_bytes = record->count * record->size;
871 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
872 if (subtype == type->subtype)
874 if (type->size > 0 && record->size != type->size)
875 sys_warn (r, record->pos,
876 _("Record type 7, subtype %d has bad size %zu "
877 "(expected %d)."), subtype, record->size, type->size);
878 else if (type->count > 0 && record->count != type->count)
879 sys_warn (r, record->pos,
880 _("Record type 7, subtype %d has bad count %zu "
881 "(expected %d)."), subtype, record->count, type->count);
882 else if (type->count == 0 && type->size == 0)
884 /* Ignore this record. */
888 char *data = pool_malloc (r->pool, n_bytes + 1);
889 data[n_bytes] = '\0';
892 read_bytes (r, record->data, n_bytes);
899 sys_warn (r, record->pos,
900 _("Unrecognized record type 7, subtype %d. Please send a "
901 "copy of this file, and the syntax which created it to %s."),
902 subtype, PACKAGE_BUGREPORT);
905 skip_bytes (r, n_bytes);
910 skip_extension_record (struct sfm_reader *r, int subtype)
912 struct sfm_extension_record record;
914 read_extension_record_header (r, subtype, &record);
915 skip_bytes (r, record.count * record.size);
919 parse_file_label (struct sfm_reader *r, const char *file_label,
920 struct dictionary *dict)
922 char *utf8_file_label;
923 size_t file_label_len;
925 utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
926 file_label, -1, r->pool);
927 file_label_len = strlen (utf8_file_label);
928 while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
930 utf8_file_label[file_label_len] = '\0';
931 dict_set_label (dict, utf8_file_label);
934 /* Reads a variable (type 2) record from R and adds the
935 corresponding variable to DICT.
936 Also skips past additional variable records for long string
939 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
940 struct sfm_var_record *var_recs, size_t n_var_recs)
942 const char *dict_encoding = dict_get_encoding (dict);
943 struct sfm_var_record *rec;
946 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
948 struct variable *var;
953 name = recode_string_pool ("UTF-8", dict_encoding,
954 rec->name, 8, r->pool);
955 name[strcspn (name, " ")] = '\0';
957 if (!dict_id_is_valid (dict, name, false)
958 || name[0] == '$' || name[0] == '#')
959 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
961 if (rec->width < 0 || rec->width > 255)
962 sys_error (r, rec->pos,
963 _("Bad width %d for variable %s."), rec->width, name);
965 var = rec->var = dict_create_var (dict, name, rec->width);
967 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
969 /* Set the short name the same as the long name. */
970 var_set_short_name (var, 0, name);
972 /* Get variable label, if any. */
977 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
978 rec->label, -1, r->pool);
979 var_set_label (var, utf8_label, false);
982 /* Set missing values. */
983 if (rec->missing_value_code != 0)
985 int width = var_get_width (var);
986 struct missing_values mv;
988 mv_init_pool (r->pool, &mv, width);
989 if (var_is_numeric (var))
991 bool has_range = rec->missing_value_code < 0;
992 int n_discrete = (has_range
993 ? rec->missing_value_code == -3
994 : rec->missing_value_code);
999 double low = parse_float (r, rec->missing, 0);
1000 double high = parse_float (r, rec->missing, 8);
1001 mv_add_range (&mv, low, high);
1005 for (i = 0; i < n_discrete; i++)
1007 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1015 value_init_pool (r->pool, &value, width);
1016 value_set_missing (&value, width);
1017 for (i = 0; i < rec->missing_value_code; i++)
1019 uint8_t *s = value_str_rw (&value, width);
1020 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1021 mv_add_str (&mv, s);
1024 var_set_missing_values (var, &mv);
1028 parse_format_spec (r, rec->pos + 12, rec->print_format,
1029 PRINT_FORMAT, var, &n_warnings);
1030 parse_format_spec (r, rec->pos + 16, rec->write_format,
1031 WRITE_FORMAT, var, &n_warnings);
1033 /* Account for values.
1034 Skip long string continuation records, if any. */
1035 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1036 for (i = 1; i < n_values; i++)
1037 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1038 sys_error (r, rec->pos, _("Missing string continuation record."));
1043 /* Translates the format spec from sysfile format to internal
1046 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1047 enum which_format which, struct variable *v,
1050 const int max_warnings = 8;
1051 uint8_t raw_type = format >> 16;
1052 uint8_t w = format >> 8;
1061 ok = (fmt_from_io (raw_type, &f.type)
1062 && fmt_check_output (&f)
1063 && fmt_check_width_compat (&f, var_get_width (v)));
1068 if (which == PRINT_FORMAT)
1069 var_set_print_format (v, &f);
1071 var_set_write_format (v, &f);
1073 else if (format == 0)
1075 /* Actually observed in the wild. No point in warning about it. */
1077 else if (++*n_warnings <= max_warnings)
1079 if (which == PRINT_FORMAT)
1080 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1082 var_get_name (v), var_get_width (v), format);
1084 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1086 var_get_name (v), var_get_width (v), format);
1088 if (*n_warnings == max_warnings)
1089 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1094 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1098 for (p = record->documents;
1099 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1100 p += DOC_LINE_LENGTH)
1102 struct substring line;
1104 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1105 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1106 ss_rtrim (&line, ss_cstr (" "));
1107 line.string[line.length] = '\0';
1109 dict_add_document_line (dict, line.string, false);
1115 /* Parses record type 7, subtype 3. */
1117 parse_machine_integer_info (struct sfm_reader *r,
1118 const struct sfm_extension_record *record,
1119 struct sfm_read_info *info)
1121 int float_representation, expected_float_format;
1122 int integer_representation, expected_integer_format;
1124 /* Save version info. */
1125 info->version_major = parse_int (r, record->data, 0);
1126 info->version_minor = parse_int (r, record->data, 4);
1127 info->version_revision = parse_int (r, record->data, 8);
1129 /* Check floating point format. */
1130 float_representation = parse_int (r, record->data, 16);
1131 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1132 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1133 expected_float_format = 1;
1134 else if (r->float_format == FLOAT_Z_LONG)
1135 expected_float_format = 2;
1136 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1137 expected_float_format = 3;
1140 if (float_representation != expected_float_format)
1141 sys_error (r, record->pos, _("Floating-point representation indicated by "
1142 "system file (%d) differs from expected (%d)."),
1143 float_representation, expected_float_format);
1145 /* Check integer format. */
1146 integer_representation = parse_int (r, record->data, 24);
1147 if (r->integer_format == INTEGER_MSB_FIRST)
1148 expected_integer_format = 1;
1149 else if (r->integer_format == INTEGER_LSB_FIRST)
1150 expected_integer_format = 2;
1153 if (integer_representation != expected_integer_format)
1154 sys_warn (r, record->pos,
1155 _("Integer format indicated by system file (%d) "
1156 "differs from expected (%d)."),
1157 integer_representation, expected_integer_format);
1162 choose_encoding (struct sfm_reader *r,
1163 const struct sfm_extension_record *ext_integer,
1164 const struct sfm_extension_record *ext_encoding)
1166 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1169 return ext_encoding->data;
1171 /* But EXT_INTEGER is better than nothing as a fallback. */
1174 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1175 const char *encoding;
1184 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1185 respectively. However, there are known to be many files in the wild
1186 with character code 2, yet have data which are clearly not ASCII.
1187 Therefore we ignore these values. */
1194 encoding = sys_get_encoding_from_codepage (codepage);
1195 if (encoding != NULL)
1201 return locale_charset ();
1204 /* Parses record type 7, subtype 4. */
1206 parse_machine_float_info (struct sfm_reader *r,
1207 const struct sfm_extension_record *record)
1209 double sysmis = parse_float (r, record->data, 0);
1210 double highest = parse_float (r, record->data, 8);
1211 double lowest = parse_float (r, record->data, 16);
1213 if (sysmis != SYSMIS)
1214 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1217 if (highest != HIGHEST)
1218 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1219 highest, "HIGHEST");
1221 if (lowest != LOWEST)
1222 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1226 /* Parses record type 7, subtype 7 or 19. */
1228 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1229 struct dictionary *dict)
1231 struct text_record *text;
1232 struct mrset *mrset;
1234 text = open_text_record (r, record, false);
1237 const char *counted = NULL;
1240 struct stringi_set var_names;
1241 size_t allocated_vars;
1245 mrset = xzalloc (sizeof *mrset);
1247 name = text_get_token (text, ss_cstr ("="), NULL);
1250 mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
1252 if (mrset->name[0] != '$')
1254 sys_warn (r, record->pos,
1255 _("`%s' does not begin with `$' at offset %zu "
1256 "in MRSETS record."), mrset->name, text_pos (text));
1260 if (text_match (text, 'C'))
1262 mrset->type = MRSET_MC;
1263 if (!text_match (text, ' '))
1265 sys_warn (r, record->pos,
1266 _("Missing space following `%c' at offset %zu "
1267 "in MRSETS record."), 'C', text_pos (text));
1271 else if (text_match (text, 'D'))
1273 mrset->type = MRSET_MD;
1274 mrset->cat_source = MRSET_VARLABELS;
1276 else if (text_match (text, 'E'))
1280 mrset->type = MRSET_MD;
1281 mrset->cat_source = MRSET_COUNTEDVALUES;
1282 if (!text_match (text, ' '))
1284 sys_warn (r, record->pos,
1285 _("Missing space following `%c' at offset %zu "
1286 "in MRSETS record."), 'E', text_pos (text));
1290 number = text_get_token (text, ss_cstr (" "), NULL);
1291 if (!strcmp (number, "11"))
1292 mrset->label_from_var_label = true;
1293 else if (strcmp (number, "1"))
1294 sys_warn (r, record->pos,
1295 _("Unexpected label source value `%s' following `E' "
1296 "at offset %zu in MRSETS record."),
1297 number, text_pos (text));
1301 sys_warn (r, record->pos,
1302 _("Missing `C', `D', or `E' at offset %zu "
1303 "in MRSETS record."),
1308 if (mrset->type == MRSET_MD)
1310 counted = text_parse_counted_string (r, text);
1311 if (counted == NULL)
1315 label = text_parse_counted_string (r, text);
1318 if (label[0] != '\0')
1319 mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
1321 stringi_set_init (&var_names);
1326 const char *raw_var_name;
1327 struct variable *var;
1330 raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1331 if (raw_var_name == NULL)
1333 sys_warn (r, record->pos,
1334 _("Missing new-line parsing variable names "
1335 "at offset %zu in MRSETS record."),
1339 var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
1341 var = dict_lookup_var (dict, var_name);
1347 if (!stringi_set_insert (&var_names, var_name))
1349 sys_warn (r, record->pos,
1350 _("Duplicate variable name %s "
1351 "at offset %zu in MRSETS record."),
1352 var_name, text_pos (text));
1358 if (mrset->label == NULL && mrset->label_from_var_label
1359 && var_has_label (var))
1360 mrset->label = xstrdup (var_get_label (var));
1363 && var_get_type (var) != var_get_type (mrset->vars[0]))
1365 sys_warn (r, record->pos,
1366 _("MRSET %s contains both string and "
1367 "numeric variables."), name);
1370 width = MIN (width, var_get_width (var));
1372 if (mrset->n_vars >= allocated_vars)
1373 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1374 sizeof *mrset->vars);
1375 mrset->vars[mrset->n_vars++] = var;
1377 while (delimiter != '\n');
1379 if (mrset->n_vars < 2)
1381 sys_warn (r, record->pos,
1382 _("MRSET %s has only %zu variables."), mrset->name,
1384 mrset_destroy (mrset);
1388 if (mrset->type == MRSET_MD)
1390 mrset->width = width;
1391 value_init (&mrset->counted, width);
1393 mrset->counted.f = strtod (counted, NULL);
1395 value_copy_str_rpad (&mrset->counted, width,
1396 (const uint8_t *) counted, ' ');
1399 dict_add_mrset (dict, mrset);
1401 stringi_set_destroy (&var_names);
1403 mrset_destroy (mrset);
1404 close_text_record (r, text);
1407 /* Read record type 7, subtype 11, which specifies how variables
1408 should be displayed in GUI environments. */
1410 parse_display_parameters (struct sfm_reader *r,
1411 const struct sfm_extension_record *record,
1412 struct dictionary *dict)
1414 bool includes_width;
1415 bool warned = false;
1420 n_vars = dict_get_var_cnt (dict);
1421 if (record->count == 3 * n_vars)
1422 includes_width = true;
1423 else if (record->count == 2 * n_vars)
1424 includes_width = false;
1427 sys_warn (r, record->pos,
1428 _("Extension 11 has bad count %zu (for %zu variables)."),
1429 record->count, n_vars);
1434 for (i = 0; i < n_vars; ++i)
1436 struct variable *v = dict_get_var (dict, i);
1437 int measure, width, align;
1439 measure = parse_int (r, record->data, ofs);
1444 width = parse_int (r, record->data, ofs);
1450 align = parse_int (r, record->data, ofs);
1453 /* SPSS 14 sometimes seems to set string variables' measure
1455 if (0 == measure && var_is_alpha (v))
1458 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1461 sys_warn (r, record->pos,
1462 _("Invalid variable display parameters for variable "
1463 "%zu (%s). Default parameters substituted."),
1464 i, var_get_name (v));
1469 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1470 : measure == 2 ? MEASURE_ORDINAL
1472 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1473 : align == 1 ? ALIGN_RIGHT
1476 /* Older versions (SPSS 9.0) sometimes set the display
1477 width to zero. This causes confusion in the GUI, so
1478 only set the width if it is nonzero. */
1480 var_set_display_width (v, width);
1485 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1486 const char *new_name)
1488 size_t n_short_names;
1492 /* Renaming a variable may clear its short names, but we
1493 want to retain them, so we save them and re-set them
1495 n_short_names = var_get_short_name_cnt (var);
1496 short_names = xnmalloc (n_short_names, sizeof *short_names);
1497 for (i = 0; i < n_short_names; i++)
1499 const char *s = var_get_short_name (var, i);
1500 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1503 /* Set long name. */
1504 dict_rename_var (dict, var, new_name);
1506 /* Restore short names. */
1507 for (i = 0; i < n_short_names; i++)
1509 var_set_short_name (var, i, short_names[i]);
1510 free (short_names[i]);
1515 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1516 to each short name. Modifies variable names in DICT accordingly. */
1518 parse_long_var_name_map (struct sfm_reader *r,
1519 const struct sfm_extension_record *record,
1520 struct dictionary *dict)
1522 struct text_record *text;
1523 struct variable *var;
1528 /* Convert variable names to lowercase. */
1531 for (i = 0; i < dict_get_var_cnt (dict); i++)
1533 struct variable *var = dict_get_var (dict, i);
1536 new_name = xstrdup (var_get_name (var));
1537 str_lowercase (new_name);
1539 rename_var_and_save_short_names (dict, var, new_name);
1547 /* Rename each of the variables, one by one. (In a correctly constructed
1548 system file, this cannot create any intermediate duplicate variable names,
1549 because all of the new variable names are longer than any of the old
1550 variable names and thus there cannot be any overlaps.) */
1551 text = open_text_record (r, record, true);
1552 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1554 /* Validate long name. */
1555 /* XXX need to reencode name to UTF-8 */
1556 if (!dict_id_is_valid (dict, long_name, false))
1558 sys_warn (r, record->pos,
1559 _("Long variable mapping from %s to invalid "
1560 "variable name `%s'."),
1561 var_get_name (var), long_name);
1565 /* Identify any duplicates. */
1566 if (strcasecmp (var_get_short_name (var, 0), long_name)
1567 && dict_lookup_var (dict, long_name) != NULL)
1569 sys_warn (r, record->pos,
1570 _("Duplicate long variable name `%s'."), long_name);
1574 rename_var_and_save_short_names (dict, var, long_name);
1576 close_text_record (r, text);
1579 /* Reads record type 7, subtype 14, which gives the real length
1580 of each very long string. Rearranges DICT accordingly. */
1582 parse_long_string_map (struct sfm_reader *r,
1583 const struct sfm_extension_record *record,
1584 struct dictionary *dict)
1586 struct text_record *text;
1587 struct variable *var;
1590 text = open_text_record (r, record, true);
1591 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1593 size_t idx = var_get_dict_index (var);
1599 length = strtol (length_s, NULL, 10);
1600 if (length < 1 || length > MAX_STRING)
1602 sys_warn (r, record->pos,
1603 _("%s listed as string of invalid length %s "
1604 "in very long string record."),
1605 var_get_name (var), length_s);
1609 /* Check segments. */
1610 segment_cnt = sfm_width_to_segments (length);
1611 if (segment_cnt == 1)
1613 sys_warn (r, record->pos,
1614 _("%s listed in very long string record with width %s, "
1615 "which requires only one segment."),
1616 var_get_name (var), length_s);
1619 if (idx + segment_cnt > dict_get_var_cnt (dict))
1620 sys_error (r, record->pos,
1621 _("Very long string %s overflows dictionary."),
1622 var_get_name (var));
1624 /* Get the short names from the segments and check their
1626 for (i = 0; i < segment_cnt; i++)
1628 struct variable *seg = dict_get_var (dict, idx + i);
1629 int alloc_width = sfm_segment_alloc_width (length, i);
1630 int width = var_get_width (seg);
1633 var_set_short_name (var, i, var_get_short_name (seg, 0));
1634 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1635 sys_error (r, record->pos,
1636 _("Very long string with width %ld has segment %d "
1637 "of width %d (expected %d)."),
1638 length, i, width, alloc_width);
1640 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1641 var_set_width (var, length);
1643 close_text_record (r, text);
1644 dict_compact_values (dict);
1648 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1649 const struct sfm_var_record *var_recs, size_t n_var_recs,
1650 const struct sfm_value_label_record *record)
1652 struct variable **vars;
1656 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1657 for (i = 0; i < record->n_labels; i++)
1658 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1659 record->labels[i].label, -1,
1662 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1663 for (i = 0; i < record->n_vars; i++)
1664 vars[i] = lookup_var_by_index (r, record->pos,
1665 var_recs, n_var_recs, record->vars[i]);
1667 for (i = 1; i < record->n_vars; i++)
1668 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1669 sys_error (r, record->pos,
1670 _("Variables associated with value label are not all of "
1671 "identical type. Variable %s is %s, but variable "
1673 var_get_name (vars[0]),
1674 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1675 var_get_name (vars[i]),
1676 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1678 for (i = 0; i < record->n_vars; i++)
1680 struct variable *var = vars[i];
1684 width = var_get_width (var);
1686 sys_error (r, record->pos,
1687 _("Value labels may not be added to long string "
1688 "variables (e.g. %s) using records types 3 and 4."),
1689 var_get_name (var));
1691 for (j = 0; j < record->n_labels; j++)
1693 struct sfm_value_label *label = &record->labels[j];
1696 value_init (&value, width);
1698 value.f = parse_float (r, label->value, 0);
1700 memcpy (value_str_rw (&value, width), label->value, width);
1702 if (!var_add_value_label (var, &value, utf8_labels[j]))
1704 if (var_is_numeric (var))
1705 sys_warn (r, record->pos,
1706 _("Duplicate value label for %g on %s."),
1707 value.f, var_get_name (var));
1709 sys_warn (r, record->pos,
1710 _("Duplicate value label for `%.*s' on %s."),
1711 width, value_str (&value, width),
1712 var_get_name (var));
1715 value_destroy (&value, width);
1719 pool_free (r->pool, vars);
1720 for (i = 0; i < record->n_labels; i++)
1721 pool_free (r->pool, utf8_labels[i]);
1722 pool_free (r->pool, utf8_labels);
1725 static struct variable *
1726 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1727 const struct sfm_var_record *var_recs, size_t n_var_recs,
1730 const struct sfm_var_record *rec;
1732 if (idx < 1 || idx > n_var_recs)
1734 sys_error (r, offset,
1735 _("Variable index %d not in valid range 1...%zu."),
1740 rec = &var_recs[idx - 1];
1741 if (rec->var == NULL)
1743 sys_error (r, offset,
1744 _("Variable index %d refers to long string continuation."),
1752 /* Parses a set of custom attributes from TEXT into ATTRS.
1753 ATTRS may be a null pointer, in which case the attributes are
1754 read but discarded. */
1756 parse_attributes (struct sfm_reader *r, struct text_record *text,
1757 struct attrset *attrs)
1761 struct attribute *attr;
1765 /* Parse the key. */
1766 key = text_get_token (text, ss_cstr ("("), NULL);
1770 attr = attribute_create (key);
1771 for (index = 1; ; index++)
1773 /* Parse the value. */
1777 value = text_get_token (text, ss_cstr ("\n"), NULL);
1780 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1785 length = strlen (value);
1786 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1788 value[length - 1] = '\0';
1789 attribute_add_value (attr, value + 1);
1794 _("Attribute value %s[%d] is not quoted: %s."),
1796 attribute_add_value (attr, value);
1799 /* Was this the last value for this attribute? */
1800 if (text_match (text, ')'))
1804 attrset_add (attrs, attr);
1806 attribute_destroy (attr);
1808 while (!text_match (text, '/'));
1811 /* Reads record type 7, subtype 17, which lists custom
1812 attributes on the data file. */
1814 parse_data_file_attributes (struct sfm_reader *r,
1815 const struct sfm_extension_record *record,
1816 struct dictionary *dict)
1818 struct text_record *text = open_text_record (r, record, true);
1819 parse_attributes (r, text, dict_get_attributes (dict));
1820 close_text_record (r, text);
1823 /* Parses record type 7, subtype 18, which lists custom
1824 attributes on individual variables. */
1826 parse_variable_attributes (struct sfm_reader *r,
1827 const struct sfm_extension_record *record,
1828 struct dictionary *dict)
1830 struct text_record *text;
1831 struct variable *var;
1833 text = open_text_record (r, record, true);
1834 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1835 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1836 close_text_record (r, text);
1840 check_overflow (struct sfm_reader *r,
1841 const struct sfm_extension_record *record,
1842 size_t ofs, size_t length)
1844 size_t end = record->size * record->count;
1845 if (length >= end || ofs + length > end)
1846 sys_error (r, record->pos + end,
1847 _("Long string value label record ends unexpectedly."));
1851 parse_long_string_value_labels (struct sfm_reader *r,
1852 const struct sfm_extension_record *record,
1853 struct dictionary *dict)
1855 const char *dict_encoding = dict_get_encoding (dict);
1856 size_t end = record->size * record->count;
1863 struct variable *var;
1868 /* Parse variable name length. */
1869 check_overflow (r, record, ofs, 4);
1870 var_name_len = parse_int (r, record->data, ofs);
1873 /* Parse variable name, width, and number of labels. */
1874 check_overflow (r, record, ofs, var_name_len + 8);
1875 var_name = recode_string_pool ("UTF-8", dict_encoding,
1876 (const char *) record->data + ofs,
1877 var_name_len, r->pool);
1878 width = parse_int (r, record->data, ofs + var_name_len);
1879 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1880 ofs += var_name_len + 8;
1882 /* Look up 'var' and validate. */
1883 var = dict_lookup_var (dict, var_name);
1885 sys_warn (r, record->pos + ofs,
1886 _("Ignoring long string value record for "
1887 "unknown variable %s."), var_name);
1888 else if (var_is_numeric (var))
1890 sys_warn (r, record->pos + ofs,
1891 _("Ignoring long string value record for "
1892 "numeric variable %s."), var_name);
1895 else if (width != var_get_width (var))
1897 sys_warn (r, record->pos + ofs,
1898 _("Ignoring long string value record for variable %s "
1899 "because the record's width (%d) does not match the "
1900 "variable's width (%d)."),
1901 var_name, width, var_get_width (var));
1906 value_init_pool (r->pool, &value, width);
1907 for (i = 0; i < n_labels; i++)
1909 size_t value_length, label_length;
1910 bool skip = var == NULL;
1912 /* Parse value length. */
1913 check_overflow (r, record, ofs, 4);
1914 value_length = parse_int (r, record->data, ofs);
1918 check_overflow (r, record, ofs, value_length);
1921 if (value_length == width)
1922 memcpy (value_str_rw (&value, width),
1923 (const uint8_t *) record->data + ofs, width);
1926 sys_warn (r, record->pos + ofs,
1927 _("Ignoring long string value %zu for variable "
1928 "%s, with width %d, that has bad value "
1930 i, var_get_name (var), width, value_length);
1934 ofs += value_length;
1936 /* Parse label length. */
1937 check_overflow (r, record, ofs, 4);
1938 label_length = parse_int (r, record->data, ofs);
1942 check_overflow (r, record, ofs, label_length);
1947 label = recode_string_pool ("UTF-8", dict_encoding,
1948 (const char *) record->data + ofs,
1949 label_length, r->pool);
1950 if (!var_add_value_label (var, &value, label))
1951 sys_warn (r, record->pos + ofs,
1952 _("Duplicate value label for `%.*s' on %s."),
1953 width, value_str (&value, width),
1954 var_get_name (var));
1955 pool_free (r->pool, label);
1957 ofs += label_length;
1964 static void partial_record (struct sfm_reader *r)
1967 static void read_error (struct casereader *, const struct sfm_reader *);
1969 static bool read_case_number (struct sfm_reader *, double *);
1970 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1971 static int read_opcode (struct sfm_reader *);
1972 static bool read_compressed_number (struct sfm_reader *, double *);
1973 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1974 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1975 static bool skip_whole_strings (struct sfm_reader *, size_t);
1977 /* Reads and returns one case from READER's file. Returns a null
1978 pointer if not successful. */
1979 static struct ccase *
1980 sys_file_casereader_read (struct casereader *reader, void *r_)
1982 struct sfm_reader *r = r_;
1983 struct ccase *volatile c;
1989 c = case_create (r->proto);
1990 if (setjmp (r->bail_out))
1992 casereader_force_error (reader);
1997 for (i = 0; i < r->sfm_var_cnt; i++)
1999 struct sfm_var *sv = &r->sfm_vars[i];
2000 union value *v = case_data_rw_idx (c, sv->case_index);
2002 if (sv->var_width == 0)
2004 if (!read_case_number (r, &v->f))
2009 uint8_t *s = value_str_rw (v, sv->var_width);
2010 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2012 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2021 if (r->case_cnt != -1)
2022 read_error (reader, r);
2027 /* Issues an error that R ends in a partial record. */
2029 partial_record (struct sfm_reader *r)
2031 sys_error (r, r->pos, _("File ends in partial case."));
2034 /* Issues an error that an unspecified error occurred SFM, and
2037 read_error (struct casereader *r, const struct sfm_reader *sfm)
2039 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2040 casereader_force_error (r);
2043 /* Reads a number from R and stores its value in *D.
2044 If R is compressed, reads a compressed number;
2045 otherwise, reads a number in the regular way.
2046 Returns true if successful, false if end of file is
2047 reached immediately. */
2049 read_case_number (struct sfm_reader *r, double *d)
2054 if (!try_read_bytes (r, number, sizeof number))
2056 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2060 return read_compressed_number (r, d);
2063 /* Reads LENGTH string bytes from R into S.
2064 Always reads a multiple of 8 bytes; if LENGTH is not a
2065 multiple of 8, then extra bytes are read and discarded without
2067 Reads compressed strings if S is compressed.
2068 Returns true if successful, false if end of file is
2069 reached immediately. */
2071 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2073 size_t whole = ROUND_DOWN (length, 8);
2074 size_t partial = length % 8;
2078 if (!read_whole_strings (r, s, whole))
2085 if (!read_whole_strings (r, bounce, sizeof bounce))
2091 memcpy (s + whole, bounce, partial);
2097 /* Reads and returns the next compression opcode from R. */
2099 read_opcode (struct sfm_reader *r)
2101 assert (r->compressed);
2105 if (r->opcode_idx >= sizeof r->opcodes)
2107 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2111 opcode = r->opcodes[r->opcode_idx++];
2118 /* Reads a compressed number from R and stores its value in D.
2119 Returns true if successful, false if end of file is
2120 reached immediately. */
2122 read_compressed_number (struct sfm_reader *r, double *d)
2124 int opcode = read_opcode (r);
2132 *d = read_float (r);
2136 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2137 if (!r->corruption_warning)
2139 r->corruption_warning = true;
2140 sys_warn (r, r->pos,
2141 _("Possible compressed data corruption: "
2142 "compressed spaces appear in numeric field."));
2151 *d = opcode - r->bias;
2158 /* Reads a compressed 8-byte string segment from R and stores it
2160 Returns true if successful, false if end of file is
2161 reached immediately. */
2163 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2165 int opcode = read_opcode (r);
2173 read_bytes (r, dst, 8);
2177 memset (dst, ' ', 8);
2182 double value = opcode - r->bias;
2183 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2186 /* This has actually been seen "in the wild". The submitter of the
2187 file that showed that the contents decoded as spaces, but they
2188 were at the end of the field so it's possible that the null
2189 bytes just acted as null terminators. */
2191 else if (!r->corruption_warning)
2193 r->corruption_warning = true;
2194 sys_warn (r, r->pos,
2195 _("Possible compressed data corruption: "
2196 "string contains compressed integer (opcode %d)."),
2206 /* Reads LENGTH string bytes from R into S.
2207 LENGTH must be a multiple of 8.
2208 Reads compressed strings if S is compressed.
2209 Returns true if successful, false if end of file is
2210 reached immediately. */
2212 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2214 assert (length % 8 == 0);
2216 return try_read_bytes (r, s, length);
2220 for (ofs = 0; ofs < length; ofs += 8)
2221 if (!read_compressed_string (r, s + ofs))
2231 /* Skips LENGTH string bytes from R.
2232 LENGTH must be a multiple of 8.
2233 (LENGTH is also limited to 1024, but that's only because the
2234 current caller never needs more than that many bytes.)
2235 Returns true if successful, false if end of file is
2236 reached immediately. */
2238 skip_whole_strings (struct sfm_reader *r, size_t length)
2240 uint8_t buffer[1024];
2241 assert (length < sizeof buffer);
2242 return read_whole_strings (r, buffer, length);
2245 /* Helpers for reading records that contain structured text
2248 /* Maximum number of warnings to issue for a single text
2250 #define MAX_TEXT_WARNINGS 5
2255 struct substring buffer; /* Record contents. */
2256 off_t start; /* Starting offset in file. */
2257 size_t pos; /* Current position in buffer. */
2258 int n_warnings; /* Number of warnings issued or suppressed. */
2259 bool recoded; /* Recoded into UTF-8? */
2262 static struct text_record *
2263 open_text_record (struct sfm_reader *r,
2264 const struct sfm_extension_record *record,
2265 bool recode_to_utf8)
2267 struct text_record *text;
2268 struct substring raw;
2270 text = pool_alloc (r->pool, sizeof *text);
2271 raw = ss_buffer (record->data, record->size * record->count);
2272 text->start = record->pos;
2273 text->buffer = (recode_to_utf8
2274 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2277 text->n_warnings = 0;
2278 text->recoded = recode_to_utf8;
2283 /* Closes TEXT, frees its storage, and issues a final warning
2284 about suppressed warnings if necesary. */
2286 close_text_record (struct sfm_reader *r, struct text_record *text)
2288 if (text->n_warnings > MAX_TEXT_WARNINGS)
2289 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2290 text->n_warnings - MAX_TEXT_WARNINGS);
2292 pool_free (r->pool, ss_data (text->buffer));
2295 /* Reads a variable=value pair from TEXT.
2296 Looks up the variable in DICT and stores it into *VAR.
2297 Stores a null-terminated value into *VALUE. */
2299 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2300 struct text_record *text,
2301 struct variable **var, char **value)
2305 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2308 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2312 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2313 ss_buffer ("\t\0", 2));
2321 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2322 struct text_record *text, struct substring delimiters,
2323 struct variable **var)
2327 name = text_get_token (text, delimiters, NULL);
2331 *var = dict_lookup_var (dict, name);
2335 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2342 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2343 struct text_record *text, struct substring delimiters,
2344 struct variable **var)
2346 char *short_name = text_get_token (text, delimiters, NULL);
2347 if (short_name == NULL)
2350 *var = dict_lookup_var (dict, short_name);
2352 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2357 /* Displays a warning for the current file position, limiting the
2358 number to MAX_TEXT_WARNINGS for TEXT. */
2360 text_warn (struct sfm_reader *r, struct text_record *text,
2361 const char *format, ...)
2363 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2367 va_start (args, format);
2368 sys_msg (r, text->start + text->pos, MW, format, args);
2374 text_get_token (struct text_record *text, struct substring delimiters,
2377 struct substring token;
2380 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2383 end = &ss_data (token)[ss_length (token)];
2384 if (delimiter != NULL)
2387 return ss_data (token);
2390 /* Reads a integer value expressed in decimal, then a space, then a string that
2391 consists of exactly as many bytes as specified by the integer, then a space,
2392 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2393 buffer (so the caller should not free the string). */
2395 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2405 int c = text->buffer.string[text->pos];
2406 if (c < '0' || c > '9')
2408 n = (n * 10) + (c - '0');
2411 if (start == text->pos)
2413 sys_warn (r, text->start,
2414 _("Expecting digit at offset %zu in MRSETS record."),
2419 if (!text_match (text, ' '))
2421 sys_warn (r, text->start,
2422 _("Expecting space at offset %zu in MRSETS record."),
2427 if (text->pos + n > text->buffer.length)
2429 sys_warn (r, text->start,
2430 _("%zu-byte string starting at offset %zu "
2431 "exceeds record length %zu."),
2432 n, text->pos, text->buffer.length);
2436 s = &text->buffer.string[text->pos];
2439 sys_warn (r, text->start,
2440 _("Expecting space at offset %zu following %zu-byte string."),
2450 text_match (struct text_record *text, char c)
2452 if (text->buffer.string[text->pos] == c)
2461 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
2462 inside the TEXT's string. */
2464 text_pos (const struct text_record *text)
2471 /* Displays a corruption message. */
2473 sys_msg (struct sfm_reader *r, off_t offset,
2474 int class, const char *format, va_list args)
2479 ds_init_empty (&text);
2481 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2482 fh_get_file_name (r->fh), (long long int) offset);
2484 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2485 ds_put_vformat (&text, format, args);
2487 m.category = msg_class_to_category (class);
2488 m.severity = msg_class_to_severity (class);
2494 m.text = ds_cstr (&text);
2499 /* Displays a warning for offset OFFSET in the file. */
2501 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2505 va_start (args, format);
2506 sys_msg (r, offset, MW, format, args);
2510 /* Displays an error for the current file position,
2511 marks it as in an error state,
2512 and aborts reading it using longjmp. */
2514 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2518 va_start (args, format);
2519 sys_msg (r, offset, ME, format, args);
2523 longjmp (r->bail_out, 1);
2526 /* Reads BYTE_CNT bytes into BUF.
2527 Returns true if exactly BYTE_CNT bytes are successfully read.
2528 Aborts if an I/O error or a partial read occurs.
2529 If EOF_IS_OK, then an immediate end-of-file causes false to be
2530 returned; otherwise, immediate end-of-file causes an abort
2533 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2534 void *buf, size_t byte_cnt)
2536 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2537 r->pos += bytes_read;
2538 if (bytes_read == byte_cnt)
2540 else if (ferror (r->file))
2541 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2542 else if (!eof_is_ok || bytes_read != 0)
2543 sys_error (r, r->pos, _("Unexpected end of file."));
2548 /* Reads BYTE_CNT into BUF.
2549 Aborts upon I/O error or if end-of-file is encountered. */
2551 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2553 read_bytes_internal (r, false, buf, byte_cnt);
2556 /* Reads BYTE_CNT bytes into BUF.
2557 Returns true if exactly BYTE_CNT bytes are successfully read.
2558 Returns false if an immediate end-of-file is encountered.
2559 Aborts if an I/O error or a partial read occurs. */
2561 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2563 return read_bytes_internal (r, true, buf, byte_cnt);
2566 /* Reads a 32-bit signed integer from R and returns its value in
2569 read_int (struct sfm_reader *r)
2572 read_bytes (r, integer, sizeof integer);
2573 return integer_get (r->integer_format, integer, sizeof integer);
2576 /* Reads a 64-bit floating-point number from R and returns its
2577 value in host format. */
2579 read_float (struct sfm_reader *r)
2582 read_bytes (r, number, sizeof number);
2583 return float_get_double (r->float_format, number);
2587 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2589 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2593 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2595 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2598 /* Reads exactly SIZE - 1 bytes into BUFFER
2599 and stores a null byte into BUFFER[SIZE - 1]. */
2601 read_string (struct sfm_reader *r, char *buffer, size_t size)
2604 read_bytes (r, buffer, size - 1);
2605 buffer[size - 1] = '\0';
2608 /* Skips BYTES bytes forward in R. */
2610 skip_bytes (struct sfm_reader *r, size_t bytes)
2615 size_t chunk = MIN (sizeof buffer, bytes);
2616 read_bytes (r, buffer, chunk);
2621 static const struct casereader_class sys_file_casereader_class =
2623 sys_file_casereader_read,
2624 sys_file_casereader_destroy,