1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/identifier.h"
37 #include "data/missing-values.h"
38 #include "data/mrset.h"
39 #include "data/short-names.h"
40 #include "data/value-labels.h"
41 #include "data/value.h"
42 #include "data/variable.h"
43 #include "libpspp/array.h"
44 #include "libpspp/assertion.h"
45 #include "libpspp/compiler.h"
46 #include "libpspp/i18n.h"
47 #include "libpspp/message.h"
48 #include "libpspp/misc.h"
49 #include "libpspp/pool.h"
50 #include "libpspp/str.h"
51 #include "libpspp/stringi-set.h"
53 #include "gl/c-ctype.h"
54 #include "gl/inttostr.h"
55 #include "gl/localcharset.h"
56 #include "gl/minmax.h"
57 #include "gl/unlocked-io.h"
58 #include "gl/xalloc.h"
62 #define _(msgid) gettext (msgid)
63 #define N_(msgid) (msgid)
67 /* subtypes 0-2 unknown */
68 EXT_INTEGER = 3, /* Machine integer info. */
69 EXT_FLOAT = 4, /* Machine floating-point info. */
70 EXT_VAR_SETS = 5, /* Variable sets. */
71 EXT_DATE = 6, /* DATE. */
72 EXT_MRSETS = 7, /* Multiple response sets. */
73 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
74 /* subtypes 9-10 unknown */
75 EXT_DISPLAY = 11, /* Variable display parameters. */
76 /* subtype 12 unknown */
77 EXT_LONG_NAMES = 13, /* Long variable names. */
78 EXT_LONG_STRINGS = 14, /* Long strings. */
79 /* subtype 15 unknown */
80 EXT_NCASES = 16, /* Extended number of cases. */
81 EXT_FILE_ATTRS = 17, /* Data file attributes. */
82 EXT_VAR_ATTRS = 18, /* Variable attributes. */
83 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
84 EXT_ENCODING = 20, /* Character encoding. */
85 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
95 int missing_value_code;
101 struct sfm_value_label
107 struct sfm_value_label_record
110 struct sfm_value_label *labels;
117 struct sfm_document_record
124 struct sfm_extension_record
126 off_t pos; /* Starting offset in file. */
127 size_t size; /* Size of data elements. */
128 size_t count; /* Number of data elements. */
129 void *data; /* Contents. */
132 /* System file reader. */
135 /* Resource tracking. */
136 struct pool *pool; /* All system file state. */
137 jmp_buf bail_out; /* longjmp() target for error handling. */
140 struct file_handle *fh; /* File handle. */
141 struct fh_lock *lock; /* Mutual exclusion for file handle. */
142 FILE *file; /* File stream. */
143 off_t pos; /* Position in file. */
144 bool error; /* I/O or corruption error? */
145 struct caseproto *proto; /* Format of output cases. */
148 enum integer_format integer_format; /* On-disk integer format. */
149 enum float_format float_format; /* On-disk floating point format. */
150 struct sfm_var *sfm_vars; /* Variables. */
151 size_t sfm_var_cnt; /* Number of variables. */
152 casenumber case_cnt; /* Number of cases */
153 const char *encoding; /* String encoding. */
156 bool compressed; /* File is compressed? */
157 double bias; /* Compression bias, usually 100.0. */
158 uint8_t opcodes[8]; /* Current block of opcodes. */
159 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
160 bool corruption_warning; /* Warned about possible corruption? */
163 static const struct casereader_class sys_file_casereader_class;
165 static bool close_reader (struct sfm_reader *);
167 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
168 const struct sfm_var_record *,
171 static void sys_msg (struct sfm_reader *r, off_t, int class,
172 const char *format, va_list args)
173 PRINTF_FORMAT (4, 0);
174 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
175 PRINTF_FORMAT (3, 4);
176 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
180 static void read_bytes (struct sfm_reader *, void *, size_t);
181 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
182 static int read_int (struct sfm_reader *);
183 static double read_float (struct sfm_reader *);
184 static void read_string (struct sfm_reader *, char *, size_t);
185 static void skip_bytes (struct sfm_reader *, size_t);
187 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
188 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
190 static void read_variable_record (struct sfm_reader *,
191 struct sfm_var_record *);
192 static void read_value_label_record (struct sfm_reader *,
193 struct sfm_value_label_record *,
195 static struct sfm_document_record *read_document_record (struct sfm_reader *);
196 static struct sfm_extension_record *read_extension_record (
197 struct sfm_reader *, int subtype);
198 static void skip_extension_record (struct sfm_reader *, int subtype);
200 static const char *choose_encoding (
202 const struct sfm_extension_record *ext_integer,
203 const struct sfm_extension_record *ext_encoding);
205 static struct text_record *open_text_record (
206 struct sfm_reader *, const struct sfm_extension_record *);
207 static void close_text_record (struct sfm_reader *,
208 struct text_record *);
209 static bool read_variable_to_value_pair (struct sfm_reader *,
211 struct text_record *,
212 struct variable **var, char **value);
213 static void text_warn (struct sfm_reader *r, struct text_record *text,
214 const char *format, ...)
215 PRINTF_FORMAT (3, 4);
216 static char *text_get_token (struct text_record *,
217 struct substring delimiters, char *delimiter);
218 static bool text_match (struct text_record *, char c);
219 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
220 struct text_record *,
221 struct substring delimiters,
223 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
224 struct text_record *,
225 struct substring delimiters,
227 static const char *text_parse_counted_string (struct sfm_reader *,
228 struct text_record *);
229 static size_t text_pos (const struct text_record *);
231 static bool close_reader (struct sfm_reader *r);
233 /* Dictionary reader. */
241 static void read_header (struct sfm_reader *, int *weight_idx,
242 int *claimed_oct_cnt, struct sfm_read_info *,
244 static void parse_file_label (struct sfm_reader *, const char *file_label,
245 struct dictionary *);
246 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
247 struct sfm_var_record *, size_t n);
248 static void parse_format_spec (struct sfm_reader *, off_t pos,
249 unsigned int format, enum which_format,
250 struct variable *, int *format_warning_cnt);
251 static void parse_document (struct dictionary *, struct sfm_document_record *);
252 static void parse_display_parameters (struct sfm_reader *,
253 const struct sfm_extension_record *,
254 struct dictionary *);
255 static void parse_machine_integer_info (struct sfm_reader *,
256 const struct sfm_extension_record *,
257 struct sfm_read_info *);
258 static void parse_machine_float_info (struct sfm_reader *,
259 const struct sfm_extension_record *);
260 static void parse_mrsets (struct sfm_reader *,
261 const struct sfm_extension_record *,
262 struct dictionary *);
263 static void parse_long_var_name_map (struct sfm_reader *,
264 const struct sfm_extension_record *,
265 struct dictionary *);
266 static void parse_long_string_map (struct sfm_reader *,
267 const struct sfm_extension_record *,
268 struct dictionary *);
269 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
270 const struct sfm_var_record *,
272 const struct sfm_value_label_record *);
273 static void parse_data_file_attributes (struct sfm_reader *,
274 const struct sfm_extension_record *,
275 struct dictionary *);
276 static void parse_variable_attributes (struct sfm_reader *,
277 const struct sfm_extension_record *,
278 struct dictionary *);
279 static void parse_long_string_value_labels (struct sfm_reader *,
280 const struct sfm_extension_record *,
281 struct dictionary *);
283 /* Opens the system file designated by file handle FH for
284 reading. Reads the system file's dictionary into *DICT.
285 If INFO is non-null, then it receives additional info about the
288 sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
289 struct sfm_read_info *volatile info)
291 struct sfm_reader *volatile r = NULL;
292 struct sfm_read_info local_info;
294 struct sfm_var_record *vars;
295 size_t n_vars, allocated_vars;
297 struct sfm_value_label_record *labels;
298 size_t n_labels, allocated_labels;
300 struct sfm_document_record *document;
302 struct sfm_extension_record *extensions[32];
308 struct dictionary *dict = NULL;
311 /* Create and initialize reader. */
312 r = pool_create_container (struct sfm_reader, pool);
318 r->opcode_idx = sizeof r->opcodes;
319 r->corruption_warning = false;
321 /* TRANSLATORS: this fragment will be interpolated into
322 messages in fh_lock() that identify types of files. */
323 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
327 r->file = fn_open (fh_get_file_name (fh), "rb");
330 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
331 fh_get_file_name (r->fh), strerror (errno));
335 /* Initialize info. */
338 memset (info, 0, sizeof *info);
340 if (setjmp (r->bail_out))
344 read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
347 n_vars = allocated_vars = 0;
350 n_labels = allocated_labels = 0;
354 memset (extensions, 0, sizeof extensions);
364 read_int (r); /* Skip filler. */
371 if (n_vars >= allocated_vars)
372 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
374 read_variable_record (r, &vars[n_vars++]);
378 if (n_labels >= allocated_labels)
379 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
381 read_value_label_record (r, &labels[n_labels++], n_vars);
385 /* A Type 4 record is always immediately after a type 3 record,
386 so the code for type 3 records reads the type 4 record too. */
387 sys_error (r, r->pos, _("Misplaced type 4 record."));
390 if (document != NULL)
391 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
392 document = read_document_record (r);
396 subtype = read_int (r);
397 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
400 _("Unrecognized record type 7, subtype %d. Please "
401 "send a copy of this file, and the syntax which "
402 "created it to %s."),
403 subtype, PACKAGE_BUGREPORT);
404 skip_extension_record (r, subtype);
406 else if (extensions[subtype] != NULL)
409 _("Record type 7, subtype %d found here has the same "
410 "type as the record found near offset 0x%llx. "
411 "Please send a copy of this file, and the syntax "
412 "which created it to %s."),
413 subtype, (long long int) extensions[subtype]->pos,
415 skip_extension_record (r, subtype);
418 extensions[subtype] = read_extension_record (r, subtype);
422 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
427 /* Now actually parse what we read.
429 First, figure out the correct character encoding, because this determines
430 how the rest of the header data is to be interpreted. */
431 dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
432 extensions[EXT_ENCODING]));
433 r->encoding = dict_get_encoding (dict);
435 /* These records don't use variables at all. */
436 if (document != NULL)
437 parse_document (dict, document);
439 if (extensions[EXT_INTEGER] != NULL)
440 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
442 if (extensions[EXT_FLOAT] != NULL)
443 parse_machine_float_info (r, extensions[EXT_FLOAT]);
445 if (extensions[EXT_FILE_ATTRS] != NULL)
446 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
448 parse_file_label (r, file_label, dict);
450 /* Parse the variable records, the basis of almost everything else. */
451 parse_variable_records (r, dict, vars, n_vars);
453 /* Parse value labels and the weight variable immediately after the variable
454 records. These records use indexes into var_recs[], so we must parse them
455 before those indexes become invalidated by very long string variables. */
456 for (i = 0; i < n_labels; i++)
457 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
460 struct variable *weight_var;
462 weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
463 if (var_is_numeric (weight_var))
464 dict_set_weight (dict, weight_var);
466 sys_error (r, -1, _("Weighting variable must be numeric "
467 "(not string variable `%s')."),
468 var_get_name (weight_var));
471 if (extensions[EXT_DISPLAY] != NULL)
472 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
474 /* The following records use short names, so they need to be parsed before
475 parse_long_var_name_map() changes short names to long names. */
476 if (extensions[EXT_MRSETS] != NULL)
477 parse_mrsets (r, extensions[EXT_MRSETS], dict);
479 if (extensions[EXT_MRSETS2] != NULL)
480 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
482 if (extensions[EXT_LONG_STRINGS] != NULL)
483 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
485 /* Now rename variables to their long names. */
486 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
488 /* The following records use long names, so they need to follow renaming. */
489 if (extensions[EXT_VAR_ATTRS] != NULL)
490 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
492 if (extensions[EXT_LONG_LABELS] != NULL)
493 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
495 /* Warn if the actual amount of data per case differs from the
496 amount that the header claims. SPSS version 13 gets this
497 wrong when very long strings are involved, so don't warn in
499 if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
500 && info->version_major != 13)
501 sys_warn (r, -1, _("File header claims %d variable positions but "
502 "%d were read from file."),
503 claimed_oct_cnt, n_vars);
505 /* Create an index of dictionary variable widths for
506 sfm_read_case to use. We cannot use the `struct variable's
507 from the dictionary we created, because the caller owns the
508 dictionary and may destroy or modify its variables. */
509 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
510 pool_register (r->pool, free, r->sfm_vars);
511 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
514 return casereader_create_sequential
516 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
517 &sys_file_casereader_class, r);
526 /* Closes a system file after we're done with it.
527 Returns true if an I/O error has occurred on READER, false
530 close_reader (struct sfm_reader *r)
539 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
541 msg (ME, _("Error closing system file `%s': %s."),
542 fh_get_file_name (r->fh), strerror (errno));
552 pool_destroy (r->pool);
557 /* Destroys READER. */
559 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
561 struct sfm_reader *r = r_;
565 /* Returns true if FILE is an SPSS system file,
568 sfm_detect (FILE *file)
572 if (fread (rec_type, 4, 1, file) != 1)
576 return !strcmp ("$FL2", rec_type);
579 /* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
580 system file is unweighted, or to the value index of the weight variable
581 otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
582 case that the file claims to have (although it is not always correct).
583 Initializes INFO with header information. Stores the file label as a string
584 in dictionary encoding into *FILE_LABELP. */
586 read_header (struct sfm_reader *r, int *weight_idx,
587 int *claimed_oct_cnt, struct sfm_read_info *info,
591 char eye_catcher[61];
592 uint8_t raw_layout_code[4];
594 char creation_date[10];
595 char creation_time[9];
597 struct substring product;
599 read_string (r, rec_type, sizeof rec_type);
600 read_string (r, eye_catcher, sizeof eye_catcher);
602 if (strcmp ("$FL2", rec_type) != 0)
603 sys_error (r, 0, _("This is not an SPSS system file."));
605 /* Identify integer format. */
606 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
607 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
609 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
611 || (r->integer_format != INTEGER_MSB_FIRST
612 && r->integer_format != INTEGER_LSB_FIRST))
613 sys_error (r, 64, _("This is not an SPSS system file."));
615 *claimed_oct_cnt = read_int (r);
616 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
617 *claimed_oct_cnt = -1;
619 r->compressed = read_int (r) != 0;
621 *weight_idx = read_int (r);
623 r->case_cnt = read_int (r);
624 if ( r->case_cnt > INT_MAX / 2)
627 /* Identify floating-point format and obtain compression bias. */
628 read_bytes (r, raw_bias, sizeof raw_bias);
629 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
631 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
633 if (memcmp (raw_bias, zero_bias, 8))
634 sys_warn (r, r->pos - 8,
635 _("Compression bias is not the usual "
636 "value of 100, or system file uses unrecognized "
637 "floating-point format."));
640 /* Some software is known to write all-zeros to this
641 field. Such software also writes floating-point
642 numbers in the format that we expect by default
643 (it seems that all software most likely does, in
644 reality), so don't warn in this case. */
647 if (r->integer_format == INTEGER_MSB_FIRST)
648 r->float_format = FLOAT_IEEE_DOUBLE_BE;
650 r->float_format = FLOAT_IEEE_DOUBLE_LE;
652 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
654 read_string (r, creation_date, sizeof creation_date);
655 read_string (r, creation_time, sizeof creation_time);
656 read_string (r, file_label, sizeof file_label);
659 strcpy (info->creation_date, creation_date);
660 strcpy (info->creation_time, creation_time);
661 info->integer_format = r->integer_format;
662 info->float_format = r->float_format;
663 info->compressed = r->compressed;
664 info->case_cnt = r->case_cnt;
666 product = ss_cstr (eye_catcher);
667 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
668 ss_trim (&product, ss_cstr (" "));
669 str_copy_buf_trunc (info->product, sizeof info->product,
670 ss_data (product), ss_length (product));
672 *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
675 /* Reads a variable (type 2) record from R into RECORD. */
677 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
679 int has_variable_label;
681 memset (record, 0, sizeof *record);
683 record->pos = r->pos;
684 record->width = read_int (r);
685 has_variable_label = read_int (r);
686 record->missing_value_code = read_int (r);
687 record->print_format = read_int (r);
688 record->write_format = read_int (r);
689 read_bytes (r, record->name, sizeof record->name);
691 if (has_variable_label == 1)
693 enum { MAX_LABEL_LEN = 255 };
694 size_t len, read_len;
698 /* Read up to MAX_LABEL_LEN bytes of label. */
699 read_len = MIN (MAX_LABEL_LEN, len);
700 record->label = xmalloc (read_len + 1);
701 read_string (r, record->label, read_len + 1);
703 /* Skip unread label bytes. */
704 skip_bytes (r, len - read_len);
706 /* Skip label padding up to multiple of 4 bytes. */
707 skip_bytes (r, ROUND_UP (len, 4) - len);
709 else if (has_variable_label != 0)
710 sys_error (r, record->pos,
711 _("Variable label indicator field is not 0 or 1."));
713 /* Set missing values. */
714 if (record->missing_value_code != 0)
716 int code = record->missing_value_code;
717 if (record->width == 0)
719 if (code < -3 || code > 3 || code == -1)
720 sys_error (r, record->pos,
721 _("Numeric missing value indicator field is not "
722 "-3, -2, 0, 1, 2, or 3."));
726 if (code < 1 || code > 3)
727 sys_error (r, record->pos,
728 _("String missing value indicator field is not "
732 read_bytes (r, record->missing, 8 * abs (code));
736 /* Reads value labels from R into RECORD. */
738 read_value_label_record (struct sfm_reader *r,
739 struct sfm_value_label_record *record,
744 /* Read type 3 record. */
745 record->pos = r->pos;
746 record->n_labels = read_int (r);
747 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
748 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
750 record->labels = pool_nmalloc (r->pool, record->n_labels,
751 sizeof *record->labels);
752 for (i = 0; i < record->n_labels; i++)
754 struct sfm_value_label *label = &record->labels[i];
755 unsigned char label_len;
758 read_bytes (r, label->value, sizeof label->value);
760 /* Read label length. */
761 read_bytes (r, &label_len, sizeof label_len);
762 padded_len = ROUND_UP (label_len + 1, 8);
764 /* Read label, padding. */
765 label->label = pool_malloc (r->pool, padded_len + 1);
766 read_bytes (r, label->label, padded_len - 1);
767 label->label[label_len] = '\0';
770 /* Read record type of type 4 record. */
771 if (read_int (r) != 4)
772 sys_error (r, r->pos - 4,
773 _("Variable index record (type 4) does not immediately "
774 "follow value label record (type 3) as it should."));
776 /* Read number of variables associated with value label from type 4
778 record->n_vars = read_int (r);
779 if (record->n_vars < 1 || record->n_vars > n_vars)
780 sys_error (r, r->pos - 4,
781 _("Number of variables associated with a value label (%d) "
782 "is not between 1 and the number of variables (%zu)."),
783 record->n_vars, n_vars);
784 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
785 for (i = 0; i < record->n_vars; i++)
786 record->vars[i] = read_int (r);
789 /* Reads a document record from R and returns it. */
790 static struct sfm_document_record *
791 read_document_record (struct sfm_reader *r)
793 struct sfm_document_record *record;
796 record = pool_malloc (r->pool, sizeof *record);
797 record->pos = r->pos;
799 n_lines = read_int (r);
800 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
801 sys_error (r, record->pos,
802 _("Number of document lines (%d) "
803 "must be greater than 0 and less than %d."),
804 n_lines, INT_MAX / DOC_LINE_LENGTH);
806 record->n_lines = n_lines;
807 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
808 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
814 read_extension_record_header (struct sfm_reader *r, int subtype,
815 struct sfm_extension_record *record)
817 record->pos = r->pos;
818 record->size = read_int (r);
819 record->count = read_int (r);
821 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
822 allows an extra byte for a null terminator, used by some
823 extension processing routines. */
824 if (record->size != 0
825 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
826 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
829 /* Reads an extension record from R into RECORD. */
830 static struct sfm_extension_record *
831 read_extension_record (struct sfm_reader *r, int subtype)
833 struct extension_record_type
840 static const struct extension_record_type types[] =
842 /* Implemented record types. */
843 { EXT_INTEGER, 4, 8 },
845 { EXT_MRSETS, 1, 0 },
846 { EXT_DISPLAY, 4, 0 },
847 { EXT_LONG_NAMES, 1, 0 },
848 { EXT_LONG_STRINGS, 1, 0 },
849 { EXT_NCASES, 8, 2 },
850 { EXT_FILE_ATTRS, 1, 0 },
851 { EXT_VAR_ATTRS, 1, 0 },
852 { EXT_MRSETS2, 1, 0 },
853 { EXT_ENCODING, 1, 0 },
854 { EXT_LONG_LABELS, 1, 0 },
856 /* Ignored record types. */
857 { EXT_VAR_SETS, 0, 0 },
859 { EXT_DATA_ENTRY, 0, 0 },
862 const struct extension_record_type *type;
863 struct sfm_extension_record *record;
866 record = pool_malloc (r->pool, sizeof *record);
867 read_extension_record_header (r, subtype, record);
868 n_bytes = record->count * record->size;
870 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
871 if (subtype == type->subtype)
873 if (type->size > 0 && record->size != type->size)
874 sys_warn (r, record->pos,
875 _("Record type 7, subtype %d has bad size %zu "
876 "(expected %d)."), subtype, record->size, type->size);
877 else if (type->count > 0 && record->count != type->count)
878 sys_warn (r, record->pos,
879 _("Record type 7, subtype %d has bad count %zu "
880 "(expected %d)."), subtype, record->count, type->count);
881 else if (type->count == 0 && type->size == 0)
883 /* Ignore this record. */
887 char *data = pool_malloc (r->pool, n_bytes + 1);
888 data[n_bytes] = '\0';
891 read_bytes (r, record->data, n_bytes);
898 sys_warn (r, record->pos,
899 _("Unrecognized record type 7, subtype %d. Please send a "
900 "copy of this file, and the syntax which created it to %s."),
901 subtype, PACKAGE_BUGREPORT);
904 skip_bytes (r, n_bytes);
909 skip_extension_record (struct sfm_reader *r, int subtype)
911 struct sfm_extension_record record;
913 read_extension_record_header (r, subtype, &record);
914 skip_bytes (r, record.count * record.size);
918 parse_file_label (struct sfm_reader *r, const char *file_label,
919 struct dictionary *dict)
921 char *utf8_file_label;
922 size_t file_label_len;
924 utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
925 file_label, -1, r->pool);
926 file_label_len = strlen (utf8_file_label);
927 while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
929 utf8_file_label[file_label_len] = '\0';
930 dict_set_label (dict, utf8_file_label);
933 /* Reads a variable (type 2) record from R and adds the
934 corresponding variable to DICT.
935 Also skips past additional variable records for long string
938 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
939 struct sfm_var_record *var_recs, size_t n_var_recs)
941 const char *dict_encoding = dict_get_encoding (dict);
942 struct sfm_var_record *rec;
945 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
947 struct variable *var;
952 name = recode_string_pool ("UTF-8", dict_encoding,
953 rec->name, 8, r->pool);
954 name[strcspn (name, " ")] = '\0';
956 if (!dict_id_is_valid (dict, name, false)
957 || name[0] == '$' || name[0] == '#')
958 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
960 if (rec->width < 0 || rec->width > 255)
961 sys_error (r, rec->pos,
962 _("Bad width %d for variable %s."), rec->width, name);
964 var = rec->var = dict_create_var (dict, name, rec->width);
966 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
968 /* Set the short name the same as the long name. */
969 var_set_short_name (var, 0, name);
971 /* Get variable label, if any. */
976 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
977 rec->label, -1, r->pool);
978 var_set_label (var, utf8_label, false);
981 /* Set missing values. */
982 if (rec->missing_value_code != 0)
984 int width = var_get_width (var);
985 struct missing_values mv;
987 mv_init_pool (r->pool, &mv, width);
988 if (var_is_numeric (var))
990 bool has_range = rec->missing_value_code < 0;
991 int n_discrete = (has_range
992 ? rec->missing_value_code == -3
993 : rec->missing_value_code);
998 double low = parse_float (r, rec->missing, 0);
999 double high = parse_float (r, rec->missing, 8);
1000 mv_add_range (&mv, low, high);
1004 for (i = 0; i < n_discrete; i++)
1006 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1014 value_init_pool (r->pool, &value, width);
1015 value_set_missing (&value, width);
1016 for (i = 0; i < rec->missing_value_code; i++)
1018 uint8_t *s = value_str_rw (&value, width);
1019 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1020 mv_add_str (&mv, s);
1023 var_set_missing_values (var, &mv);
1027 parse_format_spec (r, rec->pos + 12, rec->print_format,
1028 PRINT_FORMAT, var, &n_warnings);
1029 parse_format_spec (r, rec->pos + 16, rec->write_format,
1030 WRITE_FORMAT, var, &n_warnings);
1032 /* Account for values.
1033 Skip long string continuation records, if any. */
1034 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1035 for (i = 1; i < n_values; i++)
1036 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1037 sys_error (r, rec->pos, _("Missing string continuation record."));
1042 /* Translates the format spec from sysfile format to internal
1045 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1046 enum which_format which, struct variable *v,
1049 const int max_warnings = 8;
1050 uint8_t raw_type = format >> 16;
1051 uint8_t w = format >> 8;
1057 if (!fmt_from_io (raw_type, &f.type))
1058 sys_error (r, pos, _("Unknown variable format %"PRIu8"."), raw_type);
1063 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
1068 if (which == PRINT_FORMAT)
1069 var_set_print_format (v, &f);
1071 var_set_write_format (v, &f);
1073 else if (++*n_warnings <= max_warnings)
1075 char fmt_string[FMT_STRING_LEN_MAX + 1];
1076 sys_warn (r, pos, _("%s variable %s has invalid %s format %s."),
1077 var_is_numeric (v) ? _("Numeric") : _("String"),
1079 which == PRINT_FORMAT ? _("print") : _("write"),
1080 fmt_to_string (&f, fmt_string));
1082 if (*n_warnings == max_warnings)
1083 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1088 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1092 for (p = record->documents;
1093 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1094 p += DOC_LINE_LENGTH)
1096 struct substring line;
1098 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1099 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1100 ss_rtrim (&line, ss_cstr (" "));
1101 line.string[line.length] = '\0';
1103 dict_add_document_line (dict, line.string, false);
1109 /* Parses record type 7, subtype 3. */
1111 parse_machine_integer_info (struct sfm_reader *r,
1112 const struct sfm_extension_record *record,
1113 struct sfm_read_info *info)
1115 int float_representation, expected_float_format;
1116 int integer_representation, expected_integer_format;
1118 /* Save version info. */
1119 info->version_major = parse_int (r, record->data, 0);
1120 info->version_minor = parse_int (r, record->data, 4);
1121 info->version_revision = parse_int (r, record->data, 8);
1123 /* Check floating point format. */
1124 float_representation = parse_int (r, record->data, 16);
1125 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1126 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1127 expected_float_format = 1;
1128 else if (r->float_format == FLOAT_Z_LONG)
1129 expected_float_format = 2;
1130 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1131 expected_float_format = 3;
1134 if (float_representation != expected_float_format)
1135 sys_error (r, record->pos, _("Floating-point representation indicated by "
1136 "system file (%d) differs from expected (%d)."),
1137 float_representation, expected_float_format);
1139 /* Check integer format. */
1140 integer_representation = parse_int (r, record->data, 24);
1141 if (r->integer_format == INTEGER_MSB_FIRST)
1142 expected_integer_format = 1;
1143 else if (r->integer_format == INTEGER_LSB_FIRST)
1144 expected_integer_format = 2;
1147 if (integer_representation != expected_integer_format)
1148 sys_warn (r, record->pos,
1149 _("Integer format indicated by system file (%d) "
1150 "differs from expected (%d)."),
1151 integer_representation, expected_integer_format);
1156 choose_encoding (struct sfm_reader *r,
1157 const struct sfm_extension_record *ext_integer,
1158 const struct sfm_extension_record *ext_encoding)
1160 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1163 return ext_encoding->data;
1165 /* But EXT_INTEGER is better than nothing as a fallback. */
1168 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1177 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1178 respectively. However, there are known to be many files in the wild
1179 with character code 2, yet have data which are clearly not ASCII.
1180 Therefore we ignore these values. */
1193 return pool_asprintf (r->pool, "CP%d", codepage);
1197 return locale_charset ();
1200 /* Parses record type 7, subtype 4. */
1202 parse_machine_float_info (struct sfm_reader *r,
1203 const struct sfm_extension_record *record)
1205 double sysmis = parse_float (r, record->data, 0);
1206 double highest = parse_float (r, record->data, 8);
1207 double lowest = parse_float (r, record->data, 16);
1209 if (sysmis != SYSMIS)
1210 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1213 if (highest != HIGHEST)
1214 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1215 highest, "HIGHEST");
1217 if (lowest != LOWEST)
1218 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1222 /* Parses record type 7, subtype 7 or 19. */
1224 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1225 struct dictionary *dict)
1227 struct text_record *text;
1228 struct mrset *mrset;
1230 text = open_text_record (r, record);
1233 const char *counted = NULL;
1236 struct stringi_set var_names;
1237 size_t allocated_vars;
1241 mrset = xzalloc (sizeof *mrset);
1243 name = text_get_token (text, ss_cstr ("="), NULL);
1246 mrset->name = xstrdup (name);
1248 if (mrset->name[0] != '$')
1250 sys_warn (r, record->pos,
1251 _("`%s' does not begin with `$' at UTF-8 offset %zu "
1252 "in MRSETS record."), mrset->name, text_pos (text));
1256 if (text_match (text, 'C'))
1258 mrset->type = MRSET_MC;
1259 if (!text_match (text, ' '))
1261 sys_warn (r, record->pos,
1262 _("Missing space following `%c' at UTF-8 offset %zu "
1263 "in MRSETS record."), 'C', text_pos (text));
1267 else if (text_match (text, 'D'))
1269 mrset->type = MRSET_MD;
1270 mrset->cat_source = MRSET_VARLABELS;
1272 else if (text_match (text, 'E'))
1276 mrset->type = MRSET_MD;
1277 mrset->cat_source = MRSET_COUNTEDVALUES;
1278 if (!text_match (text, ' '))
1280 sys_warn (r, record->pos,
1281 _("Missing space following `%c' at UTF-8 offset %zu "
1282 "in MRSETS record."), 'E', text_pos (text));
1286 number = text_get_token (text, ss_cstr (" "), NULL);
1287 if (!strcmp (number, "11"))
1288 mrset->label_from_var_label = true;
1289 else if (strcmp (number, "1"))
1290 sys_warn (r, record->pos,
1291 _("Unexpected label source value `%s' following `E' "
1292 "at UTF-8 offset %zu in MRSETS record."),
1293 number, text_pos (text));
1297 sys_warn (r, record->pos,
1298 _("Missing `C', `D', or `E' at UTF-8 offset %zu "
1299 "in MRSETS record."),
1304 if (mrset->type == MRSET_MD)
1306 counted = text_parse_counted_string (r, text);
1307 if (counted == NULL)
1311 label = text_parse_counted_string (r, text);
1314 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
1316 stringi_set_init (&var_names);
1321 struct variable *var;
1322 const char *var_name;
1324 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1325 if (var_name == NULL)
1327 sys_warn (r, record->pos,
1328 _("Missing new-line parsing variable names "
1329 "at UTF-8 offset %zu in MRSETS record."),
1334 var = dict_lookup_var (dict, var_name);
1337 if (!stringi_set_insert (&var_names, var_name))
1339 sys_warn (r, record->pos,
1340 _("Duplicate variable name %s "
1341 "at UTF-8 offset %zu in MRSETS record."),
1342 var_name, text_pos (text));
1346 if (mrset->label == NULL && mrset->label_from_var_label
1347 && var_has_label (var))
1348 mrset->label = xstrdup (var_get_label (var));
1351 && var_get_type (var) != var_get_type (mrset->vars[0]))
1353 sys_warn (r, record->pos,
1354 _("MRSET %s contains both string and "
1355 "numeric variables."), name);
1358 width = MIN (width, var_get_width (var));
1360 if (mrset->n_vars >= allocated_vars)
1361 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1362 sizeof *mrset->vars);
1363 mrset->vars[mrset->n_vars++] = var;
1365 while (delimiter != '\n');
1367 if (mrset->n_vars < 2)
1369 sys_warn (r, record->pos,
1370 _("MRSET %s has only %zu variables."), mrset->name,
1372 mrset_destroy (mrset);
1376 if (mrset->type == MRSET_MD)
1378 mrset->width = width;
1379 value_init (&mrset->counted, width);
1381 mrset->counted.f = strtod (counted, NULL);
1383 value_copy_str_rpad (&mrset->counted, width,
1384 (const uint8_t *) counted, ' ');
1387 dict_add_mrset (dict, mrset);
1389 stringi_set_destroy (&var_names);
1391 mrset_destroy (mrset);
1392 close_text_record (r, text);
1395 /* Read record type 7, subtype 11, which specifies how variables
1396 should be displayed in GUI environments. */
1398 parse_display_parameters (struct sfm_reader *r,
1399 const struct sfm_extension_record *record,
1400 struct dictionary *dict)
1402 bool includes_width;
1403 bool warned = false;
1408 n_vars = dict_get_var_cnt (dict);
1409 if (record->count == 3 * n_vars)
1410 includes_width = true;
1411 else if (record->count == 2 * n_vars)
1412 includes_width = false;
1415 sys_warn (r, record->pos,
1416 _("Extension 11 has bad count %zu (for %zu variables)."),
1417 record->count, n_vars);
1422 for (i = 0; i < n_vars; ++i)
1424 struct variable *v = dict_get_var (dict, i);
1425 int measure, width, align;
1427 measure = parse_int (r, record->data, ofs);
1432 width = parse_int (r, record->data, ofs);
1438 align = parse_int (r, record->data, ofs);
1441 /* SPSS 14 sometimes seems to set string variables' measure
1443 if (0 == measure && var_is_alpha (v))
1446 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1449 sys_warn (r, record->pos,
1450 _("Invalid variable display parameters for variable "
1451 "%zu (%s). Default parameters substituted."),
1452 i, var_get_name (v));
1457 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1458 : measure == 2 ? MEASURE_ORDINAL
1460 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1461 : align == 1 ? ALIGN_RIGHT
1464 /* Older versions (SPSS 9.0) sometimes set the display
1465 width to zero. This causes confusion in the GUI, so
1466 only set the width if it is nonzero. */
1468 var_set_display_width (v, width);
1473 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1474 const char *new_name)
1476 size_t n_short_names;
1480 /* Renaming a variable may clear its short names, but we
1481 want to retain them, so we save them and re-set them
1483 n_short_names = var_get_short_name_cnt (var);
1484 short_names = xnmalloc (n_short_names, sizeof *short_names);
1485 for (i = 0; i < n_short_names; i++)
1487 const char *s = var_get_short_name (var, i);
1488 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1491 /* Set long name. */
1492 dict_rename_var (dict, var, new_name);
1494 /* Restore short names. */
1495 for (i = 0; i < n_short_names; i++)
1497 var_set_short_name (var, i, short_names[i]);
1498 free (short_names[i]);
1503 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1504 to each short name. Modifies variable names in DICT accordingly. */
1506 parse_long_var_name_map (struct sfm_reader *r,
1507 const struct sfm_extension_record *record,
1508 struct dictionary *dict)
1510 struct text_record *text;
1511 struct variable *var;
1516 /* Convert variable names to lowercase. */
1519 for (i = 0; i < dict_get_var_cnt (dict); i++)
1521 struct variable *var = dict_get_var (dict, i);
1524 new_name = xstrdup (var_get_name (var));
1525 str_lowercase (new_name);
1527 rename_var_and_save_short_names (dict, var, new_name);
1535 /* Rename each of the variables, one by one. (In a correctly constructed
1536 system file, this cannot create any intermediate duplicate variable names,
1537 because all of the new variable names are longer than any of the old
1538 variable names and thus there cannot be any overlaps.) */
1539 text = open_text_record (r, record);
1540 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1542 /* Validate long name. */
1543 /* XXX need to reencode name to UTF-8 */
1544 if (!dict_id_is_valid (dict, long_name, false))
1546 sys_warn (r, record->pos,
1547 _("Long variable mapping from %s to invalid "
1548 "variable name `%s'."),
1549 var_get_name (var), long_name);
1553 /* Identify any duplicates. */
1554 if (strcasecmp (var_get_short_name (var, 0), long_name)
1555 && dict_lookup_var (dict, long_name) != NULL)
1557 sys_warn (r, record->pos,
1558 _("Duplicate long variable name `%s'."), long_name);
1562 rename_var_and_save_short_names (dict, var, long_name);
1564 close_text_record (r, text);
1567 /* Reads record type 7, subtype 14, which gives the real length
1568 of each very long string. Rearranges DICT accordingly. */
1570 parse_long_string_map (struct sfm_reader *r,
1571 const struct sfm_extension_record *record,
1572 struct dictionary *dict)
1574 struct text_record *text;
1575 struct variable *var;
1578 text = open_text_record (r, record);
1579 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1581 size_t idx = var_get_dict_index (var);
1587 length = strtol (length_s, NULL, 10);
1588 if (length < 1 || length > MAX_STRING)
1590 sys_warn (r, record->pos,
1591 _("%s listed as string of invalid length %s "
1592 "in very long string record."),
1593 var_get_name (var), length_s);
1597 /* Check segments. */
1598 segment_cnt = sfm_width_to_segments (length);
1599 if (segment_cnt == 1)
1601 sys_warn (r, record->pos,
1602 _("%s listed in very long string record with width %s, "
1603 "which requires only one segment."),
1604 var_get_name (var), length_s);
1607 if (idx + segment_cnt > dict_get_var_cnt (dict))
1608 sys_error (r, record->pos,
1609 _("Very long string %s overflows dictionary."),
1610 var_get_name (var));
1612 /* Get the short names from the segments and check their
1614 for (i = 0; i < segment_cnt; i++)
1616 struct variable *seg = dict_get_var (dict, idx + i);
1617 int alloc_width = sfm_segment_alloc_width (length, i);
1618 int width = var_get_width (seg);
1621 var_set_short_name (var, i, var_get_short_name (seg, 0));
1622 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1623 sys_error (r, record->pos,
1624 _("Very long string with width %ld has segment %d "
1625 "of width %d (expected %d)."),
1626 length, i, width, alloc_width);
1628 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1629 var_set_width (var, length);
1631 close_text_record (r, text);
1632 dict_compact_values (dict);
1636 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1637 const struct sfm_var_record *var_recs, size_t n_var_recs,
1638 const struct sfm_value_label_record *record)
1640 struct variable **vars;
1644 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1645 for (i = 0; i < record->n_labels; i++)
1646 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1647 record->labels[i].label, -1,
1650 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1651 for (i = 0; i < record->n_vars; i++)
1652 vars[i] = lookup_var_by_index (r, record->pos,
1653 var_recs, n_var_recs, record->vars[i]);
1655 for (i = 1; i < record->n_vars; i++)
1656 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1657 sys_error (r, record->pos,
1658 _("Variables associated with value label are not all of "
1659 "identical type. Variable %s is %s, but variable "
1661 var_get_name (vars[0]),
1662 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1663 var_get_name (vars[i]),
1664 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1666 for (i = 0; i < record->n_vars; i++)
1668 struct variable *var = vars[i];
1672 width = var_get_width (var);
1674 sys_error (r, record->pos,
1675 _("Value labels may not be added to long string "
1676 "variables (e.g. %s) using records types 3 and 4."),
1677 var_get_name (var));
1679 for (j = 0; j < record->n_labels; j++)
1681 struct sfm_value_label *label = &record->labels[j];
1684 value_init (&value, width);
1686 value.f = parse_float (r, label->value, 0);
1688 memcpy (value_str_rw (&value, width), label->value, width);
1690 if (!var_add_value_label (var, &value, utf8_labels[j]))
1692 if (var_is_numeric (var))
1693 sys_warn (r, record->pos,
1694 _("Duplicate value label for %g on %s."),
1695 value.f, var_get_name (var));
1697 sys_warn (r, record->pos,
1698 _("Duplicate value label for `%.*s' on %s."),
1699 width, value_str (&value, width),
1700 var_get_name (var));
1703 value_destroy (&value, width);
1707 pool_free (r->pool, vars);
1708 for (i = 0; i < record->n_labels; i++)
1709 pool_free (r->pool, utf8_labels[i]);
1710 pool_free (r->pool, utf8_labels);
1713 static struct variable *
1714 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1715 const struct sfm_var_record *var_recs, size_t n_var_recs,
1718 const struct sfm_var_record *rec;
1720 if (idx < 1 || idx > n_var_recs)
1722 sys_error (r, offset,
1723 _("Variable index %d not in valid range 1...%d."),
1728 rec = &var_recs[idx - 1];
1729 if (rec->var == NULL)
1731 sys_error (r, offset,
1732 _("Variable index %d refers to long string continuation."),
1740 /* Parses a set of custom attributes from TEXT into ATTRS.
1741 ATTRS may be a null pointer, in which case the attributes are
1742 read but discarded. */
1744 parse_attributes (struct sfm_reader *r, struct text_record *text,
1745 struct attrset *attrs)
1749 struct attribute *attr;
1753 /* Parse the key. */
1754 key = text_get_token (text, ss_cstr ("("), NULL);
1758 attr = attribute_create (key);
1759 for (index = 1; ; index++)
1761 /* Parse the value. */
1765 value = text_get_token (text, ss_cstr ("\n"), NULL);
1768 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1773 length = strlen (value);
1774 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1776 value[length - 1] = '\0';
1777 attribute_add_value (attr, value + 1);
1782 _("Attribute value %s[%d] is not quoted: %s."),
1784 attribute_add_value (attr, value);
1787 /* Was this the last value for this attribute? */
1788 if (text_match (text, ')'))
1792 attrset_add (attrs, attr);
1794 attribute_destroy (attr);
1796 while (!text_match (text, '/'));
1799 /* Reads record type 7, subtype 17, which lists custom
1800 attributes on the data file. */
1802 parse_data_file_attributes (struct sfm_reader *r,
1803 const struct sfm_extension_record *record,
1804 struct dictionary *dict)
1806 struct text_record *text = open_text_record (r, record);
1807 parse_attributes (r, text, dict_get_attributes (dict));
1808 close_text_record (r, text);
1811 /* Parses record type 7, subtype 18, which lists custom
1812 attributes on individual variables. */
1814 parse_variable_attributes (struct sfm_reader *r,
1815 const struct sfm_extension_record *record,
1816 struct dictionary *dict)
1818 struct text_record *text;
1819 struct variable *var;
1821 text = open_text_record (r, record);
1822 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1823 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1824 close_text_record (r, text);
1828 check_overflow (struct sfm_reader *r,
1829 const struct sfm_extension_record *record,
1830 size_t ofs, size_t length)
1832 size_t end = record->size * record->count;
1833 if (length >= end || ofs + length > end)
1834 sys_error (r, record->pos + end,
1835 _("Long string value label record ends unexpectedly."));
1839 parse_long_string_value_labels (struct sfm_reader *r,
1840 const struct sfm_extension_record *record,
1841 struct dictionary *dict)
1843 const char *dict_encoding = dict_get_encoding (dict);
1844 size_t end = record->size * record->count;
1851 struct variable *var;
1856 /* Parse variable name length. */
1857 check_overflow (r, record, ofs, 4);
1858 var_name_len = parse_int (r, record->data, ofs);
1861 /* Parse variable name, width, and number of labels. */
1862 check_overflow (r, record, ofs, var_name_len + 8);
1863 var_name = recode_string_pool ("UTF-8", dict_encoding,
1864 (const char *) record->data + ofs,
1865 var_name_len, r->pool);
1866 width = parse_int (r, record->data, ofs + var_name_len);
1867 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1868 ofs += var_name_len + 8;
1870 /* Look up 'var' and validate. */
1871 var = dict_lookup_var (dict, var_name);
1873 sys_warn (r, record->pos + ofs,
1874 _("Ignoring long string value record for "
1875 "unknown variable %s."), var_name);
1876 else if (var_is_numeric (var))
1878 sys_warn (r, record->pos + ofs,
1879 _("Ignoring long string value record for "
1880 "numeric variable %s."), var_name);
1883 else if (width != var_get_width (var))
1885 sys_warn (r, record->pos + ofs,
1886 _("Ignoring long string value record for variable %s "
1887 "because the record's width (%d) does not match the "
1888 "variable's width (%d)."),
1889 var_name, width, var_get_width (var));
1894 value_init_pool (r->pool, &value, width);
1895 for (i = 0; i < n_labels; i++)
1897 size_t value_length, label_length;
1898 bool skip = var == NULL;
1900 /* Parse value length. */
1901 check_overflow (r, record, ofs, 4);
1902 value_length = parse_int (r, record->data, ofs);
1906 check_overflow (r, record, ofs, value_length);
1909 if (value_length == width)
1910 memcpy (value_str_rw (&value, width),
1911 (const uint8_t *) record->data + ofs, width);
1914 sys_warn (r, record->pos + ofs,
1915 _("Ignoring long string value %zu for variable "
1916 "%s, with width %d, that has bad value "
1918 i, var_get_name (var), width, value_length);
1922 ofs += value_length;
1924 /* Parse label length. */
1925 check_overflow (r, record, ofs, 4);
1926 label_length = parse_int (r, record->data, ofs);
1930 check_overflow (r, record, ofs, label_length);
1935 label = recode_string_pool ("UTF-8", dict_encoding,
1936 (const char *) record->data + ofs,
1937 label_length, r->pool);
1938 if (!var_add_value_label (var, &value, label))
1939 sys_warn (r, record->pos + ofs,
1940 _("Duplicate value label for `%.*s' on %s."),
1941 width, value_str (&value, width),
1942 var_get_name (var));
1943 pool_free (r->pool, label);
1945 ofs += label_length;
1952 static void partial_record (struct sfm_reader *r)
1955 static void read_error (struct casereader *, const struct sfm_reader *);
1957 static bool read_case_number (struct sfm_reader *, double *);
1958 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1959 static int read_opcode (struct sfm_reader *);
1960 static bool read_compressed_number (struct sfm_reader *, double *);
1961 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1962 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1963 static bool skip_whole_strings (struct sfm_reader *, size_t);
1965 /* Reads and returns one case from READER's file. Returns a null
1966 pointer if not successful. */
1967 static struct ccase *
1968 sys_file_casereader_read (struct casereader *reader, void *r_)
1970 struct sfm_reader *r = r_;
1971 struct ccase *volatile c;
1977 c = case_create (r->proto);
1978 if (setjmp (r->bail_out))
1980 casereader_force_error (reader);
1985 for (i = 0; i < r->sfm_var_cnt; i++)
1987 struct sfm_var *sv = &r->sfm_vars[i];
1988 union value *v = case_data_rw_idx (c, sv->case_index);
1990 if (sv->var_width == 0)
1992 if (!read_case_number (r, &v->f))
1997 uint8_t *s = value_str_rw (v, sv->var_width);
1998 if (!read_case_string (r, s + sv->offset, sv->segment_width))
2000 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2009 if (r->case_cnt != -1)
2010 read_error (reader, r);
2015 /* Issues an error that R ends in a partial record. */
2017 partial_record (struct sfm_reader *r)
2019 sys_error (r, r->pos, _("File ends in partial case."));
2022 /* Issues an error that an unspecified error occurred SFM, and
2025 read_error (struct casereader *r, const struct sfm_reader *sfm)
2027 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2028 casereader_force_error (r);
2031 /* Reads a number from R and stores its value in *D.
2032 If R is compressed, reads a compressed number;
2033 otherwise, reads a number in the regular way.
2034 Returns true if successful, false if end of file is
2035 reached immediately. */
2037 read_case_number (struct sfm_reader *r, double *d)
2042 if (!try_read_bytes (r, number, sizeof number))
2044 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2048 return read_compressed_number (r, d);
2051 /* Reads LENGTH string bytes from R into S.
2052 Always reads a multiple of 8 bytes; if LENGTH is not a
2053 multiple of 8, then extra bytes are read and discarded without
2055 Reads compressed strings if S is compressed.
2056 Returns true if successful, false if end of file is
2057 reached immediately. */
2059 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2061 size_t whole = ROUND_DOWN (length, 8);
2062 size_t partial = length % 8;
2066 if (!read_whole_strings (r, s, whole))
2073 if (!read_whole_strings (r, bounce, sizeof bounce))
2079 memcpy (s + whole, bounce, partial);
2085 /* Reads and returns the next compression opcode from R. */
2087 read_opcode (struct sfm_reader *r)
2089 assert (r->compressed);
2093 if (r->opcode_idx >= sizeof r->opcodes)
2095 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2099 opcode = r->opcodes[r->opcode_idx++];
2106 /* Reads a compressed number from R and stores its value in D.
2107 Returns true if successful, false if end of file is
2108 reached immediately. */
2110 read_compressed_number (struct sfm_reader *r, double *d)
2112 int opcode = read_opcode (r);
2120 *d = read_float (r);
2124 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2125 if (!r->corruption_warning)
2127 r->corruption_warning = true;
2128 sys_warn (r, r->pos,
2129 _("Possible compressed data corruption: "
2130 "compressed spaces appear in numeric field."));
2139 *d = opcode - r->bias;
2146 /* Reads a compressed 8-byte string segment from R and stores it
2148 Returns true if successful, false if end of file is
2149 reached immediately. */
2151 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2153 int opcode = read_opcode (r);
2161 read_bytes (r, dst, 8);
2165 memset (dst, ' ', 8);
2170 double value = opcode - r->bias;
2171 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2174 /* This has actually been seen "in the wild". The submitter of the
2175 file that showed that the contents decoded as spaces, but they
2176 were at the end of the field so it's possible that the null
2177 bytes just acted as null terminators. */
2179 else if (!r->corruption_warning)
2181 r->corruption_warning = true;
2182 sys_warn (r, r->pos,
2183 _("Possible compressed data corruption: "
2184 "string contains compressed integer (opcode %d)."),
2194 /* Reads LENGTH string bytes from R into S.
2195 LENGTH must be a multiple of 8.
2196 Reads compressed strings if S is compressed.
2197 Returns true if successful, false if end of file is
2198 reached immediately. */
2200 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2202 assert (length % 8 == 0);
2204 return try_read_bytes (r, s, length);
2208 for (ofs = 0; ofs < length; ofs += 8)
2209 if (!read_compressed_string (r, s + ofs))
2219 /* Skips LENGTH string bytes from R.
2220 LENGTH must be a multiple of 8.
2221 (LENGTH is also limited to 1024, but that's only because the
2222 current caller never needs more than that many bytes.)
2223 Returns true if successful, false if end of file is
2224 reached immediately. */
2226 skip_whole_strings (struct sfm_reader *r, size_t length)
2228 uint8_t buffer[1024];
2229 assert (length < sizeof buffer);
2230 return read_whole_strings (r, buffer, length);
2233 /* Helpers for reading records that contain structured text
2236 /* Maximum number of warnings to issue for a single text
2238 #define MAX_TEXT_WARNINGS 5
2243 struct substring buffer; /* Record contents, in UTF-8. */
2244 off_t start; /* Starting offset in file. */
2245 size_t pos; /* Current position in buffer. */
2246 int n_warnings; /* Number of warnings issued or suppressed. */
2249 static struct text_record *
2250 open_text_record (struct sfm_reader *r,
2251 const struct sfm_extension_record *record)
2253 struct text_record *text;
2254 struct substring raw;
2256 text = pool_alloc (r->pool, sizeof *text);
2257 raw = ss_buffer (record->data, record->size * record->count);
2258 text->start = record->pos;
2259 text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool);
2261 text->n_warnings = 0;
2266 /* Closes TEXT, frees its storage, and issues a final warning
2267 about suppressed warnings if necesary. */
2269 close_text_record (struct sfm_reader *r, struct text_record *text)
2271 if (text->n_warnings > MAX_TEXT_WARNINGS)
2272 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2273 text->n_warnings - MAX_TEXT_WARNINGS);
2274 pool_free (r->pool, ss_data (text->buffer));
2277 /* Reads a variable=value pair from TEXT.
2278 Looks up the variable in DICT and stores it into *VAR.
2279 Stores a null-terminated value into *VALUE. */
2281 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2282 struct text_record *text,
2283 struct variable **var, char **value)
2287 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2290 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2294 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2295 ss_buffer ("\t\0", 2));
2303 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2304 struct text_record *text, struct substring delimiters,
2305 struct variable **var)
2309 name = text_get_token (text, delimiters, NULL);
2313 *var = dict_lookup_var (dict, name);
2317 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2324 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2325 struct text_record *text, struct substring delimiters,
2326 struct variable **var)
2328 char *short_name = text_get_token (text, delimiters, NULL);
2329 if (short_name == NULL)
2332 *var = dict_lookup_var (dict, short_name);
2334 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2339 /* Displays a warning for the current file position, limiting the
2340 number to MAX_TEXT_WARNINGS for TEXT. */
2342 text_warn (struct sfm_reader *r, struct text_record *text,
2343 const char *format, ...)
2345 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2349 va_start (args, format);
2350 sys_msg (r, text->start + text->pos, MW, format, args);
2356 text_get_token (struct text_record *text, struct substring delimiters,
2359 struct substring token;
2362 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2365 end = &ss_data (token)[ss_length (token)];
2366 if (delimiter != NULL)
2369 return ss_data (token);
2372 /* Reads a integer value expressed in decimal, then a space, then a string that
2373 consists of exactly as many bytes as specified by the integer, then a space,
2374 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2375 buffer (so the caller should not free the string). */
2377 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2387 int c = text->buffer.string[text->pos];
2388 if (c < '0' || c > '9')
2390 n = (n * 10) + (c - '0');
2393 if (start == text->pos)
2395 sys_warn (r, text->start,
2396 _("Expecting digit at UTF-8 offset %zu in MRSETS record."),
2401 if (!text_match (text, ' '))
2403 sys_warn (r, text->start,
2404 _("Expecting space at UTF-8 offset %zu in MRSETS record."),
2409 if (text->pos + n > text->buffer.length)
2411 sys_warn (r, text->start,
2412 _("%zu-byte string starting at UTF-8 offset %zu "
2413 "exceeds record length %zu."),
2414 n, text->pos, text->buffer.length);
2418 s = &text->buffer.string[text->pos];
2421 sys_warn (r, text->start,
2422 _("Expecting space at UTF-8 offset %zu following %zu-byte "
2433 text_match (struct text_record *text, char c)
2435 if (text->buffer.string[text->pos] == c)
2444 /* Returns the current byte offset (as convertd to UTF-8) inside the TEXT's
2447 text_pos (const struct text_record *text)
2454 /* Displays a corruption message. */
2456 sys_msg (struct sfm_reader *r, off_t offset,
2457 int class, const char *format, va_list args)
2462 ds_init_empty (&text);
2464 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2465 fh_get_file_name (r->fh), (long long int) offset);
2467 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2468 ds_put_vformat (&text, format, args);
2470 m.category = msg_class_to_category (class);
2471 m.severity = msg_class_to_severity (class);
2477 m.text = ds_cstr (&text);
2482 /* Displays a warning for offset OFFSET in the file. */
2484 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2488 va_start (args, format);
2489 sys_msg (r, offset, MW, format, args);
2493 /* Displays an error for the current file position,
2494 marks it as in an error state,
2495 and aborts reading it using longjmp. */
2497 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2501 va_start (args, format);
2502 sys_msg (r, offset, ME, format, args);
2506 longjmp (r->bail_out, 1);
2509 /* Reads BYTE_CNT bytes into BUF.
2510 Returns true if exactly BYTE_CNT bytes are successfully read.
2511 Aborts if an I/O error or a partial read occurs.
2512 If EOF_IS_OK, then an immediate end-of-file causes false to be
2513 returned; otherwise, immediate end-of-file causes an abort
2516 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2517 void *buf, size_t byte_cnt)
2519 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2520 r->pos += bytes_read;
2521 if (bytes_read == byte_cnt)
2523 else if (ferror (r->file))
2524 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2525 else if (!eof_is_ok || bytes_read != 0)
2526 sys_error (r, r->pos, _("Unexpected end of file."));
2531 /* Reads BYTE_CNT into BUF.
2532 Aborts upon I/O error or if end-of-file is encountered. */
2534 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2536 read_bytes_internal (r, false, buf, byte_cnt);
2539 /* Reads BYTE_CNT bytes into BUF.
2540 Returns true if exactly BYTE_CNT bytes are successfully read.
2541 Returns false if an immediate end-of-file is encountered.
2542 Aborts if an I/O error or a partial read occurs. */
2544 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2546 return read_bytes_internal (r, true, buf, byte_cnt);
2549 /* Reads a 32-bit signed integer from R and returns its value in
2552 read_int (struct sfm_reader *r)
2555 read_bytes (r, integer, sizeof integer);
2556 return integer_get (r->integer_format, integer, sizeof integer);
2559 /* Reads a 64-bit floating-point number from R and returns its
2560 value in host format. */
2562 read_float (struct sfm_reader *r)
2565 read_bytes (r, number, sizeof number);
2566 return float_get_double (r->float_format, number);
2570 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2572 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
2576 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2578 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2581 /* Reads exactly SIZE - 1 bytes into BUFFER
2582 and stores a null byte into BUFFER[SIZE - 1]. */
2584 read_string (struct sfm_reader *r, char *buffer, size_t size)
2587 read_bytes (r, buffer, size - 1);
2588 buffer[size - 1] = '\0';
2591 /* Skips BYTES bytes forward in R. */
2593 skip_bytes (struct sfm_reader *r, size_t bytes)
2598 size_t chunk = MIN (sizeof buffer, bytes);
2599 read_bytes (r, buffer, chunk);
2604 static const struct casereader_class sys_file_casereader_class =
2606 sys_file_casereader_read,
2607 sys_file_casereader_destroy,