1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
28 #include "data/attributes.h"
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/mrset.h"
38 #include "data/short-names.h"
39 #include "data/value-labels.h"
40 #include "data/value.h"
41 #include "data/variable.h"
42 #include "libpspp/array.h"
43 #include "libpspp/assertion.h"
44 #include "libpspp/compiler.h"
45 #include "libpspp/i18n.h"
46 #include "libpspp/message.h"
47 #include "libpspp/misc.h"
48 #include "libpspp/pool.h"
49 #include "libpspp/str.h"
50 #include "libpspp/stringi-set.h"
52 #include "gl/c-ctype.h"
53 #include "gl/inttostr.h"
54 #include "gl/localcharset.h"
55 #include "gl/minmax.h"
56 #include "gl/unlocked-io.h"
57 #include "gl/xalloc.h"
61 #define _(msgid) gettext (msgid)
62 #define N_(msgid) (msgid)
66 /* subtypes 0-2 unknown */
67 EXT_INTEGER = 3, /* Machine integer info. */
68 EXT_FLOAT = 4, /* Machine floating-point info. */
69 EXT_VAR_SETS = 5, /* Variable sets. */
70 EXT_DATE = 6, /* DATE. */
71 EXT_MRSETS = 7, /* Multiple response sets. */
72 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
73 /* subtypes 9-10 unknown */
74 EXT_DISPLAY = 11, /* Variable display parameters. */
75 /* subtype 12 unknown */
76 EXT_LONG_NAMES = 13, /* Long variable names. */
77 EXT_LONG_STRINGS = 14, /* Long strings. */
78 /* subtype 15 unknown */
79 EXT_NCASES = 16, /* Extended number of cases. */
80 EXT_FILE_ATTRS = 17, /* Data file attributes. */
81 EXT_VAR_ATTRS = 18, /* Variable attributes. */
82 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
83 EXT_ENCODING = 20, /* Character encoding. */
84 EXT_LONG_LABELS = 21 /* Value labels for long strings. */
94 int missing_value_code;
100 struct sfm_value_label
106 struct sfm_value_label_record
109 struct sfm_value_label *labels;
116 struct sfm_document_record
123 struct sfm_extension_record
125 off_t pos; /* Starting offset in file. */
126 size_t size; /* Size of data elements. */
127 size_t count; /* Number of data elements. */
128 void *data; /* Contents. */
131 /* System file reader. */
134 /* Resource tracking. */
135 struct pool *pool; /* All system file state. */
136 jmp_buf bail_out; /* longjmp() target for error handling. */
139 struct file_handle *fh; /* File handle. */
140 struct fh_lock *lock; /* Mutual exclusion for file handle. */
141 FILE *file; /* File stream. */
142 off_t pos; /* Position in file. */
143 bool error; /* I/O or corruption error? */
144 struct caseproto *proto; /* Format of output cases. */
147 enum integer_format integer_format; /* On-disk integer format. */
148 enum float_format float_format; /* On-disk floating point format. */
149 struct sfm_var *sfm_vars; /* Variables. */
150 size_t sfm_var_cnt; /* Number of variables. */
151 casenumber case_cnt; /* Number of cases */
152 const char *encoding; /* String encoding. */
155 bool compressed; /* File is compressed? */
156 double bias; /* Compression bias, usually 100.0. */
157 uint8_t opcodes[8]; /* Current block of opcodes. */
158 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
159 bool corruption_warning; /* Warned about possible corruption? */
162 static const struct casereader_class sys_file_casereader_class;
164 static bool close_reader (struct sfm_reader *);
166 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
167 const struct sfm_var_record *,
170 static void sys_msg (struct sfm_reader *r, off_t, int class,
171 const char *format, va_list args)
172 PRINTF_FORMAT (4, 0);
173 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
174 PRINTF_FORMAT (3, 4);
175 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
179 static void read_bytes (struct sfm_reader *, void *, size_t);
180 static bool try_read_bytes (struct sfm_reader *, void *, size_t);
181 static int read_int (struct sfm_reader *);
182 static double read_float (struct sfm_reader *);
183 static void read_string (struct sfm_reader *, char *, size_t);
184 static void skip_bytes (struct sfm_reader *, size_t);
186 static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
187 static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
189 static void read_variable_record (struct sfm_reader *,
190 struct sfm_var_record *);
191 static void read_value_label_record (struct sfm_reader *,
192 struct sfm_value_label_record *,
194 static struct sfm_document_record *read_document_record (struct sfm_reader *);
195 static struct sfm_extension_record *read_extension_record (
196 struct sfm_reader *, int subtype);
197 static void skip_extension_record (struct sfm_reader *, int subtype);
199 static const char *choose_encoding (
201 const struct sfm_extension_record *ext_integer,
202 const struct sfm_extension_record *ext_encoding);
204 static struct text_record *open_text_record (
205 struct sfm_reader *, const struct sfm_extension_record *);
206 static void close_text_record (struct sfm_reader *,
207 struct text_record *);
208 static bool read_variable_to_value_pair (struct sfm_reader *,
210 struct text_record *,
211 struct variable **var, char **value);
212 static void text_warn (struct sfm_reader *r, struct text_record *text,
213 const char *format, ...)
214 PRINTF_FORMAT (3, 4);
215 static char *text_get_token (struct text_record *,
216 struct substring delimiters, char *delimiter);
217 static bool text_match (struct text_record *, char c);
218 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
219 struct text_record *,
220 struct substring delimiters,
222 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
223 struct text_record *,
224 struct substring delimiters,
226 static const char *text_parse_counted_string (struct sfm_reader *,
227 struct text_record *);
228 static size_t text_pos (const struct text_record *);
230 static bool close_reader (struct sfm_reader *r);
232 /* Dictionary reader. */
240 static void read_header (struct sfm_reader *, int *weight_idx,
241 int *claimed_oct_cnt, struct sfm_read_info *,
243 static void parse_file_label (struct sfm_reader *, const char *file_label,
244 struct dictionary *);
245 static void parse_variable_records (struct sfm_reader *, struct dictionary *,
246 struct sfm_var_record *, size_t n);
247 static void parse_format_spec (struct sfm_reader *, off_t pos,
248 unsigned int format, enum which_format,
249 struct variable *, int *format_warning_cnt);
250 static void parse_document (struct dictionary *, struct sfm_document_record *);
251 static void parse_display_parameters (struct sfm_reader *,
252 const struct sfm_extension_record *,
253 struct dictionary *);
254 static void parse_machine_integer_info (struct sfm_reader *,
255 const struct sfm_extension_record *,
256 struct sfm_read_info *);
257 static void parse_machine_float_info (struct sfm_reader *,
258 const struct sfm_extension_record *);
259 static void parse_mrsets (struct sfm_reader *,
260 const struct sfm_extension_record *,
261 struct dictionary *);
262 static void parse_long_var_name_map (struct sfm_reader *,
263 const struct sfm_extension_record *,
264 struct dictionary *);
265 static void parse_long_string_map (struct sfm_reader *,
266 const struct sfm_extension_record *,
267 struct dictionary *);
268 static void parse_value_labels (struct sfm_reader *, struct dictionary *,
269 const struct sfm_var_record *,
271 const struct sfm_value_label_record *);
272 static void parse_data_file_attributes (struct sfm_reader *,
273 const struct sfm_extension_record *,
274 struct dictionary *);
275 static void parse_variable_attributes (struct sfm_reader *,
276 const struct sfm_extension_record *,
277 struct dictionary *);
278 static void parse_long_string_value_labels (struct sfm_reader *,
279 const struct sfm_extension_record *,
280 struct dictionary *);
282 /* Opens the system file designated by file handle FH for
283 reading. Reads the system file's dictionary into *DICTP.
284 If INFO is non-null, then it receives additional info about the file. */
287 sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
288 struct sfm_read_info *volatile info)
290 struct sfm_reader *volatile r = NULL;
291 struct sfm_read_info local_info;
293 struct sfm_var_record *vars;
294 size_t n_vars, allocated_vars;
296 struct sfm_value_label_record *labels;
297 size_t n_labels, allocated_labels;
299 struct sfm_document_record *document;
301 struct sfm_extension_record *extensions[32];
307 struct dictionary *dict = NULL;
310 /* Create and initialize reader. */
311 r = pool_create_container (struct sfm_reader, pool);
317 r->opcode_idx = sizeof r->opcodes;
318 r->corruption_warning = false;
320 /* TRANSLATORS: this fragment will be interpolated into
321 messages in fh_lock() that identify types of files. */
322 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
326 r->file = fn_open (fh_get_file_name (fh), "rb");
329 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
330 fh_get_file_name (r->fh), strerror (errno));
334 /* Initialize info. */
337 memset (info, 0, sizeof *info);
/* R->bail_out is the reader's longjmp() target for error handling, so
   the record-reading and parsing code below can presumably be abandoned
   part-way through when sys_error is called -- confirm against
   sys_error's definition. */
339 if (setjmp (r->bail_out))
343 read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
346 n_vars = allocated_vars = 0;
349 n_labels = allocated_labels = 0;
/* EXTENSIONS is indexed by type 7 subtype.  A null slot means that no
   record of that subtype has been stored yet. */
353 memset (extensions, 0, sizeof extensions);
363 read_int (r); /* Skip filler. */
370 if (n_vars >= allocated_vars)
371 vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
373 read_variable_record (r, &vars[n_vars++]);
377 if (n_labels >= allocated_labels)
378 labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
380 read_value_label_record (r, &labels[n_labels++], n_vars);
384 /* A Type 4 record is always immediately after a type 3 record,
385 so the code for type 3 records reads the type 4 record too. */
386 sys_error (r, r->pos, _("Misplaced type 4 record."));
389 if (document != NULL)
390 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
391 document = read_document_record (r);
395 subtype = read_int (r);
/* A subtype that cannot index EXTENSIONS, or one whose slot is already
   occupied, is skipped instead of being stored. */
396 if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
399 _("Unrecognized record type 7, subtype %d. Please "
400 "send a copy of this file, and the syntax which "
401 "created it to %s."),
402 subtype, PACKAGE_BUGREPORT);
403 skip_extension_record (r, subtype);
405 else if (extensions[subtype] != NULL)
408 _("Record type 7, subtype %d found here has the same "
409 "type as the record found near offset 0x%llx. "
410 "Please send a copy of this file, and the syntax "
411 "which created it to %s."),
412 subtype, (long long int) extensions[subtype]->pos,
414 skip_extension_record (r, subtype);
417 extensions[subtype] = read_extension_record (r, subtype);
421 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
426 /* Now actually parse what we read.
428 First, figure out the correct character encoding, because this determines
429 how the rest of the header data is to be interpreted. */
430 dict = dict_create ();
431 r->encoding = choose_encoding (r, extensions[EXT_INTEGER],
432 extensions[EXT_ENCODING]);
433 dict_set_encoding (dict, r->encoding);
435 /* These records don't use variables at all. */
436 if (document != NULL)
437 parse_document (dict, document);
439 if (extensions[EXT_INTEGER] != NULL)
440 parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
442 if (extensions[EXT_FLOAT] != NULL)
443 parse_machine_float_info (r, extensions[EXT_FLOAT]);
445 if (extensions[EXT_FILE_ATTRS] != NULL)
446 parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
448 parse_file_label (r, file_label, dict);
450 /* Parse the variable records, the basis of almost everything else. */
451 parse_variable_records (r, dict, vars, n_vars);
453 /* Parse value labels and the weight variable immediately after the variable
454 records. These records use indexes into var_recs[], so we must parse them
455 before those indexes become invalidated by very long string variables. */
456 for (i = 0; i < n_labels; i++)
457 parse_value_labels (r, dict, vars, n_vars, &labels[i]);
460 struct variable *weight_var;
/* NOTE(review): 76 is passed as the file offset to cite in diagnostics
   about the weight index; presumably it is the position of that field
   in the file header -- confirm, and consider a named constant. */
462 weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
463 if (var_is_numeric (weight_var))
464 dict_set_weight (dict, weight_var);
466 sys_error (r, -1, _("Weighting variable must be numeric "
467 "(not string variable `%s')."),
468 var_get_name (weight_var));
471 if (extensions[EXT_DISPLAY] != NULL)
472 parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
474 /* The following records use short names, so they need to be parsed before
475 parse_long_var_name_map() changes short names to long names. */
476 if (extensions[EXT_MRSETS] != NULL)
477 parse_mrsets (r, extensions[EXT_MRSETS], dict);
479 if (extensions[EXT_MRSETS2] != NULL)
480 parse_mrsets (r, extensions[EXT_MRSETS2], dict);
482 if (extensions[EXT_LONG_STRINGS] != NULL)
483 parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
485 /* Now rename variables to their long names. */
/* NOTE(review): unlike the neighbouring calls, this one is not guarded
   by a null check on the extension record, so parse_long_var_name_map
   presumably copes with a null record itself -- confirm. */
486 parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
488 /* The following records use long names, so they need to follow renaming. */
489 if (extensions[EXT_VAR_ATTRS] != NULL)
490 parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
492 if (extensions[EXT_LONG_LABELS] != NULL)
493 parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
495 /* Warn if the actual amount of data per case differs from the
496 amount that the header claims. SPSS version 13 gets this
497 wrong when very long strings are involved, so don't warn in
499 if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
500 && info->version_major != 13)
501 sys_warn (r, -1, _("File header claims %d variable positions but "
502 "%d were read from file."),
503 claimed_oct_cnt, n_vars);
505 /* Create an index of dictionary variable widths for
506 sfm_read_case to use. We cannot use the `struct variable's
507 from the dictionary we created, because the caller owns the
508 dictionary and may destroy or modify its variables. */
/* NOTE(review): the sys_warn call just above passes N_VARS, which is
   declared size_t, for a %d conversion and compares it with the int
   CLAIMED_OCT_CNT; %zu or an explicit cast would match the argument
   type. */
509 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
510 pool_register (r->pool, free, r->sfm_vars);
511 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
514 return casereader_create_sequential
516 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
517 &sys_file_casereader_class, r);
526 /* Closes a system file after we're done with it.
527 Returns true if an I/O error has occurred on R, false otherwise. */
530 close_reader (struct sfm_reader *r)
539 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
541 msg (ME, _("Error closing system file `%s': %s."),
542 fh_get_file_name (r->fh), strerror (errno));
552 pool_destroy (r->pool);
557 /* Destroys READER. */
559 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
/* READER itself is unused; R_ is treated as the struct sfm_reader,
   presumably the pointer that sfm_open_reader handed to
   casereader_create_sequential. */
561 struct sfm_reader *r = r_;
565 /* Returns true if FILE is an SPSS system file, false otherwise. */
568 sfm_detect (FILE *file)
572 if (fread (rec_type, 4, 1, file) != 1)
576 return !strcmp ("$FL2", rec_type);
579 /* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
580 system file is unweighted, or to the value index of the weight variable
581 otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
582 case that the file claims to have (although it is not always correct).
583 Initializes INFO with header information. Stores the file label as a string
584 in dictionary encoding into *FILE_LABELP. */
586 read_header (struct sfm_reader *r, int *weight_idx,
587 int *claimed_oct_cnt, struct sfm_read_info *info,
591 char eye_catcher[61];
592 uint8_t raw_layout_code[4];
594 char creation_date[10];
595 char creation_time[9];
597 struct substring product;
/* The header begins with the record type, which must be the magic
   string "$FL2", followed by the product "eye catcher" text. */
599 read_string (r, rec_type, sizeof rec_type);
600 read_string (r, eye_catcher, sizeof eye_catcher);
602 if (strcmp ("$FL2", rec_type) != 0)
603 sys_error (r, 0, _("This is not an SPSS system file."));
605 /* Identify integer format. */
606 read_bytes (r, raw_layout_code, sizeof raw_layout_code);
607 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
609 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
611 || (r->integer_format != INTEGER_MSB_FIRST
612 && r->integer_format != INTEGER_LSB_FIRST))
613 sys_error (r, 64, _("This is not an SPSS system file."));
/* A negative or implausibly large claimed count is replaced by -1;
   sfm_open_reader skips its consistency warning when it sees -1. */
615 *claimed_oct_cnt = read_int (r);
616 if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
617 *claimed_oct_cnt = -1;
619 r->compressed = read_int (r) != 0;
621 *weight_idx = read_int (r);
623 r->case_cnt = read_int (r);
624 if ( r->case_cnt > INT_MAX / 2)
627 /* Identify floating-point format and obtain compression bias. */
628 read_bytes (r, raw_bias, sizeof raw_bias);
629 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
631 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
633 if (memcmp (raw_bias, zero_bias, 8))
634 sys_warn (r, r->pos - 8,
635 _("Compression bias is not the usual "
636 "value of 100, or system file uses unrecognized "
637 "floating-point format."));
640 /* Some software is known to write all-zeros to this
641 field. Such software also writes floating-point
642 numbers in the format that we expect by default
643 (it seems that all software most likely does, in
644 reality), so don't warn in this case. */
647 if (r->integer_format == INTEGER_MSB_FIRST)
648 r->float_format = FLOAT_IEEE_DOUBLE_BE;
650 r->float_format = FLOAT_IEEE_DOUBLE_LE;
/* Convert the raw on-disk bias into a native double using whichever
   floating-point format was settled on above. */
652 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
654 read_string (r, creation_date, sizeof creation_date);
655 read_string (r, creation_time, sizeof creation_time);
656 read_string (r, file_label, sizeof file_label);
/* NOTE(review): these strcpy calls assume that read_string
   null-terminates its buffer and that the INFO fields are at least as
   large as the local buffers -- confirm against read_string and
   struct sfm_read_info. */
659 strcpy (info->creation_date, creation_date);
660 strcpy (info->creation_time, creation_time);
661 info->integer_format = r->integer_format;
662 info->float_format = r->float_format;
663 info->compressed = r->compressed;
664 info->case_cnt = r->case_cnt;
/* The product name is derived from the eye catcher: the fixed
   "@(#) SPSS DATA FILE" text is matched against its start (presumably
   removing it when present), spaces are trimmed, and the result is
   copied into INFO->product, truncated to fit. */
666 product = ss_cstr (eye_catcher);
667 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
668 ss_trim (&product, ss_cstr (" "));
669 str_copy_buf_trunc (info->product, sizeof info->product,
670 ss_data (product), ss_length (product));
672 *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
675 /* Reads a variable (type 2) record from R into RECORD. */
677 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
679 int has_variable_label;
681 memset (record, 0, sizeof *record);
/* RECORD->pos remembers where the record began so that diagnostics
   about it can cite that file position.  The fixed fields follow in
   on-disk order. */
683 record->pos = r->pos;
684 record->width = read_int (r);
685 has_variable_label = read_int (r);
686 record->missing_value_code = read_int (r);
687 record->print_format = read_int (r);
688 record->write_format = read_int (r);
689 read_bytes (r, record->name, sizeof record->name);
691 if (has_variable_label == 1)
693 enum { MAX_LABEL_LEN = 255 };
694 size_t len, read_len;
698 /* Read up to MAX_LABEL_LEN bytes of label. */
699 read_len = MIN (MAX_LABEL_LEN, len);
/* NOTE(review): the label is allocated with xmalloc, whereas the other
   record readers in this file allocate from R's pool; confirm who is
   responsible for freeing it. */
700 record->label = xmalloc (read_len + 1);
701 read_string (r, record->label, read_len + 1);
703 /* Skip unread label bytes. */
704 skip_bytes (r, len - read_len);
706 /* Skip label padding up to multiple of 4 bytes. */
707 skip_bytes (r, ROUND_UP (len, 4) - len);
709 else if (has_variable_label != 0)
710 sys_error (r, record->pos,
711 _("Variable label indicator field is not 0 or 1."));
713 /* Set missing values. */
/* The code is range-checked first: numeric variables (width 0) accept
   -3, -2, 1, 2 or 3 and string variables accept 1, 2 or 3.  The
   magnitude of the code is the number of 8-byte values that are then
   read into RECORD->missing, so at most 24 bytes are read (assuming
   sys_error does not return). */
714 if (record->missing_value_code != 0)
716 int code = record->missing_value_code;
717 if (record->width == 0)
719 if (code < -3 || code > 3 || code == -1)
720 sys_error (r, record->pos,
721 _("Numeric missing value indicator field is not "
722 "-3, -2, 0, 1, 2, or 3."));
726 if (code < 1 || code > 3)
727 sys_error (r, record->pos,
728 _("String missing value indicator field is not "
732 read_bytes (r, record->missing, 8 * abs (code));
736 /* Reads a value label (type 3) record, and the variable index (type 4)
   record that must follow it, from R into RECORD.  N_VARS is the upper
   limit accepted for the number of variables that the type 4 record
   names. */
738 read_value_label_record (struct sfm_reader *r,
739 struct sfm_value_label_record *record,
744 /* Read type 3 record. */
745 record->pos = r->pos;
746 record->n_labels = read_int (r);
/* Reject a label count whose array size in bytes would overflow
   size_t. */
747 if (record->n_labels > SIZE_MAX / sizeof *record->labels)
748 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
750 record->labels = pool_nmalloc (r->pool, record->n_labels,
751 sizeof *record->labels);
752 for (i = 0; i < record->n_labels; i++)
754 struct sfm_value_label *label = &record->labels[i];
755 unsigned char label_len;
758 read_bytes (r, label->value, sizeof label->value);
760 /* Read label length. */
761 read_bytes (r, &label_len, sizeof label_len);
/* The length byte and the label text are padded together to a multiple
   of 8 bytes, so once the length byte has been consumed there are
   PADDED_LEN - 1 bytes of text and padding left to read.  The label is
   then terminated at its real length. */
762 padded_len = ROUND_UP (label_len + 1, 8);
764 /* Read label, padding. */
765 label->label = pool_malloc (r->pool, padded_len + 1);
766 read_bytes (r, label->label, padded_len - 1);
767 label->label[label_len] = '\0';
770 /* Read record type of type 4 record. */
771 if (read_int (r) != 4)
772 sys_error (r, r->pos - 4,
773 _("Variable index record (type 4) does not immediately "
774 "follow value label record (type 3) as it should."));
776 /* Read number of variables associated with value label from type 4
778 record->n_vars = read_int (r);
779 if (record->n_vars < 1 || record->n_vars > n_vars)
780 sys_error (r, r->pos - 4,
781 _("Number of variables associated with a value label (%d) "
782 "is not between 1 and the number of variables (%zu)."),
783 record->n_vars, n_vars);
784 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
785 for (i = 0; i < record->n_vars; i++)
786 record->vars[i] = read_int (r);
789 /* Reads a document record from R and returns it. */
790 static struct sfm_document_record *
791 read_document_record (struct sfm_reader *r)
793 struct sfm_document_record *record;
796 record = pool_malloc (r->pool, sizeof *record);
797 record->pos = r->pos;
799 n_lines = read_int (r);
/* Bounding N_LINES here keeps the DOC_LINE_LENGTH * n_lines products
   below within the range of int. */
800 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
801 sys_error (r, record->pos,
802 _("Number of document lines (%d) "
803 "must be greater than 0 and less than %d."),
804 n_lines, INT_MAX / DOC_LINE_LENGTH);
806 record->n_lines = n_lines;
/* The document text is kept as N_LINES consecutive fixed-width lines of
   DOC_LINE_LENGTH bytes each, exactly as read; no terminators are added
   here. */
807 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
808 read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
/* Reads the two integers that begin a type 7 extension record from R,
   the size of each data element and the number of elements, into
   RECORD, after noting the file position in RECORD->pos.  SUBTYPE is
   used only in the error message. */
814 read_extension_record_header (struct sfm_reader *r, int subtype,
815 struct sfm_extension_record *record)
817 record->pos = r->pos;
818 record->size = read_int (r);
819 record->count = read_int (r);
821 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
822 allows an extra byte for a null terminator, used by some
823 extension processing routines. */
/* NOTE(review): unlike the other diagnostics in this file, the message
   below is not wrapped in _() for translation. */
824 if (record->size != 0
825 && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
826 sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
829 /* Reads the type 7 extension record with the given SUBTYPE from R. The
   record structure is allocated from R's pool and its header is checked
   against the table of known subtypes below. */
830 static struct sfm_extension_record *
831 read_extension_record (struct sfm_reader *r, int subtype)
833 struct extension_record_type
840 static const struct extension_record_type types[] =
842 /* Implemented record types. */
/* Each entry appears to be { subtype, element size, element count }.
   A positive size or count must match the record's header exactly,
   otherwise a warning is issued; an entry with both set to 0 marks a
   subtype whose contents are ignored. */
843 { EXT_INTEGER, 4, 8 },
845 { EXT_MRSETS, 1, 0 },
846 { EXT_DISPLAY, 4, 0 },
847 { EXT_LONG_NAMES, 1, 0 },
848 { EXT_LONG_STRINGS, 1, 0 },
849 { EXT_NCASES, 8, 2 },
850 { EXT_FILE_ATTRS, 1, 0 },
851 { EXT_VAR_ATTRS, 1, 0 },
852 { EXT_MRSETS2, 1, 0 },
853 { EXT_ENCODING, 1, 0 },
854 { EXT_LONG_LABELS, 1, 0 },
856 /* Ignored record types. */
857 { EXT_VAR_SETS, 0, 0 },
859 { EXT_DATA_ENTRY, 0, 0 },
862 const struct extension_record_type *type;
863 struct sfm_extension_record *record;
866 record = pool_malloc (r->pool, sizeof *record);
867 read_extension_record_header (r, subtype, record);
868 n_bytes = record->count * record->size;
870 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
871 if (subtype == type->subtype)
873 if (type->size > 0 && record->size != type->size)
874 sys_warn (r, record->pos,
875 _("Record type 7, subtype %d has bad size %zu "
876 "(expected %d)."), subtype, record->size, type->size);
877 else if (type->count > 0 && record->count != type->count)
878 sys_warn (r, record->pos,
879 _("Record type 7, subtype %d has bad count %zu "
880 "(expected %d)."), subtype, record->count, type->count);
881 else if (type->count == 0 && type->size == 0)
883 /* Ignore this record. */
/* One byte more than the data is allocated and set to zero, so the
   contents are always null-terminated for the routines that treat them
   as text. */
887 char *data = pool_malloc (r->pool, n_bytes + 1);
888 data[n_bytes] = '\0';
891 read_bytes (r, record->data, n_bytes);
898 sys_warn (r, record->pos,
899 _("Unrecognized record type 7, subtype %d. Please send a "
900 "copy of this file, and the syntax which created it to %s."),
901 subtype, PACKAGE_BUGREPORT);
904 skip_bytes (r, n_bytes);
/* Reads the header of a type 7 extension record with the given SUBTYPE
   from R into a temporary record, then skips over its COUNT * SIZE
   bytes of data without storing them. */
909 skip_extension_record (struct sfm_reader *r, int subtype)
911 struct sfm_extension_record record;
913 read_extension_record_header (r, subtype, &record);
914 skip_bytes (r, record.count * record.size);
/* Recodes FILE_LABEL between DICT's encoding and UTF-8 (the result is
   allocated from R's pool), cuts it off after its last non-space
   character, and sets the result as DICT's label. */
918 parse_file_label (struct sfm_reader *r, const char *file_label,
919 struct dictionary *dict)
921 char *utf8_file_label;
922 size_t file_label_len;
924 utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
925 file_label, -1, r->pool);
926 file_label_len = strlen (utf8_file_label);
927 while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
929 utf8_file_label[file_label_len] = '\0';
930 dict_set_label (dict, utf8_file_label);
933 /* Parses the N_VAR_RECS variable (type 2) records in VAR_RECS and adds
934 the corresponding variables to DICT.
935 Also skips past additional variable records for long string variables. */
938 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
939 struct sfm_var_record *var_recs, size_t n_var_recs)
941 const char *dict_encoding = dict_get_encoding (dict);
942 struct sfm_var_record *rec;
945 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
947 struct variable *var;
952 name = recode_string_pool ("UTF-8", dict_encoding,
953 rec->name, 8, r->pool);
954 name[strcspn (name, " ")] = '\0';
956 if (!var_is_valid_name (name, false) || name[0] == '$' || name[0] == '#')
957 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
959 if (rec->width < 0 || rec->width > 255)
960 sys_error (r, rec->pos,
961 _("Bad width %d for variable %s."), rec->width, name);
963 var = rec->var = dict_create_var (dict, name, rec->width);
965 sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
967 /* Set the short name the same as the long name. */
968 var_set_short_name (var, 0, name);
970 /* Get variable label, if any. */
975 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
976 rec->label, -1, r->pool);
977 var_set_label (var, utf8_label);
980 /* Set missing values. */
981 if (rec->missing_value_code != 0)
983 int width = var_get_width (var);
984 struct missing_values mv;
986 mv_init_pool (r->pool, &mv, width);
987 if (var_is_numeric (var))
/* A negative code means that a range is present.  In that case
   N_DISCRETE is 1 only when the code is -3 (a range plus one discrete
   value) and 0 otherwise; a positive code is itself the number of
   discrete values. */
989 bool has_range = rec->missing_value_code < 0;
990 int n_discrete = (has_range
991 ? rec->missing_value_code == -3
992 : rec->missing_value_code);
997 double low = parse_float (r, rec->missing, 0);
998 double high = parse_float (r, rec->missing, 8);
999 mv_add_range (&mv, low, high);
1003 for (i = 0; i < n_discrete; i++)
1005 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1013 value_init_pool (r->pool, &value, width);
1014 value_set_missing (&value, width);
1015 for (i = 0; i < rec->missing_value_code; i++)
1017 uint8_t *s = value_str_rw (&value, width);
/* String missing values are stored 8 bytes apiece in REC->missing; only
   the first MIN (width, 8) bytes of each are copied into the value. */
1018 memcpy (s, rec->missing + 8 * i, MIN (width, 8));
1019 mv_add_str (&mv, s);
1022 var_set_missing_values (var, &mv);
/* REC->pos + 12 and REC->pos + 16 are passed as the file positions that
   diagnostics about the print and write formats should cite. */
1026 parse_format_spec (r, rec->pos + 12, rec->print_format,
1027 PRINT_FORMAT, var, &n_warnings);
1028 parse_format_spec (r, rec->pos + 16, rec->write_format,
1029 WRITE_FORMAT, var, &n_warnings);
1031 /* Account for values.
1032 Skip long string continuation records, if any. */
/* A numeric variable takes one value and a string of width W takes
   DIV_RND_UP (W, 8) values.  Every value after the first must be backed
   by a following record whose width is -1. */
1033 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1034 for (i = 1; i < n_values; i++)
1035 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1036 sys_error (r, rec->pos, _("Missing string continuation record."));
1041 /* Translates the format spec from sysfile format to internal format. */
1044 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1045 enum which_format which, struct variable *v,
1048 const int max_warnings = 8;
1049 uint8_t raw_type = format >> 16;
1050 uint8_t w = format >> 8;
1056 if (!fmt_from_io (raw_type, &f.type))
1057 sys_error (r, pos, _("Unknown variable format %"PRIu8"."), raw_type);
1062 ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
1067 if (which == PRINT_FORMAT)
1068 var_set_print_format (v, &f);
1070 var_set_write_format (v, &f);
1072 else if (++*n_warnings <= max_warnings)
1074 char fmt_string[FMT_STRING_LEN_MAX + 1];
1075 sys_warn (r, pos, _("%s variable %s has invalid %s format %s."),
1076 var_is_numeric (v) ? _("Numeric") : _("String"),
1078 which == PRINT_FORMAT ? _("print") : _("write"),
1079 fmt_to_string (&f, fmt_string));
1081 if (*n_warnings == max_warnings)
1082 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1087 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1091 for (p = record->documents;
1092 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1093 p += DOC_LINE_LENGTH)
1095 struct substring line;
1097 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1098 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1099 ss_rtrim (&line, ss_cstr (" "));
1100 line.string[line.length] = '\0';
1102 dict_add_document_line (dict, line.string);
1108 /* Parses record type 7, subtype 3. */
1110 parse_machine_integer_info (struct sfm_reader *r,
1111 const struct sfm_extension_record *record,
1112 struct sfm_read_info *info)
/* RECORD->data is read as integers at fixed byte offsets: 0, 4 and 8
   hold the version numbers stored into INFO, 16 holds the
   floating-point representation code, and 24 holds the integer
   representation code. */
1114 int float_representation, expected_float_format;
1115 int integer_representation, expected_integer_format;
1117 /* Save version info. */
1118 info->version_major = parse_int (r, record->data, 0);
1119 info->version_minor = parse_int (r, record->data, 4);
1120 info->version_revision = parse_int (r, record->data, 8);
1122 /* Check floating point format. */
1123 float_representation = parse_int (r, record->data, 16);
1124 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1125 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1126 expected_float_format = 1;
1127 else if (r->float_format == FLOAT_Z_LONG)
1128 expected_float_format = 2;
1129 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1130 expected_float_format = 3;
/* A floating-point code that disagrees with the format detected from
   the file header is reported with sys_error... */
1133 if (float_representation != expected_float_format)
1134 sys_error (r, record->pos, _("Floating-point representation indicated by "
1135 "system file (%d) differs from expected (%d)."),
1136 float_representation, expected_float_format);
1138 /* Check integer format. */
1139 integer_representation = parse_int (r, record->data, 24);
1140 if (r->integer_format == INTEGER_MSB_FIRST)
1141 expected_integer_format = 1;
1142 else if (r->integer_format == INTEGER_LSB_FIRST)
1143 expected_integer_format = 2;
/* ...whereas a disagreeing integer code only produces a warning. */
1146 if (integer_representation != expected_integer_format)
1147 sys_warn (r, record->pos,
1148 _("Integer format indicated by system file (%d) "
1149 "differs from expected (%d)."),
1150 integer_representation, expected_integer_format);
/* Chooses the name of the character encoding to use for the dictionary.
   The sources visible here, in order, are: the data of the EXT_ENCODING
   record; a "CP<n>" name, allocated from R's pool, built from the code
   page stored at byte offset 7 * 4 of the EXT_INTEGER record; and
   finally the locale's character set.  sfm_open_reader passes a null
   pointer for a record that was not stored from the file. */
1155 choose_encoding (struct sfm_reader *r,
1156 const struct sfm_extension_record *ext_integer,
1157 const struct sfm_extension_record *ext_encoding)
1159 /* The EXT_ENCODING record is a more reliable way to determine dictionary
1162 return ext_encoding->data;
1164 /* But EXT_INTEGER is better than nothing as a fallback. */
1167 int codepage = parse_int (r, ext_integer->data, 7 * 4);
1176 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
1177 respectively. However, there are known to be many files in the wild
1178 with character code 2, yet have data which are clearly not ASCII.
1179 Therefore we ignore these values. */
1192 return pool_asprintf (r->pool, "CP%d", codepage);
1196 return locale_charset ();
1199 /* Parses record type 7, subtype 4.

   Reads three doubles from RECORD's data, at byte offsets 0, 8, and 16, and
   issues a warning for each one that differs from this program's own
   SYSMIS, HIGHEST, or LOWEST value, respectively. */
1201 parse_machine_float_info (struct sfm_reader *r,
1202 const struct sfm_extension_record *record)
1204 double sysmis = parse_float (r, record->data, 0);
1205 double highest = parse_float (r, record->data, 8);
1206 double lowest = parse_float (r, record->data, 16);
1208 if (sysmis != SYSMIS)
1209 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1212 if (highest != HIGHEST)
1213 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1214 highest, "HIGHEST");
1216 if (lowest != LOWEST)
1217 sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
1221 /* Parses record type 7, subtype 7 or 19.

   The record is read as text.  Each set starts with a name terminated by
   `=' (a name that does not begin with `$' is warned about), then a type
   letter: `C' makes an MRSET_MC set, `D' an MRSET_MD set with
   MRSET_VARLABELS categories, and `E' an MRSET_MD set with
   MRSET_COUNTEDVALUES categories followed by a label-source number, where
   "11" means the set's label comes from a variable label and anything other
   than "1" or "11" is warned about.  MRSET_MD sets then carry a counted
   string holding the counted value.  A counted-string label and a list of
   variable names, separated by spaces and ended by a new-line, follow.  A
   set with fewer than two variables is destroyed with a warning; otherwise
   it is added to DICT. */
1223 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1224 struct dictionary *dict)
1226 struct text_record *text;
1227 struct mrset *mrset;
1229 text = open_text_record (r, record);
1232 const char *counted = NULL;
1235 struct stringi_set var_names;
1236 size_t allocated_vars;
1240 mrset = xzalloc (sizeof *mrset);
1242 name = text_get_token (text, ss_cstr ("="), NULL);
1245 mrset->name = xstrdup (name);
1247 if (mrset->name[0] != '$')
1249 sys_warn (r, record->pos,
1250 _("`%s' does not begin with `$' at UTF-8 offset %zu "
1251 "in MRSETS record."), mrset->name, text_pos (text));
1255 if (text_match (text, 'C'))
1257 mrset->type = MRSET_MC;
1258 if (!text_match (text, ' '))
1260 sys_warn (r, record->pos,
1261 _("Missing space following `%c' at UTF-8 offset %zu "
1262 "in MRSETS record."), 'C', text_pos (text));
1266 else if (text_match (text, 'D'))
1268 mrset->type = MRSET_MD;
1269 mrset->cat_source = MRSET_VARLABELS;
1271 else if (text_match (text, 'E'))
1275 mrset->type = MRSET_MD;
1276 mrset->cat_source = MRSET_COUNTEDVALUES;
1277 if (!text_match (text, ' '))
1279 sys_warn (r, record->pos,
1280 _("Missing space following `%c' at UTF-8 offset %zu "
1281 "in MRSETS record."), 'E', text_pos (text));
1285 number = text_get_token (text, ss_cstr (" "), NULL);
1286 if (!strcmp (number, "11"))
1287 mrset->label_from_var_label = true;
1288 else if (strcmp (number, "1"))
1289 sys_warn (r, record->pos,
1290 _("Unexpected label source value `%s' following `E' "
1291 "at UTF-8 offset %zu in MRSETS record."),
1292 number, text_pos (text));
1296 sys_warn (r, record->pos,
1297 _("Missing `C', `D', or `E' at UTF-8 offset %zu "
1298 "in MRSETS record."),
1303 if (mrset->type == MRSET_MD)
1305 counted = text_parse_counted_string (r, text);
1306 if (counted == NULL)
1310 label = text_parse_counted_string (r, text);
1313 mrset->label = label[0] != '\0' ? xstrdup (label) : NULL;
1315 stringi_set_init (&var_names);
1320 struct variable *var;
1321 const char *var_name;
1323 var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
1324 if (var_name == NULL)
1326 sys_warn (r, record->pos,
1327 _("Missing new-line parsing variable names "
1328 "at UTF-8 offset %zu in MRSETS record."),
1333 var = dict_lookup_var (dict, var_name);
1336 if (!stringi_set_insert (&var_names, var_name))
1338 sys_warn (r, record->pos,
1339 _("Duplicate variable name %s "
1340 "at UTF-8 offset %zu in MRSETS record."),
1341 var_name, text_pos (text));
1345 if (mrset->label == NULL && mrset->label_from_var_label
1346 && var_has_label (var))
1347 mrset->label = xstrdup (var_get_label (var));
1350 && var_get_type (var) != var_get_type (mrset->vars[0]))
1352 sys_warn (r, record->pos,
1353 _("MRSET %s contains both string and "
1354 "numeric variables."), name);
1357 width = MIN (width, var_get_width (var));
1359 if (mrset->n_vars >= allocated_vars)
1360 mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
1361 sizeof *mrset->vars);
1362 mrset->vars[mrset->n_vars++] = var;
1364 while (delimiter != '\n');
1366 if (mrset->n_vars < 2)
1368 sys_warn (r, record->pos,
1369 _("MRSET %s has only %zu variables."), mrset->name,
1371 mrset_destroy (mrset);
1375 if (mrset->type == MRSET_MD)
1377 mrset->width = width;
1378 value_init (&mrset->counted, width);
1380 mrset->counted.f = strtod (counted, NULL);
1382 value_copy_str_rpad (&mrset->counted, width,
1383 (const uint8_t *) counted, ' ');
1386 dict_add_mrset (dict, mrset);
1388 stringi_set_destroy (&var_names);
1390 mrset_destroy (mrset);
1391 close_text_record (r, text);
1394 /* Read record type 7, subtype 11, which specifies how variables
1395 should be displayed in GUI environments.

   RECORD's count must be either three times the number of variables in
   DICT (measure, width, and alignment for each variable) or twice that
   number (measure and alignment only); any other count is warned about.
   Measure codes 1 and 2 select MEASURE_NOMINAL and MEASURE_ORDINAL, and
   alignment codes 0 and 1 select ALIGN_LEFT and ALIGN_RIGHT.  A measure
   outside 1...3 or an alignment outside 0...2 is reported with a warning
   saying that default parameters are substituted. */
1397 parse_display_parameters (struct sfm_reader *r,
1398 const struct sfm_extension_record *record,
1399 struct dictionary *dict)
1401 bool includes_width;
1402 bool warned = false;
1407 n_vars = dict_get_var_cnt (dict);
1408 if (record->count == 3 * n_vars)
1409 includes_width = true;
1410 else if (record->count == 2 * n_vars)
1411 includes_width = false;
1414 sys_warn (r, record->pos,
1415 _("Extension 11 has bad count %zu (for %zu variables)."),
1416 record->count, n_vars);
1421 for (i = 0; i < n_vars; ++i)
1423 struct variable *v = dict_get_var (dict, i);
1424 int measure, width, align;
1426 measure = parse_int (r, record->data, ofs);
1431 width = parse_int (r, record->data, ofs);
1437 align = parse_int (r, record->data, ofs);
1440 /* SPSS 14 sometimes seems to set string variables' measure
1442 if (0 == measure && var_is_alpha (v))
1445 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1448 sys_warn (r, record->pos,
1449 _("Invalid variable display parameters for variable "
1450 "%zu (%s). Default parameters substituted."),
1451 i, var_get_name (v));
1456 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1457 : measure == 2 ? MEASURE_ORDINAL
1459 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1460 : align == 1 ? ALIGN_RIGHT
1463 /* Older versions (SPSS 9.0) sometimes set the display
1464 width to zero. This causes confusion in the GUI, so
1465 only set the width if it is nonzero. */
1467 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping VAR's short names.  Copies
   of all of VAR's short names (null entries stay null) are taken before
   dict_rename_var is called; afterwards each copy is set back on VAR and
   then freed. */
1472 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1473 const char *new_name)
1475 size_t n_short_names;
1479 /* Renaming a variable may clear its short names, but we
1480 want to retain them, so we save them and re-set them
1482 n_short_names = var_get_short_name_cnt (var);
1483 short_names = xnmalloc (n_short_names, sizeof *short_names);
1484 for (i = 0; i < n_short_names; i++)
1486 const char *s = var_get_short_name (var, i);
1487 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1490 /* Set long name. */
1491 dict_rename_var (dict, var, new_name);
1493 /* Restore short names. */
1494 for (i = 0; i < n_short_names; i++)
1496 var_set_short_name (var, i, short_names[i]);
1497 free (short_names[i]);
1502 /* Parses record type 7, subtype 13, which gives the long name that corresponds
1503 to each short name. Modifies variable names in DICT accordingly.

   Two kinds of bad mapping are reported with a warning: a long name that
   fails var_is_valid_name, and a long name that differs (ignoring case)
   from the variable's first short name but already names a variable in
   DICT.  Renaming goes through rename_var_and_save_short_names. */
1505 parse_long_var_name_map (struct sfm_reader *r,
1506 const struct sfm_extension_record *record,
1507 struct dictionary *dict)
1509 struct text_record *text;
1510 struct variable *var;
1515 /* Convert variable names to lowercase. */
1518 for (i = 0; i < dict_get_var_cnt (dict); i++)
1520 struct variable *var = dict_get_var (dict, i);
1523 new_name = xstrdup (var_get_name (var));
1524 str_lowercase (new_name);
1526 rename_var_and_save_short_names (dict, var, new_name);
1534 /* Rename each of the variables, one by one. (In a correctly constructed
1535 system file, this cannot create any intermediate duplicate variable names,
1536 because all of the new variable names are longer than any of the old
1537 variable names and thus there cannot be any overlaps.) */
1538 text = open_text_record (r, record);
1539 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
1541 /* Validate long name. */
1542 if (!var_is_valid_name (long_name, false))
1544 sys_warn (r, record->pos,
1545 _("Long variable mapping from %s to invalid "
1546 "variable name `%s'."),
1547 var_get_name (var), long_name);
1551 /* Identify any duplicates. */
1552 if (strcasecmp (var_get_short_name (var, 0), long_name)
1553 && dict_lookup_var (dict, long_name) != NULL)
1555 sys_warn (r, record->pos,
1556 _("Duplicate long variable name `%s'."), long_name);
1560 rename_var_and_save_short_names (dict, var, long_name);
1562 close_text_record (r, text);
1565 /* Reads record type 7, subtype 14, which gives the real length
1566 of each very long string. Rearranges DICT accordingly.

   For each variable=length pair, a length outside 1...MAX_STRING, or one
   that needs only a single segment, is warned about.  A variable whose
   segments would run past the end of DICT, or whose segment widths do not
   match what the length implies (both rounded up to a multiple of 8 before
   comparing), is reported through sys_error.  Otherwise each segment's first
   short name is recorded on the variable, the segments after the first are
   deleted from DICT, and the variable's width is set to the full length. */
1568 parse_long_string_map (struct sfm_reader *r,
1569 const struct sfm_extension_record *record,
1570 struct dictionary *dict)
1572 struct text_record *text;
1573 struct variable *var;
1576 text = open_text_record (r, record);
1577 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
1579 size_t idx = var_get_dict_index (var);
1585 length = strtol (length_s, NULL, 10);
1586 if (length < 1 || length > MAX_STRING)
1588 sys_warn (r, record->pos,
1589 _("%s listed as string of invalid length %s "
1590 "in very long string record."),
1591 var_get_name (var), length_s);
1595 /* Check segments. */
1596 segment_cnt = sfm_width_to_segments (length);
1597 if (segment_cnt == 1)
1599 sys_warn (r, record->pos,
1600 _("%s listed in very long string record with width %s, "
1601 "which requires only one segment."),
1602 var_get_name (var), length_s);
1605 if (idx + segment_cnt > dict_get_var_cnt (dict))
1606 sys_error (r, record->pos,
1607 _("Very long string %s overflows dictionary."),
1608 var_get_name (var));
1610 /* Get the short names from the segments and check their
1612 for (i = 0; i < segment_cnt; i++)
1614 struct variable *seg = dict_get_var (dict, idx + i);
1615 int alloc_width = sfm_segment_alloc_width (length, i);
1616 int width = var_get_width (seg);
1619 var_set_short_name (var, i, var_get_short_name (seg, 0));
1620 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
1621 sys_error (r, record->pos,
1622 _("Very long string with width %ld has segment %d "
1623 "of width %d (expected %d)."),
1624 length, i, width, alloc_width);
1626 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
1627 var_set_width (var, length);
1629 close_text_record (r, text);
1630 dict_compact_values (dict);
/* Applies the value labels in RECORD to the variables that RECORD lists.

   The label strings are first recoded to UTF-8 from DICT's encoding.  The
   variable indexes in RECORD are resolved through lookup_var_by_index.  All
   of the variables must have the same type as the first one, otherwise
   sys_error is called.  Every label is then added to every variable; a
   label that var_add_value_label rejects is reported as a duplicate.  The
   temporary arrays are allocated from R's pool and returned to it before
   the function finishes. */
1634 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
1635 const struct sfm_var_record *var_recs, size_t n_var_recs,
1636 const struct sfm_value_label_record *record)
1638 struct variable **vars;
1642 utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
1643 for (i = 0; i < record->n_labels; i++)
1644 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
1645 record->labels[i].label, -1,
1648 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
1649 for (i = 0; i < record->n_vars; i++)
1650 vars[i] = lookup_var_by_index (r, record->pos,
1651 var_recs, n_var_recs, record->vars[i]);
1653 for (i = 1; i < record->n_vars; i++)
1654 if (var_get_type (vars[i]) != var_get_type (vars[0]))
1655 sys_error (r, record->pos,
1656 _("Variables associated with value label are not all of "
1657 "identical type. Variable %s is %s, but variable "
1659 var_get_name (vars[0]),
1660 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
1661 var_get_name (vars[i]),
1662 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
1664 for (i = 0; i < record->n_vars; i++)
1666 struct variable *var = vars[i];
1670 width = var_get_width (var);
1672 sys_error (r, record->pos,
1673 _("Value labels may not be added to long string "
1674 "variables (e.g. %s) using records types 3 and 4."),
1675 var_get_name (var));
1677 for (j = 0; j < record->n_labels; j++)
1679 struct sfm_value_label *label = &record->labels[j];
1682 value_init (&value, width);
1684 value.f = parse_float (r, label->value, 0);
1686 memcpy (value_str_rw (&value, width), label->value, width);
1688 if (!var_add_value_label (var, &value, utf8_labels[j]))
1690 if (var_is_numeric (var))
1691 sys_warn (r, record->pos,
1692 _("Duplicate value label for %g on %s."),
1693 value.f, var_get_name (var));
1695 sys_warn (r, record->pos,
1696 _("Duplicate value label for `%.*s' on %s."),
1697 width, value_str (&value, width),
1698 var_get_name (var));
1701 value_destroy (&value, width);
1705 pool_free (r->pool, vars);
1706 for (i = 0; i < record->n_labels; i++)
1707 pool_free (r->pool, utf8_labels[i]);
1708 pool_free (r->pool, utf8_labels);
/* Looks up the variable for 1-based index IDX within the N_VAR_RECS
   records in VAR_RECS.  An index outside 1...N_VAR_RECS, or one whose
   record has a null variable pointer (a long string continuation), is
   reported through sys_error at file offset OFFSET. */
1711 static struct variable *
1712 lookup_var_by_index (struct sfm_reader *r, off_t offset,
1713 const struct sfm_var_record *var_recs, size_t n_var_recs,
1716 const struct sfm_var_record *rec;
1718 if (idx < 1 || idx > n_var_recs)
1720 sys_error (r, offset,
1721 _("Variable index %d not in valid range 1...%d."),
1726 rec = &var_recs[idx - 1];
1727 if (rec->var == NULL)
1729 sys_error (r, offset,
1730 _("Variable index %d refers to long string continuation."),
1738 /* Parses a set of custom attributes from TEXT into ATTRS.
1739 ATTRS may be a null pointer, in which case the attributes are
1740 read but discarded.

   Each attribute is a key terminated by `(' followed by one value per
   line.  A value of at least two bytes that starts and ends with a single
   quote is stored without the quotes; any other value is stored unchanged
   and has a "not quoted" message associated with it.  `)' ends an
   attribute's value list and `/' ends the whole set. */
1742 parse_attributes (struct sfm_reader *r, struct text_record *text,
1743 struct attrset *attrs)
1747 struct attribute *attr;
1751 /* Parse the key. */
1752 key = text_get_token (text, ss_cstr ("("), NULL);
1756 attr = attribute_create (key);
1757 for (index = 1; ; index++)
1759 /* Parse the value. */
1763 value = text_get_token (text, ss_cstr ("\n"), NULL);
1766 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
1771 length = strlen (value);
1772 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
1774 value[length - 1] = '\0';
1775 attribute_add_value (attr, value + 1);
1780 _("Attribute value %s[%d] is not quoted: %s."),
1782 attribute_add_value (attr, value);
1785 /* Was this the last value for this attribute? */
1786 if (text_match (text, ')'))
1790 attrset_add (attrs, attr);
1792 attribute_destroy (attr);
1794 while (!text_match (text, '/'));
1797 /* Reads record type 7, subtype 17, which lists custom
1798 attributes on the data file.

   Opens RECORD as a text record, parses it into DICT's own attribute set
   with parse_attributes, and closes the text record again. */
1800 parse_data_file_attributes (struct sfm_reader *r,
1801 const struct sfm_extension_record *record,
1802 struct dictionary *dict)
1804 struct text_record *text = open_text_record (r, record);
1805 parse_attributes (r, text, dict_get_attributes (dict));
1806 close_text_record (r, text);
1809 /* Parses record type 7, subtype 18, which lists custom
1810 attributes on individual variables.

   The record is a sequence of variable names, each terminated by `:' and
   followed by an attribute set.  When the name lookup leaves the variable
   pointer null, a null attribute set is passed to parse_attributes, so
   that variable's attributes are read but discarded. */
1812 parse_variable_attributes (struct sfm_reader *r,
1813 const struct sfm_extension_record *record,
1814 struct dictionary *dict)
1816 struct text_record *text;
1817 struct variable *var;
1819 text = open_text_record (r, record);
1820 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
1821 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
1822 close_text_record (r, text);
/* Reports a fatal error through sys_error if LENGTH bytes starting at byte
   offset OFS would extend past the end of RECORD's data, which is
   RECORD's size times its count bytes long.
   NOTE(review): the separate LENGTH >= end test presumably keeps a huge
   LENGTH from making OFS + LENGTH wrap around; confirm the intent. */
1826 check_overflow (struct sfm_reader *r,
1827 const struct sfm_extension_record *record,
1828 size_t ofs, size_t length)
1830 size_t end = record->size * record->count;
1831 if (length >= end || ofs + length > end)
1832 sys_error (r, record->pos + end,
1833 _("Long string value label record ends unexpectedly."));
/* Parses an extension record that holds value labels for long string
   variables and adds the labels to the matching variables in DICT.

   The record is a sequence of entries.  Each entry holds a 4-byte variable
   name length, the name itself, a 4-byte width, and a 4-byte label count,
   followed by that many labels.  Each label holds a 4-byte value length,
   the value bytes, a 4-byte label length, and the label bytes.  Names and
   labels are recoded to UTF-8 from DICT's encoding.

   Warnings are issued for entries that name an unknown variable, name a
   numeric variable, or give a width that differs from the variable's
   width, and for values whose length differs from the width.  Every read
   is preceded by check_overflow, which aborts if the record is too short. */
1837 parse_long_string_value_labels (struct sfm_reader *r,
1838 const struct sfm_extension_record *record,
1839 struct dictionary *dict)
1841 const char *dict_encoding = dict_get_encoding (dict);
1842 size_t end = record->size * record->count;
1849 struct variable *var;
1854 /* Parse variable name length. */
1855 check_overflow (r, record, ofs, 4);
1856 var_name_len = parse_int (r, record->data, ofs);
1859 /* Parse variable name, width, and number of labels. */
1860 check_overflow (r, record, ofs, var_name_len + 8);
1861 var_name = recode_string_pool ("UTF-8", dict_encoding,
1862 (const char *) record->data + ofs,
1863 var_name_len, r->pool);
1864 width = parse_int (r, record->data, ofs + var_name_len);
1865 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
1866 ofs += var_name_len + 8;
1868 /* Look up 'var' and validate. */
1869 var = dict_lookup_var (dict, var_name);
1871 sys_warn (r, record->pos + ofs,
1872 _("Ignoring long string value record for "
1873 "unknown variable %s."), var_name);
1874 else if (var_is_numeric (var))
1876 sys_warn (r, record->pos + ofs,
1877 _("Ignoring long string value record for "
1878 "numeric variable %s."), var_name);
1881 else if (width != var_get_width (var))
1883 sys_warn (r, record->pos + ofs,
1884 _("Ignoring long string value record for variable %s "
1885 "because the record's width (%d) does not match the "
1886 "variable's width (%d)."),
1887 var_name, width, var_get_width (var));
1892 value_init_pool (r->pool, &value, width);
1893 for (i = 0; i < n_labels; i++)
1895 size_t value_length, label_length;
1896 bool skip = var == NULL;
1898 /* Parse value length. */
1899 check_overflow (r, record, ofs, 4);
1900 value_length = parse_int (r, record->data, ofs);
1904 check_overflow (r, record, ofs, value_length);
1907 if (value_length == width)
1908 memcpy (value_str_rw (&value, width),
1909 (const uint8_t *) record->data + ofs, width);
1912 sys_warn (r, record->pos + ofs,
1913 _("Ignoring long string value %zu for variable "
1914 "%s, with width %d, that has bad value "
1916 i, var_get_name (var), width, value_length);
1920 ofs += value_length;
1922 /* Parse label length. */
1923 check_overflow (r, record, ofs, 4);
1924 label_length = parse_int (r, record->data, ofs);
1928 check_overflow (r, record, ofs, label_length);
1933 label = recode_string_pool ("UTF-8", dict_encoding,
1934 (const char *) record->data + ofs,
1935 label_length, r->pool);
1936 if (!var_add_value_label (var, &value, label))
1937 sys_warn (r, record->pos + ofs,
1938 _("Duplicate value label for `%.*s' on %s."),
1939 width, value_str (&value, width),
1940 var_get_name (var));
1941 pool_free (r->pool, label);
1943 ofs += label_length;
/* Forward declarations for the helpers that read case data; each of them
   is defined further down in this file. */
1950 static void partial_record (struct sfm_reader *r)
1953 static void read_error (struct casereader *, const struct sfm_reader *);
1955 static bool read_case_number (struct sfm_reader *, double *);
1956 static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
1957 static int read_opcode (struct sfm_reader *);
1958 static bool read_compressed_number (struct sfm_reader *, double *);
1959 static bool read_compressed_string (struct sfm_reader *, uint8_t *);
1960 static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
1961 static bool skip_whole_strings (struct sfm_reader *, size_t);
1963 /* Reads and returns one case from READER's file. Returns a null
1964 pointer if not successful.

   R_ is the struct sfm_reader for the file.  A case is created from R's
   prototype and filled in one sfm_var at a time: entries with a var_width
   of 0 are read as numbers with read_case_number; other entries are read
   as strings with read_case_string, after which the entry's padding,
   rounded down to a multiple of 8, is skipped.  If a longjmp lands on
   R->bail_out while reading, READER is forced into an error state. */
1965 static struct ccase *
1966 sys_file_casereader_read (struct casereader *reader, void *r_)
1968 struct sfm_reader *r = r_;
1969 struct ccase *volatile c;
1975 c = case_create (r->proto);
1976 if (setjmp (r->bail_out))
1978 casereader_force_error (reader);
1983 for (i = 0; i < r->sfm_var_cnt; i++)
1985 struct sfm_var *sv = &r->sfm_vars[i];
1986 union value *v = case_data_rw_idx (c, sv->case_index);
1988 if (sv->var_width == 0)
1990 if (!read_case_number (r, &v->f))
1995 uint8_t *s = value_str_rw (v, sv->var_width);
1996 if (!read_case_string (r, s + sv->offset, sv->segment_width))
1998 if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
2007 if (r->case_cnt != -1)
2008 read_error (reader, r);
2013 /* Issues an error that R ends in a partial record.  The error is raised
   through sys_error at R's current position, R->pos. */
2015 partial_record (struct sfm_reader *r)
2017 sys_error (r, r->pos, _("File ends in partial case."));
/* Reports a failure to read a case: emits an error message that names the
   file behind SFM's file handle, then forces casereader R into an error
   state. */
2020 /* Issues an error that an unspecified error occurred SFM, and
2023 read_error (struct casereader *r, const struct sfm_reader *sfm)
2025 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2026 casereader_force_error (r);
2029 /* Reads a number from R and stores its value in *D.
2030 If R is compressed, reads a compressed number;
2031 otherwise, reads a number in the regular way.
2032 Returns true if successful, false if end of file is
2033 reached immediately.

   The regular way reads the raw bytes with try_read_bytes and converts
   them from R->float_format to FLOAT_NATIVE_DOUBLE.  The compressed way
   delegates to read_compressed_number. */
2035 read_case_number (struct sfm_reader *r, double *d)
2040 if (!try_read_bytes (r, number, sizeof number))
2042 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2046 return read_compressed_number (r, d);
2049 /* Reads LENGTH string bytes from R into S.
2050 Always reads a multiple of 8 bytes; if LENGTH is not a
2051 multiple of 8, then extra bytes are read and discarded without
2053 Reads compressed strings if R is compressed.
2054 Returns true if successful, false if end of file is
2055 reached immediately.

   The part of LENGTH that is a whole multiple of 8 is read straight into
   S.  The remaining LENGTH % 8 bytes are obtained by reading one more full
   chunk into a bounce buffer and copying only the needed prefix to S. */
2057 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2059 size_t whole = ROUND_DOWN (length, 8);
2060 size_t partial = length % 8;
2064 if (!read_whole_strings (r, s, whole))
2071 if (!read_whole_strings (r, bounce, sizeof bounce))
2077 memcpy (s + whole, bounce, partial);
2083 /* Reads and returns the next compression opcode from R.  R must be
   compressed (this is asserted).  Opcodes are taken one at a time from
   R->opcodes; once R->opcode_idx has reached the size of that array, the
   array is refilled from the file with try_read_bytes. */
2085 read_opcode (struct sfm_reader *r)
2087 assert (r->compressed);
2091 if (r->opcode_idx >= sizeof r->opcodes)
2093 if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
2097 opcode = r->opcodes[r->opcode_idx++];
2104 /* Reads a compressed number from R and stores its value in D.
2105 Returns true if successful, false if end of file is
2106 reached immediately.

   Depending on the opcode returned by read_opcode, the value is either
   read from the file with read_float, produced by converting a blank
   string (with a corruption warning that is issued at most once per
   reader), or computed as the opcode minus R->bias. */
2108 read_compressed_number (struct sfm_reader *r, double *d)
2110 int opcode = read_opcode (r);
2118 *d = read_float (r);
2122 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2123 if (!r->corruption_warning)
2125 r->corruption_warning = true;
2126 sys_warn (r, r->pos,
2127 _("Possible compressed data corruption: "
2128 "compressed spaces appear in numeric field."));
2137 *d = opcode - r->bias;
2144 /* Reads a compressed 8-byte string segment from R and stores it
2146 Returns true if successful, false if end of file is
2147 reached immediately.

   Depending on the opcode returned by read_opcode, the 8 bytes are either
   read from the file with read_bytes, filled with spaces, or produced by
   converting the opcode minus R->bias to R's float format.  The last case
   can raise a corruption warning, which is issued at most once per
   reader. */
2149 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2151 int opcode = read_opcode (r);
2159 read_bytes (r, dst, 8);
2163 memset (dst, ' ', 8);
2168 double value = opcode - r->bias;
2169 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2172 /* This has actually been seen "in the wild". The submitter of the
2173 file that showed that the contents decoded as spaces, but they
2174 were at the end of the field so it's possible that the null
2175 bytes just acted as null terminators. */
2177 else if (!r->corruption_warning)
2179 r->corruption_warning = true;
2180 sys_warn (r, r->pos,
2181 _("Possible compressed data corruption: "
2182 "string contains compressed integer (opcode %d)."),
2192 /* Reads LENGTH string bytes from R into S.
2193 LENGTH must be a multiple of 8.
2194 Reads compressed strings if R is compressed.
2195 Returns true if successful, false if end of file is
2196 reached immediately.

   One path reads all LENGTH bytes in a single try_read_bytes call; the
   other calls read_compressed_string once per 8-byte segment. */
2198 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2200 assert (length % 8 == 0);
2202 return try_read_bytes (r, s, length);
2206 for (ofs = 0; ofs < length; ofs += 8)
2207 if (!read_compressed_string (r, s + ofs))
2217 /* Skips LENGTH string bytes from R.
2218 LENGTH must be a multiple of 8.
2219 (LENGTH is also limited to 1024, but that's only because the
2220 current caller never needs more than that many bytes.)
2221 Returns true if successful, false if end of file is
2222 reached immediately.

   The bytes are read into a local scratch buffer with read_whole_strings
   and then dropped.
   NOTE(review): the assertion requires LENGTH to be strictly less than the
   1024-byte buffer, so a LENGTH of exactly 1024 fails it even though the
   buffer could hold that many bytes. */
2224 skip_whole_strings (struct sfm_reader *r, size_t length)
2226 uint8_t buffer[1024];
2227 assert (length < sizeof buffer);
2228 return read_whole_strings (r, buffer, length);
/* State kept while tokenizing one extension record whose payload is text:
   the record contents recoded to UTF-8, the record's starting offset in
   the file, the current read position within the buffer, and a count of
   warnings that text_warn compares against MAX_TEXT_WARNINGS. */
2231 /* Helpers for reading records that contain structured text
2234 /* Maximum number of warnings to issue for a single text
2236 #define MAX_TEXT_WARNINGS 5
2241 struct substring buffer; /* Record contents, in UTF-8. */
2242 off_t start; /* Starting offset in file. */
2243 size_t pos; /* Current position in buffer. */
2244 int n_warnings; /* Number of warnings issued or suppressed. */
/* Creates a text_record for RECORD, allocated from R's pool.  RECORD's
   data (its size times its count bytes) is recoded to UTF-8 from R's
   encoding into a pool-allocated buffer, the starting file offset is taken
   from RECORD's position, and the warning count starts at 0. */
2247 static struct text_record *
2248 open_text_record (struct sfm_reader *r,
2249 const struct sfm_extension_record *record)
2251 struct text_record *text;
2252 struct substring raw;
2254 text = pool_alloc (r->pool, sizeof *text);
2255 raw = ss_buffer (record->data, record->size * record->count);
2256 text->start = record->pos;
2257 text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool);
2259 text->n_warnings = 0;
2264 /* Closes TEXT, frees its storage, and issues a final warning
2265 about suppressed warnings if necessary.  The final warning reports how
   many warnings beyond MAX_TEXT_WARNINGS were counted for TEXT. */
2267 close_text_record (struct sfm_reader *r, struct text_record *text)
2269 if (text->n_warnings > MAX_TEXT_WARNINGS)
2270 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2271 text->n_warnings - MAX_TEXT_WARNINGS);
2272 pool_free (r->pool, ss_data (text->buffer));
2275 /* Reads a variable=value pair from TEXT.
2276 Looks up the variable in DICT and stores it into *VAR.
2277 Stores a null-terminated value into *VALUE.

   The variable is identified by its short name, which ends at `='.  The
   value ends at the next tab or null byte.
   NOTE(review): the ss_span call afterwards appears to step over any
   further run of tabs and null bytes before the next pair; confirm
   against ss_span's definition. */
2279 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2280 struct text_record *text,
2281 struct variable **var, char **value)
2285 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2288 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2292 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2293 ss_buffer ("\t\0", 2));
/* Reads the next token from TEXT, ending at any of DELIMITERS, and looks
   it up as a variable name in DICT, storing the lookup result in *VAR.  A
   name that DICT does not know is reported through text_warn. */
2301 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2302 struct text_record *text, struct substring delimiters,
2303 struct variable **var)
2307 name = text_get_token (text, delimiters, NULL);
2311 *var = dict_lookup_var (dict, name);
2315 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads the next token from TEXT, ending at any of DELIMITERS, treats it
   as a short variable name, and stores the result of looking it up in DICT
   into *VAR.  A missing token is detected by a null return from
   text_get_token; an unknown name is reported through text_warn. */
2322 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
2323 struct text_record *text, struct substring delimiters,
2324 struct variable **var)
2326 char *short_name = text_get_token (text, delimiters, NULL);
2327 if (short_name == NULL)
2330 *var = dict_lookup_var (dict, short_name);
2332 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
2337 /* Displays a warning for the current file position, limiting the
2338 number to MAX_TEXT_WARNINGS for TEXT.

   Every call increments TEXT's warning count, whether or not the message
   is shown.  The offset reported is TEXT's starting file offset plus its
   current position in the buffer. */
2340 text_warn (struct sfm_reader *r, struct text_record *text,
2341 const char *format, ...)
2343 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
2347 va_start (args, format);
2348 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts the next token from TEXT's buffer with ss_tokenize, splitting
   at any byte in DELIMITERS and updating TEXT's position, and returns a
   pointer to the token's data inside the buffer.  A non-null DELIMITER
   requests additional output about the delimiter that ended the token.
   NOTE(review): callers test the result against NULL, so a failed
   ss_tokenize presumably yields a null return; the statement itself is not
   visible here. */
2354 text_get_token (struct text_record *text, struct substring delimiters,
2357 struct substring token;
2360 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
2363 end = &ss_data (token)[ss_length (token)];
2364 if (delimiter != NULL)
2367 return ss_data (token);
2370 /* Reads an integer value expressed in decimal, then a space, then a string that
2371 consists of exactly as many bytes as specified by the integer, then a space,
2372 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
2373 buffer (so the caller should not free the string).

   A warning is issued through sys_warn when no digit is found, when the
   space after the count is missing, when the counted string would extend
   past the end of TEXT's buffer, or when the space after the string is
   missing. */
2375 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
2385 int c = text->buffer.string[text->pos];
2386 if (c < '0' || c > '9')
2388 n = (n * 10) + (c - '0');
2391 if (start == text->pos)
2393 sys_warn (r, text->start,
2394 _("Expecting digit at UTF-8 offset %zu in MRSETS record."),
2399 if (!text_match (text, ' '))
2401 sys_warn (r, text->start,
2402 _("Expecting space at UTF-8 offset %zu in MRSETS record."),
2407 if (text->pos + n > text->buffer.length)
2409 sys_warn (r, text->start,
2410 _("%zu-byte string starting at UTF-8 offset %zu "
2411 "exceeds record length %zu."),
2412 n, text->pos, text->buffer.length);
2416 s = &text->buffer.string[text->pos];
2419 sys_warn (r, text->start,
2420 _("Expecting space at UTF-8 offset %zu following %zu-byte "
/* Tests whether the byte at TEXT's current position equals C.
   NOTE(review): callers use the result as a boolean and expect a matched
   byte to be consumed; the statements that do so are not visible here. */
2431 text_match (struct text_record *text, char c)
2433 if (text->buffer.string[text->pos] == c)
2442 /* Returns the current byte offset (as converted to UTF-8) inside the TEXT's
2445 text_pos (const struct text_record *text)
2452 /* Displays a corruption message.

   The message text starts with the name of R's file, followed on one path
   by OFFSET printed in hexadecimal, and continues with FORMAT expanded
   using ARGS.  CLASS determines the message's category and severity.  No
   source file name, line number, or column is attached to the message. */
2454 sys_msg (struct sfm_reader *r, off_t offset,
2455 int class, const char *format, va_list args)
2460 ds_init_empty (&text);
2462 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
2463 fh_get_file_name (r->fh), (long long int) offset);
2465 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
2466 ds_put_vformat (&text, format, args);
2468 m.category = msg_class_to_category (class);
2469 m.severity = msg_class_to_severity (class);
2470 m.where.file_name = NULL;
2471 m.where.line_number = 0;
2472 m.where.first_column = 0;
2473 m.where.last_column = 0;
2474 m.text = ds_cstr (&text);
2479 /* Displays a warning for offset OFFSET in the file.  FORMAT and the
   variable arguments are passed to sys_msg with message class MW. */
2481 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
2485 va_start (args, format);
2486 sys_msg (r, offset, MW, format, args);
2490 /* Displays an error for the current file position,
2491 marks it as in an error state,
2492 and aborts reading it using longjmp.

   The message is produced by sys_msg with message class ME.  Control then
   transfers to R->bail_out with the value 1, so this function does not
   return to its caller. */
2494 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
2498 va_start (args, format);
2499 sys_msg (r, offset, ME, format, args);
2503 longjmp (r->bail_out, 1);
/* Shared implementation behind read_bytes and try_read_bytes.  Besides
   reading, it advances R->pos by the number of bytes actually read, and it
   reports failures through sys_error: the strerror text when the stream
   has its error flag set, "Unexpected end of file." otherwise. */
2506 /* Reads BYTE_CNT bytes into BUF.
2507 Returns true if exactly BYTE_CNT bytes are successfully read.
2508 Aborts if an I/O error or a partial read occurs.
2509 If EOF_IS_OK, then an immediate end-of-file causes false to be
2510 returned; otherwise, immediate end-of-file causes an abort
2513 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
2514 void *buf, size_t byte_cnt)
2516 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
2517 r->pos += bytes_read;
2518 if (bytes_read == byte_cnt)
2520 else if (ferror (r->file))
2521 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
2522 else if (!eof_is_ok || bytes_read != 0)
2523 sys_error (r, r->pos, _("Unexpected end of file."));
2528 /* Reads BYTE_CNT into BUF.
2529 Aborts upon I/O error or if end-of-file is encountered.
   Implemented as read_bytes_internal with EOF_IS_OK set to false. */
2531 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2533 read_bytes_internal (r, false, buf, byte_cnt);
2536 /* Reads BYTE_CNT bytes into BUF.
2537 Returns true if exactly BYTE_CNT bytes are successfully read.
2538 Returns false if an immediate end-of-file is encountered.
2539 Aborts if an I/O error or a partial read occurs.
   Implemented as read_bytes_internal with EOF_IS_OK set to true. */
2541 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
2543 return read_bytes_internal (r, true, buf, byte_cnt);
/* The raw bytes are fetched with read_bytes, so an I/O error or
   end-of-file aborts instead of returning, and are decoded according to
   R->integer_format. */
2546 /* Reads a 32-bit signed integer from R and returns its value in
2549 read_int (struct sfm_reader *r)
2552 read_bytes (r, integer, sizeof integer);
2553 return integer_get (r->integer_format, integer, sizeof integer);
2556 /* Reads a 64-bit floating-point number from R and returns its
2557 value in host format.  The raw bytes are fetched with read_bytes, so an
   I/O error or end-of-file aborts instead of returning, and are decoded
   according to R->float_format. */
2559 read_float (struct sfm_reader *r)
2562 read_bytes (r, number, sizeof number);
2563 return float_get_double (r->float_format, number);
/* Decodes and returns the 4-byte integer that starts OFS bytes into DATA,
   using R's integer format.  Nothing is read from the file, and no bounds
   check is made here. */
2567 parse_int (struct sfm_reader *r, const void *data, size_t ofs)
2569 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
/* Decodes and returns the floating-point number that starts OFS bytes into
   DATA, using R's float format.  Nothing is read from the file, and no
   bounds check is made here. */
2573 parse_float (struct sfm_reader *r, const void *data, size_t ofs)
2575 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
2578 /* Reads exactly SIZE - 1 bytes into BUFFER
2579 and stores a null byte into BUFFER[SIZE - 1].  The bytes are fetched
   with read_bytes, so an I/O error or end-of-file aborts instead of
   returning. */
2581 read_string (struct sfm_reader *r, char *buffer, size_t size)
2584 read_bytes (r, buffer, size - 1);
2585 buffer[size - 1] = '\0';
2588 /* Skips BYTES bytes forward in R.  The bytes are consumed by reading
   them with read_bytes into a local scratch buffer, in chunks no larger
   than that buffer. */
2590 skip_bytes (struct sfm_reader *r, size_t bytes)
2595 size_t chunk = MIN (sizeof buffer, bytes);
2596 read_bytes (r, buffer, chunk);
2601 static const struct casereader_class sys_file_casereader_class =
2603 sys_file_casereader_read,
2604 sys_file_casereader_destroy,