1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016, 2021 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "data/varset.h"
45 #include "libpspp/array.h"
46 #include "libpspp/assertion.h"
47 #include "libpspp/compiler.h"
48 #include "libpspp/i18n.h"
49 #include "libpspp/ll.h"
50 #include "libpspp/message.h"
51 #include "libpspp/misc.h"
52 #include "libpspp/pool.h"
53 #include "libpspp/str.h"
54 #include "libpspp/stringi-set.h"
56 #include "gl/c-strtod.h"
57 #include "gl/c-ctype.h"
58 #include "gl/inttostr.h"
59 #include "gl/localcharset.h"
60 #include "gl/minmax.h"
61 #include "gl/unlocked-io.h"
62 #include "gl/xalloc.h"
63 #include "gl/xalloc-oversized.h"
67 #define _(msgid) gettext (msgid)
68 #define N_(msgid) (msgid)
/* Subtypes of type 7 (extension) records.  The reader keeps the records it
   reads in its extensions[] array, indexed by these subtype numbers. */
72 /* subtypes 0-2 unknown */
73 EXT_INTEGER = 3, /* Machine integer info. */
74 EXT_FLOAT = 4, /* Machine floating-point info. */
75 EXT_VAR_SETS = 5, /* Variable sets. */
76 EXT_DATE = 6, /* DATE. */
77 EXT_MRSETS = 7, /* Multiple response sets. */
78 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
79 /* subtype 9 unknown */
80 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
81 EXT_DISPLAY = 11, /* Variable display parameters. */
82 /* subtype 12 unknown */
83 EXT_LONG_NAMES = 13, /* Long variable names. */
84 EXT_LONG_STRINGS = 14, /* Long strings. */
85 /* subtype 15 unknown */
86 EXT_NCASES = 16, /* Extended number of cases. */
87 EXT_FILE_ATTRS = 17, /* Data file attributes. */
88 EXT_VAR_ATTRS = 18, /* Variable attributes. */
89 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
90 EXT_ENCODING = 20, /* Character encoding. */
91 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
92 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
93 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
96 /* Fields from the top-level header record, in the form in which
   read_header() stores them. */
97 struct sfm_header_record
99 char magic[5]; /* First 4 bytes of file, then null. */
100 int weight_idx; /* 0 if unweighted, otherwise a var index. */
101 int nominal_case_size; /* Number of var positions.  read_header() stores
   -1 here when the value in the file is negative or exceeds INT_MAX / 16. */
103 /* These correspond to the members of struct any_read_info or a dictionary
104 but in the system file's encoding rather than ASCII. */
105 char creation_date[10]; /* "dd mmm yy". */
106 char creation_time[9]; /* "hh:mm:ss". */
107 char eye_catcher[61]; /* Eye-catcher string, then product name. */
108 char file_label[65]; /* File label. */
/* A variable (type 2) record, filled in by read_variable_record(). */
111 struct sfm_var_record
/* Missing value indicator from the file.  When it is nonzero,
   read_variable_record() validates it and then reads 8 * abs (code) bytes of
   missing values. */
118 int missing_value_code;
/* NOTE(review): presumably the dictionary variable created from this
   record -- confirm against parse_variable_records(). */
121 struct variable *var;
/* One value and its label within a value label record. */
124 struct sfm_value_label
/* A value label (type 3) record together with the variable index (type 4)
   record that follows it; filled in by read_value_label_record(). */
130 struct sfm_value_label_record
133 struct sfm_value_label *labels;
134 unsigned int n_labels;
/* A document (type 6) record. */
140 struct sfm_document_record
/* Members describing one multiple response set read from the file.
   NOTE(review): the opening line of the enclosing struct is not visible in
   this excerpt; presumably struct sfm_mrset -- confirm. */
149 const char *name; /* Name. */
150 const char *label; /* Human-readable label for group. */
151 enum mrset_type type; /* Group type. */
152 const char **vars; /* Constituent variables' names. */
153 size_t n_vars; /* Number of constituent variables. */
156 enum mrset_md_cat_source cat_source; /* Source of category labels. */
157 bool label_from_var_label; /* 'label' taken from variable label? */
158 const char *counted; /* Counted value, as string. */
/* An extension (type 7) record, as produced by read_extension_record(). */
161 struct sfm_extension_record
163 struct ll ll; /* In struct sfm_reader 'var_attrs' list (read_record()
   links subtype 18 records there). */
164 int subtype; /* Record subtype. */
165 off_t pos; /* Starting offset in file. */
166 unsigned int size; /* Size of data elements. */
167 unsigned int count; /* Number of data elements. */
168 void *data; /* Contents. */
171 /* System file reader. */
174 struct any_reader any_reader; /* Embedded base; see sfm_reader_cast(). */
176 /* Resource tracking. */
177 struct pool *pool; /* All system file state. */
180 struct any_read_info info; /* Filled in by read_header() and parsers. */
181 struct sfm_header_record header; /* Raw header fields. */
182 struct sfm_var_record *vars; /* Variable records, in the order read. */
184 struct sfm_value_label_record *labels; /* Value label records. */
186 struct sfm_document_record *document; /* Null if none was read. */
187 struct sfm_mrset *mrsets; /* Multiple response sets. */
189 struct sfm_extension_record *extensions[32]; /* Indexed by subtype; null
   where no record of that subtype has been stored. */
190 struct ll_list var_attrs; /* Contains "struct sfm_extension_record"s. */
193 struct file_handle *fh; /* File handle. */
194 struct fh_lock *lock; /* Mutual exclusion for file handle. */
195 FILE *file; /* File stream. */
196 off_t pos; /* Position in file. */
197 bool error; /* I/O or corruption error? */
198 struct caseproto *proto; /* Format of output cases. */
201 enum integer_format integer_format; /* On-disk integer format. */
202 enum float_format float_format; /* On-disk floating point format. */
203 struct sfm_var *sfm_vars; /* Variables. */
204 size_t sfm_n_vars; /* Number of variables. */
205 int n_cases; /* Number of cases; -1 makes sfm_decode() report
   CASENUMBER_MAX to the casereader. */
206 const char *encoding; /* String encoding. */
207 bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */
210 enum any_compression compression;
211 double bias; /* Compression bias, usually 100.0. */
212 uint8_t opcodes[8]; /* Current block of opcodes. */
213 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
214 bool corruption_warning; /* Warned about possible corruption? */
216 /* ZLIB decompression. */
217 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
218 #define ZIN_BUF_SIZE 4096
219 uint8_t *zin_buf; /* Inflation input buffer. */
220 #define ZOUT_BUF_SIZE 16384
221 uint8_t *zout_buf; /* Inflation output buffer. */
222 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
223 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
224 z_stream zstream; /* ZLIB inflater. */
227 static const struct casereader_class sys_file_casereader_class;
/* Converts R_ into the struct sfm_reader that contains it.  R_ must belong to
   sys_file_reader_class; this is checked with an assertion. */
229 static struct sfm_reader *
230 sfm_reader_cast (const struct any_reader *r_)
232 assert (r_->klass == &sys_file_reader_class);
233 return UP_CAST (r_, struct sfm_reader, any_reader);
236 static bool sfm_close (struct any_reader *);
238 static void sys_msg (struct sfm_reader *r, off_t, int class,
239 const char *format, va_list args)
240 PRINTF_FORMAT (4, 0);
241 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
242 PRINTF_FORMAT (3, 4);
243 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
244 PRINTF_FORMAT (3, 4);
246 static bool read_bytes (struct sfm_reader *, void *, size_t)
248 static int try_read_bytes (struct sfm_reader *, void *, size_t)
250 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
251 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
252 static bool read_int64 (struct sfm_reader *, long long int *)
254 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
256 static bool read_string (struct sfm_reader *, char *, size_t)
258 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
260 /* ZLIB compressed data handling. */
261 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
262 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
263 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
264 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
266 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
268 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
270 static bool read_compressed_float (struct sfm_reader *, double *)
273 static char *fix_line_ends (const char *);
275 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
276 static double parse_float (const struct sfm_reader *,
277 const void *data, size_t ofs);
279 static bool read_variable_record (struct sfm_reader *,
280 struct sfm_var_record *);
281 static bool read_value_label_record (struct sfm_reader *,
282 struct sfm_value_label_record *);
283 static bool read_document_record (struct sfm_reader *);
284 static bool read_extension_record (struct sfm_reader *, int subtype,
285 struct sfm_extension_record **);
286 static bool skip_extension_record (struct sfm_reader *, int subtype);
288 static struct text_record *open_text_record (
289 struct sfm_reader *, const struct sfm_extension_record *,
290 bool recode_to_utf8);
291 static void close_text_record (struct sfm_reader *,
292 struct text_record *);
293 static bool read_variable_to_value_pair (struct sfm_reader *,
295 struct text_record *,
296 struct variable **var, char **value);
297 static void text_warn (struct sfm_reader *r, struct text_record *text,
298 const char *format, ...) PRINTF_FORMAT (3, 4);
299 static char *text_get_token (struct text_record *,
300 struct substring delimiters, char *delimiter);
301 static bool text_match (struct text_record *, char c);
302 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
303 struct text_record *,
304 struct substring delimiters,
306 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
307 struct text_record *,
308 struct substring delimiters,
310 static const char *text_parse_counted_string (struct sfm_reader *,
311 struct text_record *);
312 static size_t text_pos (const struct text_record *);
313 static const char *text_get_all (const struct text_record *);
315 /* Dictionary reader. */
323 static bool read_dictionary (struct sfm_reader *);
324 static bool read_record (struct sfm_reader *, int type,
325 size_t *allocated_vars, size_t *allocated_labels);
326 static bool read_header (struct sfm_reader *, struct any_read_info *,
327 struct sfm_header_record *);
328 static void parse_header (struct sfm_reader *,
329 const struct sfm_header_record *,
330 struct any_read_info *, struct dictionary *);
331 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
332 struct sfm_var_record *, size_t n);
333 static void parse_format_spec (struct sfm_reader *, off_t pos,
334 unsigned int format, enum which_format,
335 struct variable *, int *format_n_warnings);
336 static void parse_document (struct dictionary *, struct sfm_document_record *);
337 static void parse_display_parameters (struct sfm_reader *,
338 const struct sfm_extension_record *,
339 struct dictionary *);
340 static bool parse_machine_integer_info (struct sfm_reader *,
341 const struct sfm_extension_record *,
342 struct any_read_info *);
343 static void parse_machine_float_info (struct sfm_reader *,
344 const struct sfm_extension_record *);
345 static void parse_extra_product_info (struct sfm_reader *,
346 const struct sfm_extension_record *,
347 struct any_read_info *);
348 static void parse_mrsets (struct sfm_reader *,
349 const struct sfm_extension_record *,
350 size_t *allocated_mrsets);
351 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
352 static void parse_long_var_name_map (struct sfm_reader *,
353 const struct sfm_extension_record *,
354 struct dictionary *);
355 static bool parse_long_string_map (struct sfm_reader *,
356 const struct sfm_extension_record *,
357 struct dictionary *);
358 static void parse_value_labels (struct sfm_reader *, struct dictionary *);
359 static struct variable *parse_weight_var (struct sfm_reader *,
360 const struct sfm_var_record *, size_t n_var_recs,
362 static void parse_data_file_attributes (struct sfm_reader *,
363 const struct sfm_extension_record *,
364 struct dictionary *);
365 static void parse_variable_attributes (struct sfm_reader *,
366 const struct sfm_extension_record *,
367 struct dictionary *);
368 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
369 static void parse_long_string_value_labels (struct sfm_reader *,
370 const struct sfm_extension_record *,
371 struct dictionary *);
372 static void parse_long_string_missing_values (
373 struct sfm_reader *, const struct sfm_extension_record *,
374 struct dictionary *);
375 static void parse_var_sets (struct sfm_reader *,
376 const struct sfm_extension_record *,
377 struct dictionary *);
379 /* Frees the strings inside INFO: the creation date, creation time,
   product name, and extended product info. */
381 any_read_info_destroy (struct any_read_info *info)
385 free (info->creation_date);
386 free (info->creation_time);
387 free (info->product);
388 free (info->product_ext);
392 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
393 successful, otherwise NULL.

   Before returning successfully, this function reads the whole dictionary
   with read_dictionary() and parses any multiple response set records. */
394 static struct any_reader *
395 sfm_open (struct file_handle *fh)
397 size_t allocated_mrsets = 0;
399 /* Create and initialize reader. */
400 struct sfm_reader *r = XZALLOC (struct sfm_reader);
401 r->any_reader.klass = &sys_file_reader_class;
402 r->pool = pool_create ();
/* Register R itself with its own pool, with free() as the function to call
   on it. */
403 pool_register (r->pool, free, r);
/* Start with no pending opcodes (opcode_idx equal to the array size means
   "none left"). */
405 r->opcode_idx = sizeof r->opcodes;
406 ll_init (&r->var_attrs);
408 /* TRANSLATORS: this fragment will be interpolated into
409 messages in fh_lock() that identify types of files. */
410 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
414 r->file = fn_open (fh, "rb");
417 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
418 fh_get_file_name (r->fh), strerror (errno));
422 if (!read_dictionary (r))
/* Both multiple response set record subtypes go through the same parser and
   share one allocation counter. */
425 if (r->extensions[EXT_MRSETS] != NULL)
426 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
428 if (r->extensions[EXT_MRSETS2] != NULL)
429 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
431 return &r->any_reader;
/* NOTE(review): presumably the shared error exit -- the label itself is not
   visible in this excerpt. */
435 sfm_close (&r->any_reader);
/* Reads the file header with read_header(), then reads records by fetching
   each record's type with read_int() and passing it to read_record().
   Afterwards skips 4 bytes and, for ZLIB-compressed files, reads the ZLIB
   header.  NOTE(review): the loop structure and its termination condition are
   not visible in this excerpt. */
440 read_dictionary (struct sfm_reader *r)
442 size_t allocated_vars;
443 size_t allocated_labels;
445 if (!read_header (r, &r->info, &r->header))
449 allocated_labels = 0;
454 if (!read_int (r, &type))
458 if (!read_record (r, type, &allocated_vars, &allocated_labels))
462 if (!skip_bytes (r, 4))
465 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
/* Reads the body of one record whose type code TYPE the caller has already
   read.  *ALLOCATED_VARS and *ALLOCATED_LABELS track the allocated sizes of
   r->vars and r->labels, which grow as needed. */
472 read_record (struct sfm_reader *r, int type,
473 size_t *allocated_vars, size_t *allocated_labels)
/* Variable record: grow r->vars if it is full, then read into the next
   slot. */
480 if (r->n_vars >= *allocated_vars)
481 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
483 return read_variable_record (r, &r->vars[r->n_vars++]);
/* Value label record: same growth scheme for r->labels. */
486 if (r->n_labels >= *allocated_labels)
487 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
489 return read_value_label_record (r, &r->labels[r->n_labels++]);
492 /* A Type 4 record is always immediately after a type 3 record,
493 so the code for type 3 records reads the type 4 record too. */
494 sys_error (r, r->pos, _("Misplaced type 4 record."));
/* A second document record only draws a warning; it is still read. */
498 if (r->document != NULL)
499 sys_warn (r, r->pos, _("Duplicate type 6 (document) record."));
500 return read_document_record (r);
/* Extension (type 7) record: the subtype follows the type code. */
503 if (!read_int (r, &subtype))
506 || subtype >= sizeof r->extensions / sizeof *r->extensions)
509 _("Unrecognized record type 7, subtype %d. For help, "
510 "please send this file to %s and mention that you were "
512 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
513 return skip_extension_record (r, subtype);
515 else if (subtype == 18)
517 /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin"
518 put each variable attribute into a separate record with subtype
519 18. I'm surprised that SPSS puts up with this. */
520 struct sfm_extension_record *ext;
521 bool ok = read_extension_record (r, subtype, &ext);
/* Subtype 18 records accumulate on the var_attrs list instead of occupying
   a single extensions[] slot. */
523 ll_push_tail (&r->var_attrs, &ext->ll);
/* Any other subtype may be stored only once; a repeat is reported and
   skipped. */
526 else if (r->extensions[subtype] != NULL)
529 _("Record type 7, subtype %d found here has the same "
530 "type as the record found near offset 0x%llx. For "
531 "help, please send this file to %s and mention that "
532 "you were using %s."),
533 subtype, (long long int) r->extensions[subtype]->pos,
534 PACKAGE_BUGREPORT, PACKAGE_STRING);
535 return skip_extension_record (r, subtype);
538 return read_extension_record (r, subtype, &r->extensions[subtype]);
541 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
548 /* Returns the character encoding obtained from R, or a null pointer if R
549 doesn't have an indication of its character encoding.

   The sources consulted, in order, are the EXT_ENCODING record, the code page
   field of the EXT_INTEGER record, and the file's magic number. */
551 sfm_get_encoding (const struct sfm_reader *r)
553 /* The EXT_ENCODING record is the best way to determine dictionary
555 if (r->extensions[EXT_ENCODING])
556 return r->extensions[EXT_ENCODING]->data;
558 /* But EXT_INTEGER is better than nothing as a fallback. */
559 if (r->extensions[EXT_INTEGER])
/* The code page is the integer at byte offset 7 * 4 within the record's
   data. */
561 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
562 const char *encoding;
571 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
572 respectively. However, many files have character code 2 but data
573 which are clearly not ASCII. Therefore, ignore these values. */
580 encoding = sys_get_encoding_from_codepage (codepage);
581 if (encoding != NULL)
587 /* If the file magic number is EBCDIC then its character data is too. */
588 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
/* Accumulator used by sfm_get_strings(): three parallel, growable arrays
   (titles, strings, ids) allocated from a pool, filled by add_string__(). */
594 struct get_strings_aux
/* Appends one entry to AUX: TITLE is stored as given, STRING is copied into
   AUX's pool, and ID is recorded alongside them.  The three arrays are grown
   together when they are full. */
605 add_string__ (struct get_strings_aux *aux,
606 const char *string, bool id, char *title)
608 if (aux->n >= aux->allocated)
/* Grow to 2 * (allocated + 1) so that growth also works from zero. */
610 aux->allocated = 2 * (aux->allocated + 1);
611 aux->titles = pool_realloc (aux->pool, aux->titles,
612 aux->allocated * sizeof *aux->titles);
613 aux->strings = pool_realloc (aux->pool, aux->strings,
614 aux->allocated * sizeof *aux->strings);
615 aux->ids = pool_realloc (aux->pool, aux->ids,
616 aux->allocated * sizeof *aux->ids);
619 aux->titles[aux->n] = title;
620 aux->strings[aux->n] = pool_strdup (aux->pool, string);
621 aux->ids[aux->n] = id;
/* Adds free-form STRING to AUX (its "id" flag is false).  TITLE is a
   printf-style format; the formatted title is allocated from AUX's pool. */
625 static void PRINTF_FORMAT (3, 4)
626 add_string (struct get_strings_aux *aux,
627 const char *string, const char *title, ...)
631 va_start (args, title);
632 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
/* Like add_string(), but records ID with its "id" flag set to true. */
636 static void PRINTF_FORMAT (3, 4)
637 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
641 va_start (args, title);
642 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
/* If S begins with PREFIX, returns a pointer to the part of S that follows
   PREFIX; otherwise returns S unchanged. */
647 skip_prefix (const char *s, const char *prefix)
649 size_t prefix_len = strlen (prefix);
650 return !strncmp (s, prefix, prefix_len) ? s + prefix_len : s;
653 /* Retrieves significant string data from R in its raw format, to allow the
654 caller to try to detect the encoding in use.
656 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
657 and *STRINGSP to an array of N elements allocated from POOL. For each I in
658 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
659 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
660 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
/* Strings are collected in this order: variable names, variable labels,
   value labels, the header's creation date, creation time, product, and file
   label, extra product info, document lines, and multiple response set names,
   labels, and counted values. */
663 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
664 char ***titlesp, bool **idsp, char ***stringsp)
666 struct sfm_reader *r = sfm_reader_cast (r_);
667 const struct sfm_mrset *mrset;
668 struct get_strings_aux aux;
/* Variable names.  Records whose width is -1 are skipped, so the number in
   each title counts only the records that are kept. */
680 for (i = 0; i < r->n_vars; i++)
681 if (r->vars[i].width != -1)
682 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
/* Variable labels, with the same skipping rule. */
685 for (i = 0; i < r->n_vars; i++)
686 if (r->vars[i].width != -1)
689 if (r->vars[i].label)
690 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
/* Value labels from every value label record, numbered consecutively across
   records. */
695 for (i = 0; i < r->n_labels; i++)
696 for (j = 0; j < r->labels[i].n_labels; j++)
697 add_string (&aux, r->labels[i].labels[j].label,
698 _("Value Label %zu"), k++);
/* Header strings.  The "@(#) " prefix, if present, is left out of the
   product string. */
700 add_string (&aux, r->header.creation_date, _("Creation Date"));
701 add_string (&aux, r->header.creation_time, _("Creation Time"));
702 add_string (&aux, skip_prefix (r->header.eye_catcher, "@(#) "), _("Product"));
703 add_string (&aux, r->header.file_label, _("File Label"));
705 if (r->extensions[EXT_PRODUCT_INFO])
706 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
707 _("Extra Product Info"));
/* Document lines: each occupies 80 bytes of r->document->documents and is
   copied into a local buffer before being added. */
713 for (i = 0; i < r->document->n_lines; i++)
717 memcpy (line, r->document->documents + i * 80, 80);
720 add_string (&aux, line, _("Document Line %zu"), i + 1);
/* Multiple response sets, numbered from 1 in the titles. */
724 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
726 size_t mrset_idx = mrset - r->mrsets + 1;
728 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
730 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
732 /* Skip the variables because they ought to be duplicates. */
735 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
/* The following kinds of strings are listed here but no code in this
   function collects them. */
739 /* data file attributes */
740 /* variable attributes */
742 /* long string value labels */
743 /* long string missing values */
745 *titlesp = aux.titles;
747 *stringsp = aux.strings;
751 /* Decodes the dictionary read from R, saving it into *DICT. Character
752 strings in R are decoded using ENCODING, or an encoding obtained from R if
753 ENCODING is null, or the locale encoding if R specifies no encoding.
755 If INFOP is non-null, then it receives additional info about the system
756 file, which the caller must eventually free with any_read_info_destroy()
757 when it is no longer needed.
759 This function consumes R. The caller must not use it again later, even to
760 destroy it with sfm_close(). */
/* The records gathered by sfm_open() are decoded below in a fixed order,
   because some records refer to variables by record index or by short name
   and must be handled before later steps change those. */
761 static struct casereader *
762 sfm_decode (struct any_reader *r_, const char *encoding,
763 struct dictionary **dictp, struct any_read_info *infop)
765 struct sfm_reader *r = sfm_reader_cast (r_);
766 struct dictionary *dict;
/* Choose the encoding: the caller's if given, else the one the file
   indicates, else the locale's (with a warning). */
768 if (encoding == NULL)
770 encoding = sfm_get_encoding (r);
771 if (encoding == NULL)
773 sys_warn (r, -1, _("This system file does not indicate its own "
774 "character encoding. Using default encoding "
775 "%s. For best results, specify an encoding "
776 "explicitly. Use SYSFILE INFO with "
777 "ENCODING=\"DETECT\" to analyze the possible "
780 encoding = locale_charset ();
/* r->encoding points at the dictionary's encoding string rather than at
   ENCODING. */
784 dict = dict_create (encoding);
785 r->encoding = dict_get_encoding (dict);
787 /* These records don't use variables at all. */
788 if (r->document != NULL)
789 parse_document (dict, r->document);
791 if (r->extensions[EXT_INTEGER] != NULL
792 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
795 if (r->extensions[EXT_FLOAT] != NULL)
796 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
798 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
799 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
801 if (r->extensions[EXT_FILE_ATTRS] != NULL)
802 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
804 parse_header (r, &r->header, &r->info, dict);
806 /* Parse the variable records, the basis of almost everything else. */
807 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
810 /* Parse value labels and the weight variable immediately after the variable
811 records. These records use indexes into var_recs[], so we must parse them
812 before those indexes become invalidated by very long string variables. */
813 parse_value_labels (r, dict);
814 if (r->header.weight_idx != 0)
815 dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars,
816 r->header.weight_idx));
818 if (r->extensions[EXT_DISPLAY] != NULL)
819 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
821 /* The following records use short names, so they need to be parsed before
822 parse_long_var_name_map() changes short names to long names. */
823 decode_mrsets (r, dict);
825 if (r->extensions[EXT_LONG_STRINGS] != NULL
826 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
829 /* Now rename variables to their long names. */
/* Called unconditionally, so the record argument may be null here. */
830 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
832 /* The following records use long names, so they need to follow renaming. */
833 if (!ll_is_empty (&r->var_attrs))
835 struct sfm_extension_record *ext;
836 ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs)
837 parse_variable_attributes (r, ext, dict);
839 /* Roles use the $@Role attribute. */
840 assign_variable_roles (r, dict);
842 if (r->extensions[EXT_LONG_LABELS] != NULL)
843 parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
844 if (r->extensions[EXT_LONG_MISSING] != NULL)
845 parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
847 if (r->extensions[EXT_VAR_SETS])
848 parse_var_sets (r, r->extensions[EXT_VAR_SETS], dict);
850 /* Warn if the actual amount of data per case differs from the
851 amount that the header claims. SPSS version 13 gets this
852 wrong when very long strings are involved, so don't warn in
854 if (r->header.nominal_case_size > 0
855 && r->header.nominal_case_size != r->n_vars
856 && r->info.version_major != 13)
857 sys_warn (r, -1, _("File header claims %d variable positions but "
858 "%zu were read from file."),
859 r->header.nominal_case_size, r->n_vars);
861 /* Create an index of dictionary variable widths for
862 sfm_read_case to use. We cannot use the `struct variable's
863 from the dictionary we created, because the caller owns the
864 dictionary and may destroy or modify its variables. */
865 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_n_vars);
/* The sfm_vars array is registered with the pool, with free() as the
   function to call on it. */
866 pool_register (r->pool, free, r->sfm_vars);
867 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
/* NOTE(review): presumably r->info is cleared after its contents have been
   handed to the caller through INFOP -- the surrounding lines are not visible
   in this excerpt. */
873 memset (&r->info, 0, sizeof r->info);
/* An n_cases of -1 is reported to the casereader as CASENUMBER_MAX. */
876 return casereader_create_sequential
877 (NULL, r->proto, r->n_cases == -1 ? CASENUMBER_MAX : r->n_cases,
878 &sys_file_casereader_class, r);
887 /* Closes R, which should have been returned by sfm_open() but not already
888 closed with sfm_decode() or this function.
889 Returns true if an I/O error has occurred on READER, false
/* Closes R's file (reporting a failure from fn_close() as an error message),
   frees the strings in R's info, and destroys R's pool.  sfm_open() registers
   R itself with that pool, so R must not be used after this call. */
892 sfm_close (struct any_reader *r_)
894 struct sfm_reader *r = sfm_reader_cast (r_);
899 if (fn_close (r->fh, r->file) == EOF)
901 msg (ME, _("Error closing system file `%s': %s."),
902 fh_get_file_name (r->fh), strerror (errno));
908 any_read_info_destroy (&r->info);
913 pool_destroy (r->pool);
918 /* Destroys READER by closing the system file reader R_ that backs it.
   READER itself is unused. */
920 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
922 struct sfm_reader *r = r_;
923 sfm_close (&r->any_reader);
926 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
927 a negative errno value if there is an error reading FILE.

   Detection seeks to the start of FILE, reads 4 bytes, and compares them with
   the three known magic numbers. */
929 sfm_detect (FILE *file)
933 if (fseek (file, 0, SEEK_SET) != 0)
/* A short read without a stream error (for example, a file shorter than 4
   bytes) means "not a system file" rather than an error. */
935 if (fread (magic, 4, 1, file) != 1)
936 return ferror (file) ? -errno : 0;
939 return (!strcmp (ASCII_MAGIC, magic)
940 || !strcmp (ASCII_ZMAGIC, magic)
941 || !strcmp (EBCDIC_MAGIC, magic));
944 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
945 except for the string fields in *INFO, which parse_header() will initialize
946 later once the file's encoding is known.

   Along the way this also sets R's integer format, float format, compression
   type, compression bias, and number of cases. */
948 read_header (struct sfm_reader *r, struct any_read_info *info,
949 struct sfm_header_record *header)
951 uint8_t raw_layout_code[4];
/* The magic number and the eye-catcher are the first two fields. */
956 if (!read_string (r, header->magic, sizeof header->magic)
957 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
/* Remember whether the eye-catcher mentions ReadStat. */
959 r->written_by_readstat = strstr (header->eye_catcher,
960 "https://github.com/WizardMac/ReadStat");
/* Classify the magic number; an unrecognized one is an error. */
962 if (!strcmp (ASCII_MAGIC, header->magic)
963 || !strcmp (EBCDIC_MAGIC, header->magic))
965 else if (!strcmp (ASCII_ZMAGIC, header->magic))
969 sys_error (r, 0, _("This is not an SPSS system file."));
973 /* Identify integer format. */
974 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
/* The layout code must decode as 2 or 3, and the byte order found must be
   MSB-first or LSB-first. */
976 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
978 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
980 || (r->integer_format != INTEGER_MSB_FIRST
981 && r->integer_format != INTEGER_LSB_FIRST))
983 sys_error (r, 64, _("This is not an SPSS system file."));
987 if (!read_int (r, &header->nominal_case_size))
/* Replace a negative or implausibly large case size with -1. */
990 if (header->nominal_case_size < 0
991 || header->nominal_case_size > INT_MAX / 16)
992 header->nominal_case_size = -1;
/* Map the compression field onto r->compression.  Per the error messages,
   the first group of checks applies to ordinary system files and the second
   to ZLIB-compressed ones. */
994 if (!read_int (r, &compressed))
999 r->compression = ANY_COMP_NONE;
1000 else if (compressed == 1)
1001 r->compression = ANY_COMP_SIMPLE;
1004 sys_error (r, 0, "System file header has invalid compression "
1005 "value %d.", compressed);
1011 if (compressed == 2)
1012 r->compression = ANY_COMP_ZLIB;
1015 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1016 "compression value %d.", compressed);
1021 if (!read_int (r, &header->weight_idx))
1024 if (!read_int (r, &r->n_cases))
/* NOTE(review): a case count above INT_MAX / 2 is special-cased here; the
   statement that handles it is not visible in this excerpt. */
1026 if (r->n_cases > INT_MAX / 2)
1029 /* Identify floating-point format and obtain compression bias. */
1030 if (!read_bytes (r, raw_bias, sizeof raw_bias))
/* If the bias cannot be recognized as 100.0 in any known format, fall back
   on an IEEE double whose byte order matches the integer format. */
1032 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1034 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1036 if (memcmp (raw_bias, zero_bias, 8))
1037 sys_warn (r, r->pos - 8,
1038 _("Compression bias is not the usual "
1039 "value of 100, or system file uses unrecognized "
1040 "floating-point format."));
1043 /* Some software is known to write all-zeros to this
1044 field. Such software also writes floating-point
1045 numbers in the format that we expect by default
1046 (it seems that all software most likely does, in
1047 reality), so don't warn in this case. */
1050 if (r->integer_format == INTEGER_MSB_FIRST)
1051 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1053 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1055 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
/* Remaining string fields, followed by 3 bytes that are skipped. */
1057 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1058 || !read_string (r, header->creation_time, sizeof header->creation_time)
1059 || !read_string (r, header->file_label, sizeof header->file_label)
1060 || !skip_bytes (r, 3))
/* Copy the non-string results into *INFO. */
1063 info->integer_format = r->integer_format;
1064 info->float_format = r->float_format;
1065 info->compression = r->compression;
1066 info->n_cases = r->n_cases;
1071 /* Reads a variable (type 2) record from R into RECORD.

   RECORD is zeroed first.  The fixed fields are read, then the optional
   variable label, then any missing values. */
1073 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1075 int has_variable_label;
1077 memset (record, 0, sizeof *record);
1079 record->pos = r->pos;
1080 if (!read_int (r, &record->width)
1081 || !read_int (r, &has_variable_label)
1082 || !read_int (r, &record->missing_value_code)
1083 || !read_int (r, &record->print_format)
1084 || !read_int (r, &record->write_format)
1085 || !read_string (r, record->name, sizeof record->name))
/* The label indicator must be 0 (no label) or 1 (label follows). */
1088 if (has_variable_label == 1)
1090 enum { MAX_LABEL_LEN = 65536 };
1091 unsigned int len, read_len;
1093 if (!read_uint (r, &len))
1096 /* Read up to MAX_LABEL_LEN bytes of label. */
1097 read_len = MIN (MAX_LABEL_LEN, len);
/* One extra byte is allocated beyond the label bytes. */
1098 record->label = pool_malloc (r->pool, read_len + 1);
1099 if (!read_string (r, record->label, read_len + 1))
1102 /* Skip unread label bytes. */
1103 if (!skip_bytes (r, len - read_len))
1106 /* Skip label padding up to multiple of 4 bytes. */
1107 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1110 else if (has_variable_label != 0)
1112 sys_error (r, record->pos,
1113 _("Variable label indicator field is not 0 or 1."));
1117 /* Set missing values. */
1118 if (record->missing_value_code != 0)
1120 int code = record->missing_value_code;
/* Width 0 is treated as numeric: codes -3, -2, 1, 2, and 3 are accepted.
   Other widths accept only codes 1 through 3. */
1121 if (record->width == 0)
1123 if (code < -3 || code > 3 || code == -1)
1125 sys_error (r, record->pos,
1126 _("Numeric missing value indicator field is not "
1127 "-3, -2, 0, 1, 2, or 3."));
1133 if (code < 1 || code > 3)
1135 sys_error (r, record->pos,
1136 _("String missing value indicator field is not "
/* Each missing value occupies 8 bytes; the sign of the code does not affect
   how many are read. */
1142 if (!read_bytes (r, record->missing, 8 * abs (code)))
1149 /* Reads value labels from R into RECORD.

   This reads a type 3 record (the values and labels) and then the type 4
   record that must follow it (the indexes of the variables that the labels
   apply to). */
1151 read_value_label_record (struct sfm_reader *r,
1152 struct sfm_value_label_record *record)
1157 /* Read type 3 record. */
1158 record->pos = r->pos;
1159 if (!read_uint (r, &record->n_labels))
/* Reject counts whose array size in bytes would exceed UINT_MAX. */
1161 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1163 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1167 record->labels = pool_nmalloc (r->pool, record->n_labels,
1168 sizeof *record->labels);
1169 for (i = 0; i < record->n_labels; i++)
1171 struct sfm_value_label *label = &record->labels[i];
1172 unsigned char label_len;
1175 if (!read_bytes (r, label->value, sizeof label->value))
1178 /* Read label length. */
1179 if (!read_bytes (r, &label_len, sizeof label_len))
/* The length byte plus the label text are padded together to a multiple of
   8 bytes; the length byte has already been consumed, hence the
   padded_len - 1 bytes read below. */
1181 padded_len = ROUND_UP (label_len + 1, 8);
1183 /* Read label, padding. */
1184 label->label = pool_malloc (r->pool, padded_len + 1);
1185 if (!read_bytes (r, label->label, padded_len - 1))
1187 label->label[label_len] = '\0';
1190 /* Read record type of type 4 record. */
1191 if (!read_int (r, &type))
1195 sys_error (r, r->pos - 4,
1196 _("Variable index record (type 4) does not immediately "
1197 "follow value label record (type 3) as it should."));
1201 /* Read number of variables associated with value label from type 4
1203 if (!read_uint (r, &record->n_vars))
1205 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1207 sys_error (r, r->pos - 4,
1208 _("Number of variables associated with a value label (%u) "
1209 "is not between 1 and the number of variables (%zu)."),
1210 record->n_vars, r->n_vars);
1214 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1215 for (i = 0; i < record->n_vars; i++)
1216 if (!read_int (r, &record->vars[i]))
1222 /* Reads a document record from R. Returns true if successful, false on
1225 read_document_record (struct sfm_reader *r)
1228 if (!read_int (r, &n_lines))
1230 else if (n_lines == 0)
1232 else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1234 sys_error (r, r->pos,
1235 _("Number of document lines (%d) "
1236 "must be greater than 0 and less than %d."),
1237 n_lines, INT_MAX / DOC_LINE_LENGTH);
1241 struct sfm_document_record *record;
1242 record = pool_malloc (r->pool, sizeof *record);
1243 record->pos = r->pos;
1244 record->n_lines = n_lines;
1245 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1246 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1249 r->document = record;
1254 read_extension_record_header (struct sfm_reader *r, int subtype,
1255 struct sfm_extension_record *record)
1257 record->subtype = subtype;
1258 record->pos = r->pos;
1259 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1262 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1263 allows an extra byte for a null terminator, used by some
1264 extension processing routines. */
1265 if (record->size != 0
1266 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1268 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1276 /* Reads an extension record from R into RECORD. */
1278 read_extension_record (struct sfm_reader *r, int subtype,
1279 struct sfm_extension_record **recordp)
1281 struct extension_record_type
1288 static const struct extension_record_type types[] =
1290 /* Implemented record types. */
1291 { EXT_INTEGER, 4, 8 },
1292 { EXT_FLOAT, 8, 3 },
1293 { EXT_VAR_SETS, 1, 0 },
1294 { EXT_MRSETS, 1, 0 },
1295 { EXT_PRODUCT_INFO, 1, 0 },
1296 { EXT_DISPLAY, 4, 0 },
1297 { EXT_LONG_NAMES, 1, 0 },
1298 { EXT_LONG_STRINGS, 1, 0 },
1299 { EXT_NCASES, 8, 2 },
1300 { EXT_FILE_ATTRS, 1, 0 },
1301 { EXT_VAR_ATTRS, 1, 0 },
1302 { EXT_MRSETS2, 1, 0 },
1303 { EXT_ENCODING, 1, 0 },
1304 { EXT_LONG_LABELS, 1, 0 },
1305 { EXT_LONG_MISSING, 1, 0 },
1307 /* Ignored record types. */
1309 { EXT_DATA_ENTRY, 0, 0 },
1310 { EXT_DATAVIEW, 0, 0 },
1313 const struct extension_record_type *type;
1314 struct sfm_extension_record *record;
1318 record = pool_malloc (r->pool, sizeof *record);
1319 if (!read_extension_record_header (r, subtype, record))
1321 n_bytes = record->count * record->size;
1323 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1324 if (subtype == type->subtype)
1326 if (type->size > 0 && record->size != type->size)
1327 sys_warn (r, record->pos,
1328 _("Record type 7, subtype %d has bad size %u "
1329 "(expected %d)."), subtype, record->size, type->size);
1330 else if (type->count > 0 && record->count != type->count)
1331 sys_warn (r, record->pos,
1332 _("Record type 7, subtype %d has bad count %u "
1333 "(expected %d)."), subtype, record->count, type->count);
1334 else if (type->count == 0 && type->size == 0)
1336 /* Ignore this record. */
1340 char *data = pool_malloc (r->pool, n_bytes + 1);
1341 data[n_bytes] = '\0';
1343 record->data = data;
1344 if (!read_bytes (r, record->data, n_bytes))
1353 sys_warn (r, record->pos,
1354 _("Unrecognized record type 7, subtype %d. For help, please "
1355 "send this file to %s and mention that you were using %s."),
1356 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
1359 return skip_bytes (r, n_bytes);
1363 skip_extension_record (struct sfm_reader *r, int subtype)
1365 struct sfm_extension_record record;
1367 return (read_extension_record_header (r, subtype, &record)
1368 && skip_bytes (r, record.count * record.size));
1372 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1373 struct any_read_info *info, struct dictionary *dict)
1375 const char *dict_encoding = dict_get_encoding (dict);
1376 struct substring product;
1377 struct substring label;
1380 /* Convert file label to UTF-8 and put it into DICT. */
1381 label = recode_substring_pool ("UTF-8", dict_encoding,
1382 ss_cstr (header->file_label), r->pool);
1383 ss_trim (&label, ss_cstr (" "));
1384 label.string[label.length] = '\0';
1385 fixed_label = fix_line_ends (label.string);
1386 dict_set_label (dict, fixed_label);
1389 /* Put creation date and time in UTF-8 into INFO. */
1390 info->creation_date = recode_string ("UTF-8", dict_encoding,
1391 header->creation_date, -1);
1392 info->creation_time = recode_string ("UTF-8", dict_encoding,
1393 header->creation_time, -1);
1395 /* Put product name into INFO, dropping eye-catcher string if present. */
1396 product = recode_substring_pool ("UTF-8", dict_encoding,
1397 ss_cstr (header->eye_catcher), r->pool);
1398 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1399 ss_trim (&product, ss_cstr (" "));
1400 info->product = ss_xstrdup (product);
/* Adds a new variable with the given WIDTH to DICT under a name obtained
   from dict_make_unique_var_name(), and returns the new variable. */
static struct variable *
add_var_with_generated_name (struct dictionary *dict, int width)
{
  char *generated_name = dict_make_unique_var_name (dict, NULL, NULL);
  struct variable *new_var
    = dict_create_var_assert (dict, generated_name, width);

  free (generated_name);
  return new_var;
}
1412 /* Reads a variable (type 2) record from R and adds the
1413 corresponding variable to DICT.
1414 Also skips past additional variable records for long string
1417 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1418 struct sfm_var_record *var_recs, size_t n_var_recs)
1420 const char *dict_encoding = dict_get_encoding (dict);
1421 struct sfm_var_record *rec;
1424 for (rec = var_recs; rec < &var_recs[n_var_recs];)
1430 name = recode_string_pool ("UTF-8", dict_encoding,
1431 rec->name, -1, r->pool);
1432 name[strcspn (name, " ")] = '\0';
1434 if (rec->width < 0 || rec->width > 255)
1436 sys_error (r, rec->pos,
1437 _("Bad width %d for variable %s."), rec->width, name);
1441 struct variable *var;
1442 if (!dict_id_is_valid (dict, name) || name[0] == '$' || name[0] == '#')
1444 var = add_var_with_generated_name (dict, rec->width);
1445 sys_warn (r, rec->pos, _("Renaming variable with invalid name "
1446 "`%s' to `%s'."), name, var_get_name (var));
1450 var = dict_create_var (dict, name, rec->width);
1453 var = add_var_with_generated_name (dict, rec->width);
1454 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1456 name, var_get_name (var));
1461 /* Set the short name the same as the long name (even if we renamed
1463 var_set_short_name (var, 0, var_get_name (var));
1465 /* Get variable label, if any. */
1470 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1471 rec->label, -1, r->pool);
1472 var_set_label (var, utf8_label);
1475 /* Set missing values. */
1476 if (rec->missing_value_code != 0)
1478 int width = var_get_width (var);
1479 struct missing_values mv;
1481 mv_init_pool (r->pool, &mv, width);
1482 if (var_is_numeric (var))
1484 bool has_range = rec->missing_value_code < 0;
1485 int n_discrete = (has_range
1486 ? rec->missing_value_code == -3
1487 : rec->missing_value_code);
1492 double low = parse_float (r, rec->missing, 0);
1493 double high = parse_float (r, rec->missing, 8);
1495 /* Deal with SPSS 21 change in representation. */
1499 mv_add_range (&mv, low, high);
1503 for (i = 0; i < n_discrete; i++)
1505 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1510 for (i = 0; i < rec->missing_value_code; i++)
1511 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1512 var_set_missing_values (var, &mv);
1516 parse_format_spec (r, rec->pos + 12, rec->print_format,
1517 PRINT_FORMAT, var, &n_warnings);
1518 parse_format_spec (r, rec->pos + 16, rec->write_format,
1519 WRITE_FORMAT, var, &n_warnings);
1521 /* Account for values.
1522 Skip long string continuation records, if any. */
1523 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1524 for (i = 1; i < n_values; i++)
1525 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1527 sys_error (r, rec->pos, _("Missing string continuation record."));
1536 /* Translates the format spec from sysfile format to internal
1539 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1540 enum which_format which, struct variable *v,
1543 const int max_warnings = 8;
1546 if (fmt_from_u32 (format, var_get_width (v), false, &f))
1548 if (which == PRINT_FORMAT)
1549 var_set_print_format (v, f);
1551 var_set_write_format (v, f);
1553 else if (format == 0)
1555 /* Actually observed in the wild. No point in warning about it. */
1557 else if (++*n_warnings <= max_warnings)
1559 if (which == PRINT_FORMAT)
1560 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1562 var_get_name (v), var_get_width (v), format);
1564 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1566 var_get_name (v), var_get_width (v), format);
1568 if (*n_warnings == max_warnings)
1569 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1574 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1578 for (p = record->documents;
1579 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1580 p += DOC_LINE_LENGTH)
1582 struct substring line;
1584 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1585 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1586 ss_rtrim (&line, ss_cstr (" "));
1587 line.string[line.length] = '\0';
1589 dict_add_document_line (dict, line.string, false);
1595 /* Parses record type 7, subtype 3. */
1597 parse_machine_integer_info (struct sfm_reader *r,
1598 const struct sfm_extension_record *record,
1599 struct any_read_info *info)
1601 int float_representation, expected_float_format;
1602 int integer_representation, expected_integer_format;
1604 /* Save version info. */
1605 info->version_major = parse_int (r, record->data, 0);
1606 info->version_minor = parse_int (r, record->data, 4);
1607 info->version_revision = parse_int (r, record->data, 8);
1609 /* Check floating point format. */
1610 float_representation = parse_int (r, record->data, 16);
1611 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1612 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1613 expected_float_format = 1;
1614 else if (r->float_format == FLOAT_Z_LONG)
1615 expected_float_format = 2;
1616 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1617 expected_float_format = 3;
1620 if (float_representation != expected_float_format)
1622 sys_error (r, record->pos,
1623 _("Floating-point representation indicated by "
1624 "system file (%d) differs from expected (%d)."),
1625 float_representation, expected_float_format);
1629 /* Check integer format. */
1630 integer_representation = parse_int (r, record->data, 24);
1631 if (r->integer_format == INTEGER_MSB_FIRST)
1632 expected_integer_format = 1;
1633 else if (r->integer_format == INTEGER_LSB_FIRST)
1634 expected_integer_format = 2;
1637 if (integer_representation != expected_integer_format)
1638 sys_warn (r, record->pos,
1639 _("Integer format indicated by system file (%d) "
1640 "differs from expected (%d)."),
1641 integer_representation, expected_integer_format);
1646 /* Parses record type 7, subtype 4. */
1648 parse_machine_float_info (struct sfm_reader *r,
1649 const struct sfm_extension_record *record)
1651 double sysmis = parse_float (r, record->data, 0);
1652 double highest = parse_float (r, record->data, 8);
1653 double lowest = parse_float (r, record->data, 16);
1655 if (sysmis != SYSMIS)
1656 sys_warn (r, record->pos,
1657 _("File specifies unexpected value %g (%a) as %s, "
1658 "instead of %g (%a)."),
1659 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1661 if (highest != HIGHEST)
1662 sys_warn (r, record->pos,
1663 _("File specifies unexpected value %g (%a) as %s, "
1664 "instead of %g (%a)."),
1665 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1667 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1668 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1669 appears in a context (missing values) where SYSMIS cannot. */
1670 if (lowest != LOWEST && lowest != SYSMIS)
1671 sys_warn (r, record->pos,
1672 _("File specifies unexpected value %g (%a) as %s, "
1673 "instead of %g (%a) or %g (%a)."),
1674 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1677 /* Parses record type 7, subtype 10. */
1679 parse_extra_product_info (struct sfm_reader *r,
1680 const struct sfm_extension_record *record,
1681 struct any_read_info *info)
1683 struct text_record *text;
1685 text = open_text_record (r, record, true);
1686 info->product_ext = fix_line_ends (text_get_all (text));
1687 close_text_record (r, text);
1690 /* Parses record type 7, subtype 7 or 19. */
1692 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1693 size_t *allocated_mrsets)
1695 struct text_record *text;
1697 text = open_text_record (r, record, false);
1700 struct sfm_mrset *mrset = NULL;
1701 size_t allocated_vars = 0;
1702 char delimiter = '4';
1704 /* Skip extra line feeds if present. */
1705 while (text_match (text, '\n'))
1708 if (r->n_mrsets >= *allocated_mrsets)
1709 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1711 mrset = &r->mrsets[r->n_mrsets];
1712 memset(mrset, 0, sizeof *mrset);
1714 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1715 if (mrset->name == NULL)
1718 if (text_match (text, 'C'))
1720 mrset->type = MRSET_MC;
1721 if (!text_match (text, ' '))
1723 sys_warn (r, record->pos,
1724 _("Missing space following `%c' at offset %zu "
1725 "in MRSETS record."), 'C', text_pos (text));
1729 else if (text_match (text, 'D'))
1731 mrset->type = MRSET_MD;
1732 mrset->cat_source = MRSET_VARLABELS;
1734 else if (text_match (text, 'E'))
1738 mrset->type = MRSET_MD;
1739 mrset->cat_source = MRSET_COUNTEDVALUES;
1740 if (!text_match (text, ' '))
1742 sys_warn (r, record->pos,
1743 _("Missing space following `%c' at offset %zu "
1744 "in MRSETS record."), 'E', text_pos (text));
1748 number = text_get_token (text, ss_cstr (" "), NULL);
1750 sys_warn (r, record->pos,
1751 _("Missing label source value "
1752 "following `E' at offset %zu in MRSETS record."),
1754 else if (!strcmp (number, "11"))
1755 mrset->label_from_var_label = true;
1756 else if (strcmp (number, "1"))
1757 sys_warn (r, record->pos,
1758 _("Unexpected label source value following `E' "
1759 "at offset %zu in MRSETS record."),
1764 sys_warn (r, record->pos,
1765 _("Missing `C', `D', or `E' at offset %zu "
1766 "in MRSETS record."),
1771 if (mrset->type == MRSET_MD)
1773 mrset->counted = text_parse_counted_string (r, text);
1774 if (mrset->counted == NULL)
1778 mrset->label = text_parse_counted_string (r, text);
1779 if (mrset->label == NULL)
1787 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1790 if (delimiter != '\n')
1791 sys_warn (r, record->pos,
1792 _("Missing new-line parsing variable names "
1793 "at offset %zu in MRSETS record."),
1798 if (mrset->n_vars >= allocated_vars)
1799 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1801 sizeof *mrset->vars);
1802 mrset->vars[mrset->n_vars++] = var;
1804 while (delimiter != '\n');
1808 close_text_record (r, text);
1812 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1814 const struct sfm_mrset *s;
1816 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1818 struct stringi_set var_names;
1819 struct mrset *mrset;
1824 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1825 if (!mrset_is_valid_name (name, dict_get_encoding (dict)))
1827 sys_warn (r, -1, _("Invalid multiple response set name `%s'."),
1833 mrset = xzalloc (sizeof *mrset);
1835 mrset->type = s->type;
1836 mrset->cat_source = s->cat_source;
1837 mrset->label_from_var_label = s->label_from_var_label;
1838 if (s->label[0] != '\0')
1839 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
1841 stringi_set_init (&var_names);
1842 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1844 for (i = 0; i < s->n_vars; i++)
1846 struct variable *var;
1849 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1851 var = dict_lookup_var (dict, var_name);
1857 if (!stringi_set_insert (&var_names, var_name))
1860 _("MRSET %s contains duplicate variable name %s."),
1861 mrset->name, var_name);
1867 if (mrset->label == NULL && mrset->label_from_var_label
1868 && var_has_label (var))
1869 mrset->label = xstrdup (var_get_label (var));
1872 && var_get_type (var) != var_get_type (mrset->vars[0]))
1875 _("MRSET %s contains both string and "
1876 "numeric variables."), mrset->name);
1879 width = MIN (width, var_get_width (var));
1881 mrset->vars[mrset->n_vars++] = var;
1884 if (mrset->n_vars < 2)
1886 if (mrset->n_vars == 0)
1887 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1889 sys_warn (r, -1, _("MRSET %s has only one variable."),
1891 mrset_destroy (mrset);
1892 stringi_set_destroy (&var_names);
1896 if (mrset->type == MRSET_MD)
1898 mrset->width = width;
1899 value_init (&mrset->counted, width);
1901 mrset->counted.f = c_strtod (s->counted, NULL);
1903 value_copy_str_rpad (&mrset->counted, width,
1904 (const uint8_t *) s->counted, ' ');
1907 dict_add_mrset (dict, mrset);
1908 stringi_set_destroy (&var_names);
1912 /* Read record type 7, subtype 11, which specifies how variables
1913 should be displayed in GUI environments. */
1915 parse_display_parameters (struct sfm_reader *r,
1916 const struct sfm_extension_record *record,
1917 struct dictionary *dict)
1919 bool includes_width;
1920 bool warned = false;
1925 n_vars = dict_get_n_vars (dict);
1926 if (record->count == 3 * n_vars)
1927 includes_width = true;
1928 else if (record->count == 2 * n_vars)
1929 includes_width = false;
1932 sys_warn (r, record->pos,
1933 _("Extension 11 has bad count %u (for %zu variables)."),
1934 record->count, n_vars);
1939 for (i = 0; i < n_vars; ++i)
1941 struct variable *v = dict_get_var (dict, i);
1942 int measure, width, align;
1944 measure = parse_int (r, record->data, ofs);
1949 width = parse_int (r, record->data, ofs);
1955 align = parse_int (r, record->data, ofs);
1958 /* SPSS sometimes seems to set variables' measure to zero. */
1962 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1965 sys_warn (r, record->pos,
1966 _("Invalid variable display parameters for variable "
1967 "%zu (%s). Default parameters substituted."),
1968 i, var_get_name (v));
1973 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1974 : measure == 2 ? MEASURE_ORDINAL
1976 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1977 : align == 1 ? ALIGN_RIGHT
1980 /* Older versions (SPSS 9.0) sometimes set the display
1981 width to zero. This causes confusion in the GUI, so
1982 only set the width if it is nonzero. */
1984 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping VAR's short names the
   same as they were.  If the rename fails, warns at file offset POS
   that NEW_NAME is a duplicate. */
static void
rename_var_and_save_short_names (struct sfm_reader *r, off_t pos,
                                 struct dictionary *dict,
                                 struct variable *var, const char *new_name)
{
  /* Renaming a variable may clear its short names, but we want to
     retain them, so make a copy of them first. */
  size_t n_saved = var_get_n_short_names (var);
  char **saved = xnmalloc (n_saved, sizeof *saved);
  for (size_t i = 0; i < n_saved; i++)
    saved[i] = xstrdup_if_nonnull (var_get_short_name (var, i));

  /* Set long name. */
  if (!dict_try_rename_var (dict, var, new_name))
    sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name);

  /* Put the short names back and release the copies. */
  for (size_t i = 0; i < n_saved; i++)
    {
      var_set_short_name (var, i, saved[i]);
      free (saved[i]);
    }
  free (saved);
}
2021 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2022 to each short name. Modifies variable names in DICT accordingly. */
2024 parse_long_var_name_map (struct sfm_reader *r,
2025 const struct sfm_extension_record *record,
2026 struct dictionary *dict)
2028 struct text_record *text;
2029 struct variable *var;
2034 /* There are no long variable names. Use the short variable names,
2035 converted to lowercase, as the long variable names. */
2038 for (i = 0; i < dict_get_n_vars (dict); i++)
2040 struct variable *var = dict_get_var (dict, i);
2043 new_name = utf8_to_lower (var_get_name (var));
2044 rename_var_and_save_short_names (r, -1, dict, var, new_name);
2051 /* Rename each of the variables, one by one. (In a correctly constructed
2052 system file, this cannot create any intermediate duplicate variable names,
2053 because all of the new variable names are longer than any of the old
2054 variable names and thus there cannot be any overlaps.) */
2055 text = open_text_record (r, record, true);
2056 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2058 /* Validate long name. */
2059 if (!dict_id_is_valid (dict, long_name)
2060 || long_name[0] == '$' || long_name[0] == '#')
2062 sys_warn (r, record->pos,
2063 _("Long variable mapping from %s to invalid "
2064 "variable name `%s'."),
2065 var_get_name (var), long_name);
2069 rename_var_and_save_short_names (r, record->pos, dict, var, long_name);
2071 close_text_record (r, text);
2074 /* Reads record type 7, subtype 14, which gives the real length
2075 of each very long string. Rearranges DICT accordingly. */
2077 parse_long_string_map (struct sfm_reader *r,
2078 const struct sfm_extension_record *record,
2079 struct dictionary *dict)
2081 struct text_record *text;
2082 struct variable *var;
2085 text = open_text_record (r, record, true);
2086 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2088 size_t idx = var_get_dict_index (var);
2093 length = strtol (length_s, NULL, 10);
2094 if (length < 1 || length > MAX_STRING)
2096 sys_warn (r, record->pos,
2097 _("%s listed as string of invalid length %s "
2098 "in very long string record."),
2099 var_get_name (var), length_s);
2103 /* Check segments. */
2104 int n_segments = sfm_width_to_segments (length);
2105 if (n_segments == 1)
2107 sys_warn (r, record->pos,
2108 _("%s listed in very long string record with width %s, "
2109 "which requires only one segment."),
2110 var_get_name (var), length_s);
2113 if (idx + n_segments > dict_get_n_vars (dict))
2115 sys_error (r, record->pos,
2116 _("Very long string %s overflows dictionary."),
2117 var_get_name (var));
2121 /* Get the short names from the segments and check their
2123 for (i = 0; i < n_segments; i++)
2125 struct variable *seg = dict_get_var (dict, idx + i);
2126 int alloc_width = sfm_segment_alloc_width (length, i);
2127 int width = var_get_width (seg);
2130 var_set_short_name (var, i, var_get_short_name (seg, 0));
2131 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2133 sys_error (r, record->pos,
2134 _("Very long string with width %ld has segment %d "
2135 "of width %d (expected %d)."),
2136 length, i, width, alloc_width);
2140 dict_delete_consecutive_vars (dict, idx + 1, n_segments - 1);
2141 var_set_width (var, length);
2143 close_text_record (r, text);
2144 dict_compact_values (dict);
2149 #define MAX_LABEL_WARNINGS 5
2151 /* Displays a warning for offset OFFSET in the file. */
2153 value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings,
2154 const char *format, ...)
2156 if (++*n_label_warnings > MAX_LABEL_WARNINGS)
2161 va_start (args, format);
2162 sys_msg (r, offset, MW, format, args);
2166 #define MAX_LABEL_WARNINGS 5
2169 parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict,
2170 const struct sfm_var_record *var_recs,
2172 const struct sfm_value_label_record *record,
2173 int *n_label_warnings)
2176 = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2177 for (size_t i = 0; i < record->n_labels; i++)
2178 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2179 record->labels[i].label, -1,
2182 struct variable **vars = pool_nmalloc (r->pool,
2183 record->n_vars, sizeof *vars);
2184 unsigned int n_vars = 0;
2185 for (size_t i = 0; i < record->n_vars; i++)
2187 int idx = record->vars[i];
2188 if (idx < 1 || idx > n_var_recs)
2190 value_label_warning (
2191 r, record->pos, n_label_warnings,
2192 _("Value label variable index %d not in valid range 1...%zu."),
2197 const struct sfm_var_record *rec = &var_recs[idx - 1];
2198 if (rec->var == NULL)
2200 value_label_warning (
2201 r, record->pos, n_label_warnings,
2202 _("Value label variable index %d "
2203 "refers to long string continuation."), idx);
2207 vars[n_vars++] = rec->var;
2212 for (size_t i = 1; i < n_vars; i++)
2213 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2215 value_label_warning (
2216 r, record->pos, n_label_warnings,
2217 _("Variables associated with value label are not all of "
2218 "identical type. Variable %s is %s, but variable "
2220 var_get_name (vars[0]),
2221 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2222 var_get_name (vars[i]),
2223 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2227 for (size_t i = 0; i < n_vars; i++)
2229 struct variable *var = vars[i];
2230 int width = var_get_width (var);
2233 value_label_warning (
2234 r, record->pos, n_label_warnings,
2235 _("Value labels may not be added to long string "
2236 "variables (e.g. %s) using records types 3 and 4."),
2237 var_get_name (var));
2241 for (size_t j = 0; j < record->n_labels; j++)
2243 struct sfm_value_label *label = &record->labels[j];
2246 value_init (&value, width);
2248 value.f = parse_float (r, label->value, 0);
2250 memcpy (value.s, label->value, width);
2252 if (!var_add_value_label (var, &value, utf8_labels[j]))
2254 if (r->written_by_readstat)
2256 /* Ignore the problem. ReadStat is buggy and emits value
2257 labels whose values are longer than string variables'
2258 widths, that are identical in the actual width of the
2259 variable, e.g. both values "ABC123" and "ABC456" for a
2260 string variable with width 3. */
2262 else if (var_is_numeric (var))
2263 value_label_warning (r, record->pos, n_label_warnings,
2264 _("Duplicate value label for %g on %s."),
2265 value.f, var_get_name (var));
2267 value_label_warning (
2268 r, record->pos, n_label_warnings,
2269 _("Duplicate value label for `%.*s' on %s."),
2270 width, value.s, var_get_name (var));
2273 value_destroy (&value, width);
2277 pool_free (r->pool, vars);
2278 for (size_t i = 0; i < record->n_labels; i++)
2279 pool_free (r->pool, utf8_labels[i]);
2280 pool_free (r->pool, utf8_labels);
2284 parse_value_labels (struct sfm_reader *r, struct dictionary *dict)
2286 int n_label_warnings = 0;
2287 for (size_t i = 0; i < r->n_labels; i++)
2288 parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i],
2290 if (n_label_warnings > MAX_LABEL_WARNINGS)
2292 _("Suppressed %d additional warnings for value labels."),
2293 n_label_warnings - MAX_LABEL_WARNINGS);
2296 static struct variable *
2297 parse_weight_var (struct sfm_reader *r,
2298 const struct sfm_var_record *var_recs, size_t n_var_recs,
2301 off_t offset = 76; /* Offset to variable index in header. */
2303 if (idx < 1 || idx > n_var_recs)
2305 sys_warn (r, offset,
2306 _("Weight variable index %d not in valid range 1...%zu. "
2307 "Treating file as unweighted."),
2312 const struct sfm_var_record *rec = &var_recs[idx - 1];
2313 if (rec->var == NULL)
2315 sys_warn (r, offset,
2316 _("Weight variable index %d refers to long string "
2317 "continuation. Treating file as unweighted."), idx);
2321 struct variable *weight_var = rec->var;
2322 if (!var_is_numeric (weight_var))
2324 sys_warn (r, offset, _("Ignoring string variable `%s' set "
2325 "as weighting variable."),
2326 var_get_name (weight_var));
/* Parses a set of custom attributes from TEXT into ATTRS.
   ATTRS may be a null pointer, in which case the attributes are
   read but discarded.

   Each attribute is a key, then '(', then one or more values, each on a
   line of its own, then ')'.  Parsing continues until an attribute is
   followed by '/' or until no further key can be read. */
static void
parse_attributes (struct sfm_reader *r, struct text_record *text,
                  struct attrset *attrs)
{
  do
    {
      /* Parse the key. */
      char *key = text_get_token (text, ss_cstr ("("), NULL);
      if (key == NULL)
        return;

      /* Parse the values. */
      struct attribute *attr = attribute_create (key);
      int index = 1;
      for (;;)
        {
          char *value = text_get_token (text, ss_cstr ("\n"), NULL);
          if (value == NULL)
            {
              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
                         key, index);
              break;
            }

          /* A value is supposed to be enclosed in single quotes, which
             are dropped.  An unquoted value is accepted as is, with a
             warning. */
          size_t length = strlen (value);
          bool quoted = (length >= 2
                         && value[0] == '\'' && value[length - 1] == '\'');
          if (quoted)
            {
              value[length - 1] = '\0';
              attribute_add_value (attr, value + 1);
            }
          else
            {
              text_warn (r, text,
                         _("Attribute value %s[%d] is not quoted: %s."),
                         key, index, value);
              attribute_add_value (attr, value);
            }

          /* Was this the last value for this attribute? */
          if (text_match (text, ')'))
            break;

          index++;
        }

      /* Keep the attribute only if there is somewhere to put it, it has
         at least one value, and it is not a duplicate. */
      if (attrs == NULL || attribute_get_n_values (attr) == 0)
        attribute_destroy (attr);
      else if (!attrset_try_add (attrs, attr))
        {
          text_warn (r, text, _("Duplicate attribute %s."),
                     attribute_get_name (attr));
          attribute_destroy (attr);
        }
    }
  while (!text_match (text, '/'));
}
/* Reads record type 7, subtype 17, which lists custom
   attributes on the data file, and adds those attributes to DICT's
   attribute set. */
static void
parse_data_file_attributes (struct sfm_reader *r,
                            const struct sfm_extension_record *record,
                            struct dictionary *dict)
{
  struct attrset *file_attrs = dict_get_attributes (dict);
  struct text_record *text = open_text_record (r, record, true);

  parse_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
/* Parses record type 7, subtype 18, which lists custom
   attributes on individual variables.

   Each group of attributes follows a variable name that ends in ':'.
   When the name does not yield a variable, the group is still parsed,
   so that it gets consumed, but its attributes are discarded. */
static void
parse_variable_attributes (struct sfm_reader *r,
                           const struct sfm_extension_record *record,
                           struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, record, true);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      struct attrset *var_attrs = NULL;
      if (var != NULL)
        var_attrs = var_get_attributes (var);
      parse_attributes (r, text, var_attrs);
    }
  close_text_record (r, text);
}
2428 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2430 size_t n_warnings = 0;
2433 for (i = 0; i < dict_get_n_vars (dict); i++)
2435 struct variable *var = dict_get_var (dict, i);
2436 struct attrset *attrs = var_get_attributes (var);
2437 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2438 if (attr != NULL && attribute_get_n_values (attr) > 0)
2440 int value = atoi (attribute_get_value (attr, 0));
2462 role = ROLE_PARTITION;
2471 if (n_warnings++ == 0)
2472 sys_warn (r, -1, _("Invalid role for variable %s."),
2473 var_get_name (var));
2476 var_set_role (var, role);
2481 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2486 check_overflow__ (const struct sfm_extension_record *record,
2487 size_t ofs, size_t length)
2489 size_t end = record->size * record->count;
2490 if (length >= end || ofs + length > end)
/* Like check_overflow__(), but also issues a warning through R when the
   check fails.  The warning's file offset is the end of RECORD's data
   (RECORD->pos plus the record's total size). */
2496 check_overflow (struct sfm_reader *r,
2497 const struct sfm_extension_record *record,
2498 size_t ofs, size_t length)
2500 bool ok = check_overflow__ (record, ofs, length);
2502 sys_warn (r, record->pos + record->size * record->count,
2503 _("Extension record subtype %d ends unexpectedly."),
/* Parses RECORD as a long string value labels record and adds the labels it
   describes to variables in DICT.

   Each entry holds a variable name length, the name, the declared width, the
   number of labels, and then that many (value length, value, label length,
   label) groups.  Integers are decoded with parse_int(); names and labels
   are passed through recode_string_pool() with "UTF-8" and DICT's encoding.
   Every field is bounds-checked with check_overflow() before it is read.
   Entries for an unknown variable, a numeric variable, or a variable whose
   width differs from the record's are reported with a warning. */
2509 parse_long_string_value_labels (struct sfm_reader *r,
2510 const struct sfm_extension_record *record,
2511 struct dictionary *dict)
2513 const char *dict_encoding = dict_get_encoding (dict);
2514 size_t end = record->size * record->count;
2521 struct variable *var;
2526 /* Parse variable name length. */
2527 if (!check_overflow (r, record, ofs, 4))
2529 var_name_len = parse_int (r, record->data, ofs);
2532 /* Parse variable name, width, and number of labels.  The name is followed
   by two 4-byte integers, hence the second check with 8 extra bytes. */
2533 if (!check_overflow (r, record, ofs, var_name_len)
2534 || !check_overflow (r, record, ofs, var_name_len + 8))
2536 var_name = recode_string_pool ("UTF-8", dict_encoding,
2537 (const char *) record->data + ofs,
2538 var_name_len, r->pool);
2539 width = parse_int (r, record->data, ofs + var_name_len);
2540 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2541 ofs += var_name_len + 8;
2543 /* Look up 'var' and validate. */
2544 var = dict_lookup_var (dict, var_name);
2546 sys_warn (r, record->pos + ofs,
2547 _("Ignoring long string value label record for "
2548 "unknown variable %s."), var_name);
2549 else if (var_is_numeric (var))
2551 sys_warn (r, record->pos + ofs,
2552 _("Ignoring long string value label record for "
2553 "numeric variable %s."), var_name);
2556 else if (width != var_get_width (var))
2558 sys_warn (r, record->pos + ofs,
2559 _("Ignoring long string value label record for variable "
2560 "%s because the record's width (%d) does not match the "
2561 "variable's width (%d)."),
2562 var_name, width, var_get_width (var));
/* VALUE is a scratch value of the declared width, reused for each label. */
2567 value_init_pool (r->pool, &value, width);
2568 for (i = 0; i < n_labels; i++)
2570 size_t value_length, label_length;
/* SKIP starts out true when there is no variable to attach labels to. */
2571 bool skip = var == NULL;
2573 /* Parse value length. */
2574 if (!check_overflow (r, record, ofs, 4))
2576 value_length = parse_int (r, record->data, ofs);
2580 if (!check_overflow (r, record, ofs, value_length))
/* The value is copied only when its length equals the declared width. */
2584 if (value_length == width)
2585 memcpy (value.s, (const uint8_t *) record->data + ofs, width);
2588 sys_warn (r, record->pos + ofs,
2589 _("Ignoring long string value label %zu for "
2590 "variable %s, with width %d, that has bad value "
2592 i, var_get_name (var), width, value_length);
2596 ofs += value_length;
2598 /* Parse label length. */
2599 if (!check_overflow (r, record, ofs, 4))
2601 label_length = parse_int (r, record->data, ofs);
2605 if (!check_overflow (r, record, ofs, label_length))
/* The recoded label is temporary: it is released again right after
   var_add_value_label() has been called with it. */
2611 label = recode_string_pool ("UTF-8", dict_encoding,
2612 (const char *) record->data + ofs,
2613 label_length, r->pool);
2614 if (!var_add_value_label (var, &value, label))
2615 sys_warn (r, record->pos + ofs,
2616 _("Duplicate value label for `%.*s' on %s."),
2617 width, value.s, var_get_name (var));
2618 pool_free (r->pool, label);
2620 ofs += label_length;
/* Parses RECORD as a long string missing values record and installs the
   missing values it describes on variables in DICT.

   Each entry holds a variable name length, the name, a one-byte count of
   missing values, a 4-byte value length, and then the values.  Every field
   is bounds-checked before it is read.  Entries for unknown or numeric
   variables, and counts outside 1 to 3, are reported with a warning. */
2626 parse_long_string_missing_values (struct sfm_reader *r,
2627 const struct sfm_extension_record *record,
2628 struct dictionary *dict)
2630 const char *dict_encoding = dict_get_encoding (dict);
2631 size_t end = record->size * record->count;
2634 bool warned = false;
2637 struct missing_values mv;
2639 struct variable *var;
2640 int n_missing_values;
2644 /* Parse variable name length. */
2645 if (!check_overflow (r, record, ofs, 4))
2647 var_name_len = parse_int (r, record->data, ofs);
2650 /* Parse variable name.  The extra byte in the second check covers the
   one-byte missing value count that follows the name. */
2651 if (!check_overflow (r, record, ofs, var_name_len)
2652 || !check_overflow (r, record, ofs, var_name_len + 1))
2654 var_name = recode_string_pool ("UTF-8", dict_encoding,
2655 (const char *) record->data + ofs,
2656 var_name_len, r->pool);
2657 ofs += var_name_len;
2659 /* Parse number of missing values. */
2660 n_missing_values = ((const uint8_t *) record->data)[ofs];
2661 if (n_missing_values < 1 || n_missing_values > 3)
2662 sys_warn (r, record->pos + ofs,
2663 _("Long string missing values record says variable %s "
2664 "has %d missing values, but only 1 to 3 missing values "
2666 var_name, n_missing_values);
2669 /* Look up 'var' and validate. */
2670 var = dict_lookup_var (dict, var_name);
2672 sys_warn (r, record->pos + ofs,
2673 _("Ignoring long string missing value record for "
2674 "unknown variable %s."), var_name);
2675 else if (var_is_numeric (var))
2677 sys_warn (r, record->pos + ofs,
2678 _("Ignoring long string missing value record for "
2679 "numeric variable %s."), var_name);
2683 /* Parse value length. */
2684 if (!check_overflow (r, record, ofs, 4))
2686 size_t value_length = parse_int (r, record->data, ofs);
/* With no variable available, width 8 is used for the scratch set. */
2690 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2691 for (i = 0; i < n_missing_values; i++)
2693 /* Tolerate files written by old, buggy versions of PSPP where we
2694 believed that the value_length was repeated before each missing value.
   The check below uses check_overflow__() so that merely probing for the
   repeated length does not itself produce a warning. */
2696 if (check_overflow__ (record, ofs, value_length)
2697 && parse_int (r, record->data, ofs) == 8)
2701 sys_warn (r, record->pos + ofs,
2702 _("This file has corrupted metadata written by a "
2703 "buggy version of PSPP.  To fix it, save a new "
2704 "copy of the file."));
2711 if (!check_overflow (r, record, ofs, value_length))
2715 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2717 sys_warn (r, record->pos + ofs,
2718 _("Ignoring long string missing value %zu for variable "
2719 "%s, with width %d, that has bad value width %zu."),
2720 i, var_get_name (var), var_get_width (var),
2722 ofs += value_length;
2725 var_set_missing_values (var, &mv);
/* Parses RECORD as text describing variable sets and adds each set to DICT.

   A set is written as a name terminated by '=', followed by variable names
   separated by spaces; the list of names ends at a delimiter other than a
   space.  Names that DICT does not contain are reported and left out. */
2730 parse_var_sets (struct sfm_reader *r,
2731 const struct sfm_extension_record *record,
2732 struct dictionary *dict)
2734 struct text_record *text = open_text_record (r, record, true);
2737 char *varset_name = text_get_token (text, ss_cstr ("="), NULL);
2741 struct varset *varset = xmalloc (sizeof *varset);
2742 *varset = (struct varset) {
2743 .name = xstrdup (varset_name),
/* Consume one space after the '=' if there is one. */
2746 text_match (text, ' ');
2748 size_t allocated_vars = 0;
2752 char *var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
/* Drop a trailing carriage return so that CR LF line ends are accepted. */
2756 size_t len = strlen (var_name);
2757 if (len > 0 && var_name[len - 1] == '\r')
2758 var_name[len - 1] = '\0';
2760 struct variable *var = dict_lookup_var (dict, var_name);
/* Grow the member array as needed before appending. */
2763 if (varset->n_vars >= allocated_vars)
2764 varset->vars = x2nrealloc (varset->vars, &allocated_vars,
2765 sizeof *varset->vars);
2766 varset->vars[varset->n_vars++] = var;
2769 sys_warn (r, record->pos,
2770 _("Variable set %s contains unknown variable %s."),
2771 varset_name, var_name);
2773 while (delimiter == ' ');
2775 dict_add_varset (dict, varset);
2777 close_text_record (r, text);
/* Forward declarations for the helpers that read case data.  The functions
   declared `int' use a three-way result (see the comments on their
   definitions); those declared `bool' report only success or failure. */
2782 static void partial_record (struct sfm_reader *);
2784 static void read_error (struct casereader *, const struct sfm_reader *);
2786 static bool read_case_number (struct sfm_reader *, double *);
2787 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2788 static int read_opcode (struct sfm_reader *);
2789 static bool read_compressed_number (struct sfm_reader *, double *);
2790 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2791 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2792 static bool skip_whole_strings (struct sfm_reader *, size_t);
2794 /* Reads and returns one case from READER's file.  Returns a null
2795 pointer if not successful. */
2796 static struct ccase *
2797 sys_file_casereader_read (struct casereader *reader, void *r_)
2799 struct sfm_reader *r = r_;
/* Nothing can be read once R is in an error state, or when the file has no
   variables. */
2804 if (r->error || !r->sfm_n_vars)
2807 c = case_create (r->proto);
2809 for (i = 0; i < r->sfm_n_vars; i++)
2811 struct sfm_var *sv = &r->sfm_vars[i];
2812 union value *v = case_data_rw_idx (c, sv->case_index);
/* A width of 0 is read as a number.  Otherwise SV describes a segment of a
   string value: SEGMENT_WIDTH bytes are stored at OFFSET within the value,
   and the segment's padding, rounded down to a multiple of 8, is skipped. */
2814 if (sv->var_width == 0)
2815 retval = read_case_number (r, &v->f);
2818 retval = read_case_string (r, v->s + sv->offset, sv->segment_width);
2821 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2823 sys_error (r, r->pos, _("File ends in partial string value."));
/* A case count of -1 is taken to mean that no count is available, in which
   case the failure is not reported as a read error -- TODO confirm against
   where N_CASES is set. */
2835 if (r->n_cases != -1)
2836 read_error (reader, r);
2841 /* Issues an error that R ends in a partial record.  The error is reported
   at R's current file position. */
2843 partial_record (struct sfm_reader *r)
2845 sys_error (r, r->pos, _("File ends in partial case."));
2848 /* Issues an error that an unspecified error occurred reading SFM's file, and
   forces casereader R into an error state. */
2851 read_error (struct casereader *r, const struct sfm_reader *sfm)
2853 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2854 casereader_force_error (r);
2857 /* Reads a number from R and stores its value in *D.
2858 If R is compressed, reads a compressed number;
2859 otherwise, reads a number in the regular way.
2860 Returns true if successful, false if end of file is
2861 reached immediately. */
2863 read_case_number (struct sfm_reader *r, double *d)
2865 if (r->compression == ANY_COMP_NONE)
/* NOTE(review): try_read_bytes() is documented to return -1 on an I/O error
   or partial read.  The `!' test below only catches 0, so a -1 result would
   fall through to float_convert() -- confirm whether that is intended. */
2868 if (!try_read_bytes (r, number, sizeof number))
2870 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2874 return read_compressed_number (r, d);
2877 /* Reads LENGTH string bytes from R into S.  Always reads a multiple of 8
2878 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2879 discarded without being written to S.  Reads compressed strings if R is
2880 compressed.  Returns 1 if successful, 0 if end of file is reached
2881 immediately, or -1 for some kind of error. */
2883 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
/* WHOLE is the part of LENGTH made of complete 8-byte units, which can be
   read straight into S.  PARTIAL is the remainder; it is read as a full
   unit into a bounce buffer, from which only PARTIAL bytes are copied. */
2885 size_t whole = ROUND_DOWN (length, 8);
2886 size_t partial = length % 8;
2890 int retval = read_whole_strings (r, s, whole);
2898 int retval = read_whole_strings (r, bounce, sizeof bounce);
2910 memcpy (s + whole, bounce, partial);
2916 /* Reads and returns the next compression opcode from R.  Must only be
   called when R is compressed. */
2918 read_opcode (struct sfm_reader *r)
2920 assert (r->compression != ANY_COMP_NONE);
/* Opcodes are buffered in R->opcodes.  When the index has reached the end of
   that buffer, a fresh batch is read from the compressed stream. */
2924 if (r->opcode_idx >= sizeof r->opcodes)
2927 int retval = try_read_compressed_bytes (r, r->opcodes,
2933 opcode = r->opcodes[r->opcode_idx++];
2940 /* Reads a compressed number from R and stores its value in D.
2941 Returns true if successful, false if end of file is
2942 reached immediately. */
2944 read_compressed_number (struct sfm_reader *r, double *d)
2946 int opcode = read_opcode (r);
/* Depending on the opcode, the number is either read literally from the
   stream, produced by decoding a fixed string with float_convert() (with a
   one-time corruption warning), or computed as the opcode minus R's bias. */
2954 return read_compressed_float (r, d);
2957 float_convert (r->float_format, "        ", FLOAT_NATIVE_DOUBLE, d);
2958 if (!r->corruption_warning)
2960 r->corruption_warning = true;
2961 sys_warn (r, r->pos,
2962 _("Possible compressed data corruption: "
2963 "compressed spaces appear in numeric field."));
2972 *d = opcode - r->bias;
2979 /* Reads a compressed 8-byte string segment from R and stores it in DST.
   When the segment is read literally from the stream, any result other than
   1 from read_compressed_bytes() is reported to the caller as -1. */
2981 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2986 opcode = read_opcode (r);
2994 retval = read_compressed_bytes (r, dst, 8);
2995 return retval == 1 ? 1 : -1;
/* A segment may also be filled with 8 spaces... */
2998 memset (dst, ' ', 8);
/* ...or, for an opcode that stands for a compressed integer, with the
   file-format encoding of that number. */
3003 double value = opcode - r->bias;
3004 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
3007 /* This has actually been seen "in the wild".  The submitter of the
3008 file that showed that the contents decoded as spaces, but they
3009 were at the end of the field so it's possible that the null
3010 bytes just acted as null terminators. */
3012 else if (!r->corruption_warning)
3014 r->corruption_warning = true;
3015 sys_warn (r, r->pos,
3016 _("Possible compressed data corruption: "
3017 "string contains compressed integer (opcode %d)."),
3025 /* Reads LENGTH string bytes from R into S.  LENGTH must be a multiple of 8.
3026 Reads compressed strings if R is compressed.  Returns 1 if successful, 0 if
3027 end of file is reached immediately, or -1 for some kind of error. */
3029 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
3031 assert (length % 8 == 0);
/* Uncompressed data can be read in one go. */
3032 if (r->compression == ANY_COMP_NONE)
3033 return try_read_bytes (r, s, length);
/* Compressed data is decoded one 8-byte segment at a time. */
3038 for (ofs = 0; ofs < length; ofs += 8)
3040 int retval = read_compressed_string (r, s + ofs);
3055 /* Skips LENGTH string bytes from R.
3056 LENGTH must be a multiple of 8.
3057 (LENGTH is also limited to 1024, but that's only because the
3058 current caller never needs more than that many bytes.)
3059 Returns true if successful, false if end of file is
3060 reached immediately. */
3062 skip_whole_strings (struct sfm_reader *r, size_t length)
/* The bytes are read into a scratch buffer and thrown away. */
3064 uint8_t buffer[1024];
/* NOTE(review): the assertion requires LENGTH to be strictly less than the
   buffer size, so a LENGTH of exactly 1024 is rejected although it would
   fit and the comment above suggests it is allowed. */
3065 assert (length < sizeof buffer);
/* NOTE(review): read_whole_strings() is documented to return -1 on error,
   while this function is declared `bool' in the forward declarations.  A -1
   result would convert to true; confirm that callers cope with that. */
3066 return read_whole_strings (r, buffer, length);
3069 /* Helpers for reading records that contain structured text. */
3072 /* Maximum number of warnings to issue for a single text record.  Warnings
   beyond this number are counted but not displayed; see text_warn() and
   close_text_record(). */
3074 #define MAX_TEXT_WARNINGS 5
/* State for parsing one extension record as text. */
3079 struct substring buffer; /* Record contents. */
3080 off_t start; /* Starting offset in file. */
3081 size_t pos; /* Current position in buffer. */
3082 int n_warnings; /* Number of warnings issued or suppressed. */
3083 bool recoded; /* Recoded into UTF-8? */
/* Creates and returns a text record for parsing RECORD's data.  The new
   structure is allocated from R's pool.  If RECODE_TO_UTF8 is true, the
   data is recoded from R's encoding into UTF-8 in pool storage. */
3086 static struct text_record *
3087 open_text_record (struct sfm_reader *r,
3088 const struct sfm_extension_record *record,
3089 bool recode_to_utf8)
3091 struct text_record *text;
3092 struct substring raw;
3094 text = pool_alloc (r->pool, sizeof *text);
/* RAW covers the whole of the record's data: SIZE * COUNT bytes. */
3095 raw = ss_buffer (record->data, record->size * record->count);
3096 text->start = record->pos;
3097 text->buffer = (recode_to_utf8
3098 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
3101 text->n_warnings = 0;
3102 text->recoded = recode_to_utf8;
3107 /* Closes TEXT, frees its storage, and issues a final warning
3108 about suppressed warnings if necessary.  The final warning reports how
   many warnings exceeded MAX_TEXT_WARNINGS. */
3110 close_text_record (struct sfm_reader *r, struct text_record *text)
3112 if (text->n_warnings > MAX_TEXT_WARNINGS)
3113 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
3114 text->n_warnings - MAX_TEXT_WARNINGS);
3116 pool_free (r->pool, ss_data (text->buffer));
3119 /* Reads a variable=value pair from TEXT.
3120 Looks up the variable in DICT and stores it into *VAR.
3121 Stores a null-terminated value into *VALUE. */
3123 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
3124 struct text_record *text,
3125 struct variable **var, char **value)
/* The name runs up to '='. */
3129 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
/* The value runs up to a tab or a null byte; the delimiter set is given
   with an explicit length of 2 so that the null byte is part of it. */
3132 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
/* Step over any further tabs or null bytes that follow the value. */
3136 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
3137 ss_buffer ("\t\0", 2));
/* Reads a variable name from TEXT, ended by one of DELIMITERS, looks it up
   in DICT, and stores the result of the lookup in *VAR.  A warning that
   counts against TEXT's warning limit is issued for a name that the
   dictionary does not contain. */
3145 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3146 struct text_record *text, struct substring delimiters,
3147 struct variable **var)
3151 name = text_get_token (text, delimiters, NULL);
3155 *var = dict_lookup_var (dict, name);
3159 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads a short variable name from TEXT, ended by one of DELIMITERS, looks
   it up in DICT, and stores the result of the lookup in *VAR.  A warning
   that counts against TEXT's warning limit is issued for a name that the
   dictionary does not contain. */
3166 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3167 struct text_record *text, struct substring delimiters,
3168 struct variable **var)
3170 char *short_name = text_get_token (text, delimiters, NULL);
/* A null token means that TEXT has no more tokens. */
3171 if (short_name == NULL)
3174 *var = dict_lookup_var (dict, short_name);
3176 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3181 /* Displays a warning for the current file position, limiting the
3182 number to MAX_TEXT_WARNINGS for TEXT.  FORMAT and the arguments that
   follow it are used as for printf().  The warning counter is incremented
   whether or not the warning is displayed, so that close_text_record() can
   report how many were suppressed. */
3184 text_warn (struct sfm_reader *r, struct text_record *text,
3185 const char *format, ...)
3187 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3191 va_start (args, format);
/* The offset reported is TEXT's starting file offset plus the position
   within its buffer. */
3192 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts the next token from TEXT, where tokens are separated by any of
   the bytes in DELIMITERS, and advances TEXT's position past it.  The
   return value points into TEXT's own buffer.  If DELIMITER is nonnull, the
   byte just before the new position is stored in *DELIMITER. */
3198 text_get_token (struct text_record *text, struct substring delimiters,
3201 struct substring token;
3204 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3206 if (delimiter != NULL)
3207 *delimiter = ss_data (text->buffer)[text->pos-1];
/* END addresses the byte just past the token. */
3211 end = &ss_data (token)[ss_length (token)];
3212 if (delimiter != NULL)
3215 return ss_data (token);
3218 /* Reads an integer value expressed in decimal, then a space, then a string that
3219 consists of exactly as many bytes as specified by the integer, then a space,
3220 from TEXT.  Returns the string, null-terminated, as a subset of TEXT's
3221 buffer (so the caller should not free the string). */
3223 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
/* Accumulate the decimal byte count.
   NOTE(review): no overflow guard is visible on this accumulation; a very
   long run of digits could wrap N and with it the `text->pos + n' bound
   check further down -- confirm. */
3231 while (text->pos < text->buffer.length)
3233 int c = text->buffer.string[text->pos];
3234 if (c < '0' || c > '9')
3236 n = (n * 10) + (c - '0');
/* Fail if the record ended, or if no digit was consumed. */
3239 if (text->pos >= text->buffer.length || start == text->pos)
3241 sys_warn (r, text->start,
3242 _("Expecting digit at offset %zu in MRSETS record."),
3247 if (!text_match (text, ' '))
3249 sys_warn (r, text->start,
3250 _("Expecting space at offset %zu in MRSETS record."),
/* The counted string must lie entirely within the record. */
3255 if (text->pos + n > text->buffer.length)
3257 sys_warn (r, text->start,
3258 _("%zu-byte string starting at offset %zu "
3259 "exceeds record length %zu."),
3260 n, text->pos, text->buffer.length);
3264 s = &text->buffer.string[text->pos];
3267 sys_warn (r, text->start,
3268 _("Expecting space at offset %zu following %zu-byte string."),
/* Tests whether the byte at TEXT's current position is C.  Nothing can match
   once the position has reached the end of the buffer.  Callers use this to
   consume an expected delimiter, such as a space or '/'. */
3278 text_match (struct text_record *text, char c)
3280 if (text->pos >= text->buffer.length)
3283 if (text->buffer.string[text->pos] == c)
3292 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3293 inside the TEXT's string. */
3295 text_pos (const struct text_record *text)
/* Returns the start of TEXT's buffer, regardless of the current position.
   The pointer refers to TEXT's own storage. */
3301 text_get_all (const struct text_record *text)
3303 return text->buffer.string;
3308 /* Displays a corruption message.  CLASS selects the message's category and
   severity.  The message text is R's file name, optionally with OFFSET
   printed in hexadecimal, followed by FORMAT expanded with ARGS. */
3310 sys_msg (struct sfm_reader *r, off_t offset,
3311 int class, const char *format, va_list args)
3315 ds_init_empty (&text);
/* Two prefixes exist: one that names the offset and one that does not.
   Callers in this file pass -1 when they have no meaningful offset. */
3317 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3318 fh_get_file_name (r->fh), (long long int) offset);
3320 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3321 ds_put_vformat (&text, format, args);
/* The message takes over the string built up in TEXT. */
3323 struct msg *m = xmalloc (sizeof *m);
3325 .category = msg_class_to_category (class),
3326 .severity = msg_class_to_severity (class),
3327 .text = ds_steal_cstr (&text),
3332 /* Displays a warning for offset OFFSET in the file.  FORMAT and the
   arguments that follow it are used as for printf(). */
3334 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3338 va_start (args, format);
3339 sys_msg (r, offset, MW, format, args);
3343 /* Displays an error for offset OFFSET in the file and marks R as in an error
   state.  FORMAT and the arguments that follow it are used as for
   printf(). */
3346 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3350 va_start (args, format);
3351 sys_msg (r, offset, ME, format, args);
3357 /* Reads N_BYTES bytes into BUF.
3358 Returns 1 if exactly N_BYTES bytes are successfully read.
3359 Returns -1 if an I/O error or a partial read occurs.
3360 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
   an error.  R's position is advanced by the number of bytes actually
   read, even when the read falls short. */
3363 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3364 void *buf, size_t n_bytes)
3366 size_t bytes_read = fread (buf, 1, n_bytes, r->file);
3367 r->pos += bytes_read;
3368 if (bytes_read == n_bytes)
3370 else if (ferror (r->file))
3372 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
/* A short read without a stream error means end of file.  It is reported
   unless it happened before any byte was read and the caller allows that. */
3375 else if (!eof_is_ok || bytes_read != 0)
3377 sys_error (r, r->pos, _("Unexpected end of file."));
3384 /* Reads N_BYTES bytes into BUF.
3385 Returns true if successful.
3386 Returns false upon I/O error or if end-of-file is encountered. */
3388 read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
/* End of file is not acceptable here, so it is reported as an error. */
3390 return read_bytes_internal (r, false, buf, n_bytes) == 1;
3393 /* Reads N_BYTES bytes into BUF.
3394 Returns 1 if exactly N_BYTES bytes are successfully read.
3395 Returns 0 if an immediate end-of-file is encountered.
3396 Returns -1 if an I/O error or a partial read occurs. */
3398 try_read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
/* An immediate end of file is acceptable here and is not reported. */
3400 return read_bytes_internal (r, true, buf, n_bytes);
3403 /* Reads a 32-bit signed integer from R and stores its value in host format in
3404 *X.  Returns true if successful, otherwise false. */
3406 read_int (struct sfm_reader *r, int *x)
3409 if (read_bytes (r, integer, sizeof integer) != 1)
/* The raw bytes are decoded according to R's integer format. */
3411 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Unsigned counterpart of read_int(): the value is read with read_int() into
   a local variable on the way to *X. */
3416 read_uint (struct sfm_reader *r, unsigned int *x)
3421 ok = read_int (r, &y);
3426 /* Reads a 64-bit signed integer from R and stores its value in host format
   in *X. */
3429 read_int64 (struct sfm_reader *r, long long int *x)
3432 if (read_bytes (r, integer, sizeof integer) != 1)
/* The raw bytes are decoded according to R's integer format. */
3434 *x = integer_get (r->integer_format, integer, sizeof integer);
3438 /* Reads a 64-bit unsigned integer from R, by way of read_int64(), on the way
   to *X. */
3441 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3446 ok = read_int64 (r, &y);
/* Returns the 4-byte integer at byte offset OFS within DATA, decoded
   according to R's integer format.  The caller must have checked that the
   4 bytes are available. */
3452 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3454 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
/* Returns the floating-point number at byte offset OFS within DATA, decoded
   according to R's floating-point format.  The caller must have checked that
   the bytes are available. */
3458 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3460 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3463 /* Reads exactly SIZE - 1 bytes into BUFFER
3464 and stores a null byte into BUFFER[SIZE - 1].  SIZE must therefore be at
   least 1. */
3466 read_string (struct sfm_reader *r, char *buffer, size_t size)
3471 ok = read_bytes (r, buffer, size - 1);
3473 buffer[size - 1] = '\0';
3477 /* Skips BYTES bytes forward in R.  The bytes are read and discarded in
   chunks of at most the size of a local buffer, so this also works on
   input that cannot seek. */
3479 skip_bytes (struct sfm_reader *r, size_t bytes)
3484 size_t chunk = MIN (sizeof buffer, bytes);
3485 if (!read_bytes (r, buffer, chunk))
3493 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3494 been replaced by LFs.
3496 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3497 files that use CR-only line ends in the file label and extra product
   information.) */
3500 fix_line_ends (const char *s)
/* The result is never longer than S, so strlen (S) + 1 bytes suffice. */
3504 d = dst = xmalloc (strlen (s) + 1);
/* Forward declaration: read_zheader() calls read_ztrailer() before the
   latter is defined. */
3523 read_ztrailer (struct sfm_reader *r,
3524 long long int zheader_ofs,
3525 long long int ztrailer_len);
/* Memory allocation callback installed in R->zstream by read_zheader().
   POOL_ is the stream's `opaque' member, which read_zheader() sets to the
   reader's pool.  A zero SIZE, or an ITEMS * SIZE product that
   xalloc_oversized() rejects, is not allocated from the pool. */
3528 zalloc (voidpf pool_, uInt items, uInt size)
3530 struct pool *pool = pool_;
3532 return (!size || xalloc_oversized (items, size)
3534 : pool_malloc (pool, items * size));
/* Memory release callback installed in R->zstream by read_zheader().
   ADDRESS is returned to the pool passed as POOL_. */
3538 zfree (voidpf pool_, voidpf address)
3540 struct pool *pool = pool_;
3542 pool_free (pool, address);
/* Reads and validates the ZLIB data header at R's current position, runs
   the trailer checks in read_ztrailer(), and prepares R's zlib stream for
   inflating.

   The header consists of three 64-bit integers: the header's own offset,
   the trailer's offset, and the trailer's length. */
3546 read_zheader (struct sfm_reader *r)
3549 long long int zheader_ofs;
3550 long long int ztrailer_ofs;
3551 long long int ztrailer_len;
3553 if (!read_int64 (r, &zheader_ofs)
3554 || !read_int64 (r, &ztrailer_ofs)
3555 || !read_int64 (r, &ztrailer_len))
/* The header records its own offset, which must match the position it was
   read from. */
3558 if (zheader_ofs != pos)
3560 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3561 "(expected %#llx)."),
3562 zheader_ofs, (long long int) pos);
/* The trailer cannot start before the current position. */
3566 if (ztrailer_ofs < r->pos)
3568 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
/* The length must be a nonzero multiple of 24 bytes; read_ztrailer() treats
   the first 24 bytes as a fixed part and each further 24 bytes as one block
   descriptor. */
3573 if (ztrailer_len < 24 || ztrailer_len % 24)
3575 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3579 r->ztrailer_ofs = ztrailer_ofs;
3580 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
/* The input and output buffers are allocated from R's pool, only once. */
3583 if (r->zin_buf == NULL)
3585 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3586 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3587 r->zstream.next_in = NULL;
3588 r->zstream.avail_in = 0;
/* zlib allocates and frees through R's pool. */
3591 r->zstream.zalloc = zalloc;
3592 r->zstream.zfree = zfree;
3593 r->zstream.opaque = r->pool;
3595 return open_zstream (r);
/* Positions R's file at absolute offset OFFSET.  A failure of fseeko() is
   reported as an error on R. */
3599 seek (struct sfm_reader *r, off_t offset)
3601 if (fseeko (r->file, offset, SEEK_SET))
3602 sys_error (r, 0, _("%s: seek failed (%s)."),
3603 fh_get_file_name (r->fh), strerror (errno));
3607 /* Performs some additional consistency checks on the ZLIB compressed data
   trailer.

   ZHEADER_OFS is the file offset of the ZLIB data header and ZTRAILER_LEN
   is the trailer length that the header declared.  The trailer is read from
   R->ztrailer_ofs, after which the file is positioned just past the 24-byte
   header again.  The checks are skipped for anything that is not a regular
   file, because they require seeking. */
3610 read_ztrailer (struct sfm_reader *r,
3611 long long int zheader_ofs,
3612 long long int ztrailer_len)
3614 long long int expected_uncmp_ofs;
3615 long long int expected_cmp_ofs;
3618 unsigned int block_size;
3619 unsigned int n_blocks;
3623 if (fstat (fileno (r->file), &s))
3625 sys_error (r, 0, _("%s: stat failed (%s)."),
3626 fh_get_file_name (r->fh), strerror (errno));
3630 if (!S_ISREG (s.st_mode))
3632 /* We can't seek to the trailer and then back to the data in this file,
3633 so skip doing extra checks. */
/* A trailer that does not end exactly at the end of the file only draws a
   warning. */
3637 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3638 sys_warn (r, r->pos,
3639 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3640 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
/* NOTE(review): the result of seek() is not examined at this call -- confirm
   that a failed seek is caught some other way. */
3642 seek (r, r->ztrailer_ofs);
3644 /* Read fixed header from ZLIB data trailer. */
3645 if (!read_int64 (r, &bias))
/* The trailer stores the bias with the opposite sign to R->bias. */
3647 if (-bias != r->bias)
3649 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3650 "file header bias (%.2f)."),
3655 if (!read_int64 (r, &zero))
3658 sys_warn (r, r->pos,
3659 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3661 if (!read_uint (r, &block_size))
3663 if (block_size != ZBLOCK_SIZE)
3664 sys_warn (r, r->pos,
3665 _("ZLIB trailer specifies unexpected %u-byte block size."),
/* The trailer is 24 fixed bytes plus 24 bytes per block descriptor, so the
   block count follows from its length. */
3668 if (!read_uint (r, &n_blocks))
3670 if (n_blocks != (ztrailer_len - 24) / 24)
3672 sys_error (r, r->pos,
3673 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3675 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
/* Walk the block descriptors, tracking where each block is expected to
   start.  Uncompressed offsets count from the ZLIB header's offset, and
   compressed data starts right after the 24-byte header. */
3679 expected_uncmp_ofs = zheader_ofs;
3680 expected_cmp_ofs = zheader_ofs + 24;
3681 for (i = 0; i < n_blocks; i++)
3683 off_t desc_ofs = r->pos;
3684 unsigned long long int uncompressed_ofs;
3685 unsigned long long int compressed_ofs;
3686 unsigned int uncompressed_size;
3687 unsigned int compressed_size;
3689 if (!read_uint64 (r, &uncompressed_ofs)
3690 || !read_uint64 (r, &compressed_ofs)
3691 || !read_uint (r, &uncompressed_size)
3692 || !read_uint (r, &compressed_size))
3695 if (uncompressed_ofs != expected_uncmp_ofs)
3697 sys_error (r, desc_ofs,
3698 _("ZLIB block descriptor %u reported uncompressed data "
3699 "offset %#llx, when %#llx was expected."),
3700 i, uncompressed_ofs, expected_uncmp_ofs);
3704 if (compressed_ofs != expected_cmp_ofs)
3706 sys_error (r, desc_ofs,
3707 _("ZLIB block descriptor %u reported compressed data "
3708 "offset %#llx, when %#llx was expected."),
3709 i, compressed_ofs, expected_cmp_ofs);
/* Every block except the last is compared against the block size for
   equality; the remaining case only checks that it is not larger. */
3713 if (i < n_blocks - 1)
3715 if (uncompressed_size != block_size)
3716 sys_warn (r, desc_ofs,
3717 _("ZLIB block descriptor %u reported block size %#x, "
3718 "when %#x was expected."),
3719 i, uncompressed_size, block_size);
3723 if (uncompressed_size > block_size)
3724 sys_warn (r, desc_ofs,
3725 _("ZLIB block descriptor %u reported block size %#x, "
3726 "when at most %#x was expected."),
3727 i, uncompressed_size, block_size);
3730 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3731 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3732 This code checks for an expansion of more than 14.3% plus 11
   bytes (one seventh is about 14.3%). */
3734 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3736 sys_error (r, desc_ofs,
3737 _("ZLIB block descriptor %u reports compressed size %u "
3738 "and uncompressed size %u."),
3739 i, compressed_size, uncompressed_size);
3743 expected_uncmp_ofs += uncompressed_size;
3744 expected_cmp_ofs += compressed_size;
/* The compressed blocks must end exactly where the trailer begins. */
3747 if (expected_cmp_ofs != r->ztrailer_ofs)
3749 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3750 "would be expected from block descriptors."),
3751 r->ztrailer_ofs, expected_cmp_ofs);
/* Return to the start of the compressed data, just past the header. */
3755 seek (r, zheader_ofs + 24);
/* Empties R's buffer of inflated data and initializes R->zstream for
   inflating.  A failure of inflateInit() is reported as an error on R. */
3760 open_zstream (struct sfm_reader *r)
3764 r->zout_pos = r->zout_end = 0;
3765 error = inflateInit (&r->zstream);
3768 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
/* Ends R->zstream with inflateEnd().  A failure is reported as an error on
   R. */
3776 close_zstream (struct sfm_reader *r)
3780 error = inflateEnd (&r->zstream);
3783 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
/* Reads N_BYTES bytes of inflated data from R's ZLIB stream into BUF_.

   Data already inflated into R->zout_buf is handed out first.  When that
   runs dry, more compressed input is read from the file if the stream has
   none pending, and inflate() refills the output buffer.  Compressed input
   is never read past the start of the ZLIB trailer. */
3791 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t n_bytes)
3793 uint8_t *buf = buf_;
3802 /* Use already inflated data if there is any. */
3803 if (r->zout_pos < r->zout_end)
3805 unsigned int n = MIN (n_bytes, r->zout_end - r->zout_pos);
3806 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3815 /* We need to inflate some more data.
3816 Get some more input data if we don't have any. */
3817 if (r->zstream.avail_in == 0)
/* Read at most one input buffer's worth, and stop at the trailer. */
3819 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3824 int retval = try_read_bytes (r, r->zin_buf, n);
3827 r->zstream.avail_in = n;
3828 r->zstream.next_in = r->zin_buf;
3832 /* Inflate the (remaining) input data. */
3833 r->zstream.avail_out = ZOUT_BUF_SIZE;
3834 r->zstream.next_out = r->zout_buf;
3835 error = inflate (&r->zstream, Z_SYNC_FLUSH);
/* The amount of output produced is how far NEXT_OUT has advanced. */
3837 r->zout_end = r->zstream.next_out - r->zout_buf;
/* With no output at all, the only acceptable outcome is the end of one
   ZLIB stream, in which case the stream is closed and reopened for the
   data that follows. */
3838 if (r->zout_end == 0)
3840 if (error != Z_STREAM_END)
3842 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3846 else if (!close_zstream (r) || !open_zstream (r))
3851 /* Process the output data and ignore 'error' for now.  ZLIB will
3852 present it to us again on the next inflate() call. */
/* Reads N_BYTES bytes of case data into BUF from R's compressed stream.
   With simple compression the bytes come straight from the file through
   read_bytes().  Otherwise they come from the ZLIB stream, and running out
   of compressed data is reported as an error. */
3858 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
3860 if (r->compression == ANY_COMP_SIMPLE)
3861 return read_bytes (r, buf, n_bytes);
3864 int retval = read_bytes_zlib (r, buf, n_bytes);
3866 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
/* Like read_compressed_bytes(), except that the simple-compression path uses
   try_read_bytes(), which does not report an immediate end of file, and the
   ZLIB path passes on read_bytes_zlib()'s result without reporting
   anything. */
3872 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
3874 if (r->compression == ANY_COMP_SIMPLE)
3875 return try_read_bytes (r, buf, n_bytes);
3877 return read_bytes_zlib (r, buf, n_bytes);
3880 /* Reads a 64-bit floating-point number from R's compressed stream and
3881 stores its value, in host format, in *D. */
3883 read_compressed_float (struct sfm_reader *r, double *d)
/* NOTE(review): read_compressed_bytes() can pass on a result from
   read_bytes_zlib(), and read_compressed_string() treats any result other
   than 1 as a failure.  The `!' test here only catches 0 -- confirm that a
   negative result cannot reach float_get_double() with NUMBER unset. */
3887 if (!read_compressed_bytes (r, number, sizeof number))
3890 *d = float_get_double (r->float_format, number);
/* Casereader class for system files.  Its first two members are the
   functions that read one case and that destroy the reader. */
3894 static const struct casereader_class sys_file_casereader_class =
3896 sys_file_casereader_read,
3897 sys_file_casereader_destroy,
3902 const struct any_reader_class sys_file_reader_class =
3904 N_("SPSS System File"),