1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016, 2021 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/ll.h"
49 #include "libpspp/message.h"
50 #include "libpspp/misc.h"
51 #include "libpspp/pool.h"
52 #include "libpspp/str.h"
53 #include "libpspp/stringi-set.h"
55 #include "gl/c-strtod.h"
56 #include "gl/c-ctype.h"
57 #include "gl/inttostr.h"
58 #include "gl/localcharset.h"
59 #include "gl/minmax.h"
60 #include "gl/unlocked-io.h"
61 #include "gl/xalloc.h"
62 #include "gl/xalloc-oversized.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) (msgid)
71 /* subtypes 0-2 unknown */
72 EXT_INTEGER = 3, /* Machine integer info. */
73 EXT_FLOAT = 4, /* Machine floating-point info. */
74 EXT_VAR_SETS = 5, /* Variable sets. */
75 EXT_DATE = 6, /* DATE. */
76 EXT_MRSETS = 7, /* Multiple response sets. */
77 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
78 /* subtype 9 unknown */
79 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
80 EXT_DISPLAY = 11, /* Variable display parameters. */
81 /* subtype 12 unknown */
82 EXT_LONG_NAMES = 13, /* Long variable names. */
83 EXT_LONG_STRINGS = 14, /* Long strings. */
84 /* subtype 15 unknown */
85 EXT_NCASES = 16, /* Extended number of cases. */
86 EXT_FILE_ATTRS = 17, /* Data file attributes. */
87 EXT_VAR_ATTRS = 18, /* Variable attributes. */
88 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
89 EXT_ENCODING = 20, /* Character encoding. */
90 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
91 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
92 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
95 /* Fields from the top-level header record. */
96 struct sfm_header_record
98 char magic[5]; /* First 4 bytes of file, then null. */
99 int weight_idx; /* 0 if unweighted, otherwise a var index. */
100 int nominal_case_size; /* Number of var positions. */
102 /* These correspond to the members of struct any_read_info or a dictionary
103 but in the system file's encoding rather than ASCII. */
104 char creation_date[10]; /* "dd mmm yy". */
105 char creation_time[9]; /* "hh:mm:ss". */
106 char eye_catcher[61]; /* Eye-catcher string, then product name. */
107 char file_label[65]; /* File label. */
110 struct sfm_var_record
117 int missing_value_code;
120 struct variable *var;
123 struct sfm_value_label
129 struct sfm_value_label_record
132 struct sfm_value_label *labels;
133 unsigned int n_labels;
139 struct sfm_document_record
148 const char *name; /* Name. */
149 const char *label; /* Human-readable label for group. */
150 enum mrset_type type; /* Group type. */
151 const char **vars; /* Constituent variables' names. */
152 size_t n_vars; /* Number of constituent variables. */
155 enum mrset_md_cat_source cat_source; /* Source of category labels. */
156 bool label_from_var_label; /* 'label' taken from variable label? */
157 const char *counted; /* Counted value, as string. */
160 struct sfm_extension_record
162 struct ll ll; /* In struct sfm_reader 'var_attrs' list. */
163 int subtype; /* Record subtype. */
164 off_t pos; /* Starting offset in file. */
165 unsigned int size; /* Size of data elements. */
166 unsigned int count; /* Number of data elements. */
167 void *data; /* Contents. */
170 /* System file reader. */
173 struct any_reader any_reader;
175 /* Resource tracking. */
176 struct pool *pool; /* All system file state. */
179 struct any_read_info info;
180 struct sfm_header_record header;
181 struct sfm_var_record *vars;
183 struct sfm_value_label_record *labels;
185 struct sfm_document_record *document;
186 struct sfm_mrset *mrsets;
188 struct sfm_extension_record *extensions[32];
189 struct ll_list var_attrs; /* Contains "struct sfm_extension_record"s. */
192 struct file_handle *fh; /* File handle. */
193 struct fh_lock *lock; /* Mutual exclusion for file handle. */
194 FILE *file; /* File stream. */
195 off_t pos; /* Position in file. */
196 bool error; /* I/O or corruption error? */
197 struct caseproto *proto; /* Format of output cases. */
200 enum integer_format integer_format; /* On-disk integer format. */
201 enum float_format float_format; /* On-disk floating point format. */
202 struct sfm_var *sfm_vars; /* Variables. */
203 size_t sfm_var_cnt; /* Number of variables. */
204 int case_cnt; /* Number of cases */
205 const char *encoding; /* String encoding. */
206 bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */
209 enum any_compression compression;
210 double bias; /* Compression bias, usually 100.0. */
211 uint8_t opcodes[8]; /* Current block of opcodes. */
212 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
213 bool corruption_warning; /* Warned about possible corruption? */
215 /* ZLIB decompression. */
216 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
217 #define ZIN_BUF_SIZE 4096
218 uint8_t *zin_buf; /* Inflation input buffer. */
219 #define ZOUT_BUF_SIZE 16384
220 uint8_t *zout_buf; /* Inflation output buffer. */
221 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
222 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
223 z_stream zstream; /* ZLIB inflater. */
226 static const struct casereader_class sys_file_casereader_class;
228 static struct sfm_reader *
229 sfm_reader_cast (const struct any_reader *r_)
231 assert (r_->klass == &sys_file_reader_class);
232 return UP_CAST (r_, struct sfm_reader, any_reader);
235 static bool sfm_close (struct any_reader *);
237 static void sys_msg (struct sfm_reader *r, off_t, int class,
238 const char *format, va_list args)
239 PRINTF_FORMAT (4, 0);
240 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
241 PRINTF_FORMAT (3, 4);
242 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
243 PRINTF_FORMAT (3, 4);
245 static bool read_bytes (struct sfm_reader *, void *, size_t)
247 static int try_read_bytes (struct sfm_reader *, void *, size_t)
249 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
250 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
251 static bool read_int64 (struct sfm_reader *, long long int *)
253 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
255 static bool read_string (struct sfm_reader *, char *, size_t)
257 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
259 /* ZLIB compressed data handling. */
260 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
261 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
262 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
263 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
265 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
267 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
269 static bool read_compressed_float (struct sfm_reader *, double *)
272 static char *fix_line_ends (const char *);
274 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
275 static double parse_float (const struct sfm_reader *,
276 const void *data, size_t ofs);
278 static bool read_variable_record (struct sfm_reader *,
279 struct sfm_var_record *);
280 static bool read_value_label_record (struct sfm_reader *,
281 struct sfm_value_label_record *);
282 static bool read_document_record (struct sfm_reader *);
283 static bool read_extension_record (struct sfm_reader *, int subtype,
284 struct sfm_extension_record **);
285 static bool skip_extension_record (struct sfm_reader *, int subtype);
287 static struct text_record *open_text_record (
288 struct sfm_reader *, const struct sfm_extension_record *,
289 bool recode_to_utf8);
290 static void close_text_record (struct sfm_reader *,
291 struct text_record *);
292 static bool read_variable_to_value_pair (struct sfm_reader *,
294 struct text_record *,
295 struct variable **var, char **value);
296 static void text_warn (struct sfm_reader *r, struct text_record *text,
297 const char *format, ...) PRINTF_FORMAT (3, 4);
298 static char *text_get_token (struct text_record *,
299 struct substring delimiters, char *delimiter);
300 static bool text_match (struct text_record *, char c);
301 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
302 struct text_record *,
303 struct substring delimiters,
305 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
306 struct text_record *,
307 struct substring delimiters,
309 static const char *text_parse_counted_string (struct sfm_reader *,
310 struct text_record *);
311 static size_t text_pos (const struct text_record *);
312 static const char *text_get_all (const struct text_record *);
314 /* Dictionary reader. */
322 static bool read_dictionary (struct sfm_reader *);
323 static bool read_record (struct sfm_reader *, int type,
324 size_t *allocated_vars, size_t *allocated_labels);
325 static bool read_header (struct sfm_reader *, struct any_read_info *,
326 struct sfm_header_record *);
327 static void parse_header (struct sfm_reader *,
328 const struct sfm_header_record *,
329 struct any_read_info *, struct dictionary *);
330 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
331 struct sfm_var_record *, size_t n);
332 static void parse_format_spec (struct sfm_reader *, off_t pos,
333 unsigned int format, enum which_format,
334 struct variable *, int *format_warning_cnt);
335 static void parse_document (struct dictionary *, struct sfm_document_record *);
336 static void parse_display_parameters (struct sfm_reader *,
337 const struct sfm_extension_record *,
338 struct dictionary *);
339 static bool parse_machine_integer_info (struct sfm_reader *,
340 const struct sfm_extension_record *,
341 struct any_read_info *);
342 static void parse_machine_float_info (struct sfm_reader *,
343 const struct sfm_extension_record *);
344 static void parse_extra_product_info (struct sfm_reader *,
345 const struct sfm_extension_record *,
346 struct any_read_info *);
347 static void parse_mrsets (struct sfm_reader *,
348 const struct sfm_extension_record *,
349 size_t *allocated_mrsets);
350 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
351 static void parse_long_var_name_map (struct sfm_reader *,
352 const struct sfm_extension_record *,
353 struct dictionary *);
354 static bool parse_long_string_map (struct sfm_reader *,
355 const struct sfm_extension_record *,
356 struct dictionary *);
357 static void parse_value_labels (struct sfm_reader *, struct dictionary *);
358 static struct variable *parse_weight_var (struct sfm_reader *,
359 const struct sfm_var_record *, size_t n_var_recs,
361 static void parse_data_file_attributes (struct sfm_reader *,
362 const struct sfm_extension_record *,
363 struct dictionary *);
364 static void parse_variable_attributes (struct sfm_reader *,
365 const struct sfm_extension_record *,
366 struct dictionary *);
367 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
368 static void parse_long_string_value_labels (struct sfm_reader *,
369 const struct sfm_extension_record *,
370 struct dictionary *);
371 static void parse_long_string_missing_values (
372 struct sfm_reader *, const struct sfm_extension_record *,
373 struct dictionary *);
375 /* Frees the strings inside INFO. */
377 any_read_info_destroy (struct any_read_info *info)
381 free (info->creation_date);
382 free (info->creation_time);
383 free (info->product);
384 free (info->product_ext);
388 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
389 successful, otherwise NULL. */
390 static struct any_reader *
391 sfm_open (struct file_handle *fh)
393 size_t allocated_mrsets = 0;
394 struct sfm_reader *r;
396 /* Create and initialize reader. */
397 r = xzalloc (sizeof *r);
398 r->any_reader.klass = &sys_file_reader_class;
399 r->pool = pool_create ();
400 pool_register (r->pool, free, r);
402 r->opcode_idx = sizeof r->opcodes;
403 ll_init (&r->var_attrs);
405 /* TRANSLATORS: this fragment will be interpolated into
406 messages in fh_lock() that identify types of files. */
407 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
411 r->file = fn_open (fh, "rb");
414 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
415 fh_get_file_name (r->fh), strerror (errno));
419 if (!read_dictionary (r))
422 if (r->extensions[EXT_MRSETS] != NULL)
423 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
425 if (r->extensions[EXT_MRSETS2] != NULL)
426 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
428 return &r->any_reader;
432 sfm_close (&r->any_reader);
437 read_dictionary (struct sfm_reader *r)
439 size_t allocated_vars;
440 size_t allocated_labels;
442 if (!read_header (r, &r->info, &r->header))
446 allocated_labels = 0;
451 if (!read_int (r, &type))
455 if (!read_record (r, type, &allocated_vars, &allocated_labels))
459 if (!skip_bytes (r, 4))
462 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
469 read_record (struct sfm_reader *r, int type,
470 size_t *allocated_vars, size_t *allocated_labels)
477 if (r->n_vars >= *allocated_vars)
478 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
480 return read_variable_record (r, &r->vars[r->n_vars++]);
483 if (r->n_labels >= *allocated_labels)
484 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
486 return read_value_label_record (r, &r->labels[r->n_labels++]);
489 /* A Type 4 record is always immediately after a type 3 record,
490 so the code for type 3 records reads the type 4 record too. */
491 sys_error (r, r->pos, _("Misplaced type 4 record."));
495 if (r->document != NULL)
496 sys_warn (r, r->pos, _("Duplicate type 6 (document) record."));
497 return read_document_record (r);
500 if (!read_int (r, &subtype))
503 || subtype >= sizeof r->extensions / sizeof *r->extensions)
506 _("Unrecognized record type 7, subtype %d. For help, "
507 "please send this file to %s and mention that you were "
509 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
510 return skip_extension_record (r, subtype);
512 else if (subtype == 18)
514 /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin"
515 put each variable attribute into a separate record with subtype
516 18. I'm surprised that SPSS puts up with this. */
517 struct sfm_extension_record *ext;
518 bool ok = read_extension_record (r, subtype, &ext);
520 ll_push_tail (&r->var_attrs, &ext->ll);
523 else if (r->extensions[subtype] != NULL)
526 _("Record type 7, subtype %d found here has the same "
527 "type as the record found near offset 0x%llx. For "
528 "help, please send this file to %s and mention that "
529 "you were using %s."),
530 subtype, (long long int) r->extensions[subtype]->pos,
531 PACKAGE_BUGREPORT, PACKAGE_STRING);
532 return skip_extension_record (r, subtype);
535 return read_extension_record (r, subtype, &r->extensions[subtype]);
538 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
545 /* Returns the character encoding obtained from R, or a null pointer if R
546 doesn't have an indication of its character encoding. */
548 sfm_get_encoding (const struct sfm_reader *r)
550 /* The EXT_ENCODING record is the best way to determine dictionary
552 if (r->extensions[EXT_ENCODING])
553 return r->extensions[EXT_ENCODING]->data;
555 /* But EXT_INTEGER is better than nothing as a fallback. */
556 if (r->extensions[EXT_INTEGER])
558 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
559 const char *encoding;
568 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
569 respectively. However, many files have character code 2 but data
570 which are clearly not ASCII. Therefore, ignore these values. */
577 encoding = sys_get_encoding_from_codepage (codepage);
578 if (encoding != NULL)
584 /* If the file magic number is EBCDIC then its character data is too. */
585 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
591 struct get_strings_aux
602 add_string__ (struct get_strings_aux *aux,
603 const char *string, bool id, char *title)
605 if (aux->n >= aux->allocated)
607 aux->allocated = 2 * (aux->allocated + 1);
608 aux->titles = pool_realloc (aux->pool, aux->titles,
609 aux->allocated * sizeof *aux->titles);
610 aux->strings = pool_realloc (aux->pool, aux->strings,
611 aux->allocated * sizeof *aux->strings);
612 aux->ids = pool_realloc (aux->pool, aux->ids,
613 aux->allocated * sizeof *aux->ids);
616 aux->titles[aux->n] = title;
617 aux->strings[aux->n] = pool_strdup (aux->pool, string);
618 aux->ids[aux->n] = id;
622 static void PRINTF_FORMAT (3, 4)
623 add_string (struct get_strings_aux *aux,
624 const char *string, const char *title, ...)
628 va_start (args, title);
629 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
633 static void PRINTF_FORMAT (3, 4)
634 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
638 va_start (args, title);
639 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
644 skip_prefix (const char *s, const char *prefix)
646 size_t prefix_len = strlen (prefix);
647 return !strncmp (s, prefix, prefix_len) ? s + prefix_len : s;
650 /* Retrieves significant string data from R in its raw format, to allow the
651 caller to try to detect the encoding in use.
653 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
654 and *STRINGSP to an array of N elements allocated from POOL. For each I in
655 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
656 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
657 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
660 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
661 char ***titlesp, bool **idsp, char ***stringsp)
663 struct sfm_reader *r = sfm_reader_cast (r_);
664 const struct sfm_mrset *mrset;
665 struct get_strings_aux aux;
677 for (i = 0; i < r->n_vars; i++)
678 if (r->vars[i].width != -1)
679 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
682 for (i = 0; i < r->n_vars; i++)
683 if (r->vars[i].width != -1)
686 if (r->vars[i].label)
687 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
692 for (i = 0; i < r->n_labels; i++)
693 for (j = 0; j < r->labels[i].n_labels; j++)
694 add_string (&aux, r->labels[i].labels[j].label,
695 _("Value Label %zu"), k++);
697 add_string (&aux, r->header.creation_date, _("Creation Date"));
698 add_string (&aux, r->header.creation_time, _("Creation Time"));
699 add_string (&aux, skip_prefix (r->header.eye_catcher, "@(#) "), _("Product"));
700 add_string (&aux, r->header.file_label, _("File Label"));
702 if (r->extensions[EXT_PRODUCT_INFO])
703 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
704 _("Extra Product Info"));
710 for (i = 0; i < r->document->n_lines; i++)
714 memcpy (line, r->document->documents + i * 80, 80);
717 add_string (&aux, line, _("Document Line %zu"), i + 1);
721 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
723 size_t mrset_idx = mrset - r->mrsets + 1;
725 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
727 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
729 /* Skip the variables because they ought to be duplicates. */
732 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
736 /* data file attributes */
737 /* variable attributes */
739 /* long string value labels */
740 /* long string missing values */
742 *titlesp = aux.titles;
744 *stringsp = aux.strings;
748 /* Decodes the dictionary read from R, saving it into *DICTP. Character
749 strings in R are decoded using ENCODING, or an encoding obtained from R if
750 ENCODING is null, or the locale encoding if R specifies no encoding.
752 If INFOP is non-null, then it receives additional info about the system
753 file, which the caller must eventually free with any_read_info_destroy()
754 when it is no longer needed.
756 This function consumes R. The caller must not use it again later, even to
757 destroy it with sfm_close(). */
758 static struct casereader *
759 sfm_decode (struct any_reader *r_, const char *encoding,
760 struct dictionary **dictp, struct any_read_info *infop)
762 struct sfm_reader *r = sfm_reader_cast (r_);
763 struct dictionary *dict;
765 if (encoding == NULL)
767 encoding = sfm_get_encoding (r);
768 if (encoding == NULL)
770 sys_warn (r, -1, _("This system file does not indicate its own "
771 "character encoding. Using default encoding "
772 "%s. For best results, specify an encoding "
773 "explicitly. Use SYSFILE INFO with "
774 "ENCODING=\"DETECT\" to analyze the possible "
777 encoding = locale_charset ();
781 dict = dict_create (encoding);
782 r->encoding = dict_get_encoding (dict);
784 /* These records don't use variables at all. */
785 if (r->document != NULL)
786 parse_document (dict, r->document);
788 if (r->extensions[EXT_INTEGER] != NULL
789 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
792 if (r->extensions[EXT_FLOAT] != NULL)
793 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
795 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
796 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
798 if (r->extensions[EXT_FILE_ATTRS] != NULL)
799 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
801 parse_header (r, &r->header, &r->info, dict);
803 /* Parse the variable records, the basis of almost everything else. */
804 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
807 /* Parse value labels and the weight variable immediately after the variable
808 records. These records use indexes into var_recs[], so we must parse them
809 before those indexes become invalidated by very long string variables. */
810 parse_value_labels (r, dict);
811 if (r->header.weight_idx != 0)
812 dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars,
813 r->header.weight_idx));
815 if (r->extensions[EXT_DISPLAY] != NULL)
816 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
818 /* The following records use short names, so they need to be parsed before
819 parse_long_var_name_map() changes short names to long names. */
820 decode_mrsets (r, dict);
822 if (r->extensions[EXT_LONG_STRINGS] != NULL
823 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
826 /* Now rename variables to their long names. */
827 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
829 /* The following records use long names, so they need to follow renaming. */
830 if (!ll_is_empty (&r->var_attrs))
832 struct sfm_extension_record *ext;
833 ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs)
834 parse_variable_attributes (r, ext, dict);
836 /* Roles use the $@Role attribute. */
837 assign_variable_roles (r, dict);
839 if (r->extensions[EXT_LONG_LABELS] != NULL)
840 parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
841 if (r->extensions[EXT_LONG_MISSING] != NULL)
842 parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
845 /* Warn if the actual amount of data per case differs from the
846 amount that the header claims. SPSS version 13 gets this
847 wrong when very long strings are involved, so don't warn in
849 if (r->header.nominal_case_size > 0
850 && r->header.nominal_case_size != r->n_vars
851 && r->info.version_major != 13)
852 sys_warn (r, -1, _("File header claims %d variable positions but "
853 "%zu were read from file."),
854 r->header.nominal_case_size, r->n_vars);
856 /* Create an index of dictionary variable widths for
857 sfm_read_case to use. We cannot use the `struct variable's
858 from the dictionary we created, because the caller owns the
859 dictionary and may destroy or modify its variables. */
860 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
861 pool_register (r->pool, free, r->sfm_vars);
862 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
868 memset (&r->info, 0, sizeof r->info);
871 return casereader_create_sequential
873 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
874 &sys_file_casereader_class, r);
883 /* Closes R, which should have been returned by sfm_open() but not already
884 closed with sfm_decode() or this function.
885 Returns true if an I/O error has occurred on READER, false
888 sfm_close (struct any_reader *r_)
890 struct sfm_reader *r = sfm_reader_cast (r_);
895 if (fn_close (r->fh, r->file) == EOF)
897 msg (ME, _("Error closing system file `%s': %s."),
898 fh_get_file_name (r->fh), strerror (errno));
904 any_read_info_destroy (&r->info);
909 pool_destroy (r->pool);
914 /* Destroys READER. */
916 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
918 struct sfm_reader *r = r_;
919 sfm_close (&r->any_reader);
922 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
923 a negative errno value if there is an error reading FILE. */
925 sfm_detect (FILE *file)
929 if (fseek (file, 0, SEEK_SET) != 0)
931 if (fread (magic, 4, 1, file) != 1)
932 return ferror (file) ? -errno : 0;
935 return (!strcmp (ASCII_MAGIC, magic)
936 || !strcmp (ASCII_ZMAGIC, magic)
937 || !strcmp (EBCDIC_MAGIC, magic));
940 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
941 except for the string fields in *INFO, which parse_header() will initialize
942 later once the file's encoding is known. */
944 read_header (struct sfm_reader *r, struct any_read_info *info,
945 struct sfm_header_record *header)
947 uint8_t raw_layout_code[4];
952 if (!read_string (r, header->magic, sizeof header->magic)
953 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
955 r->written_by_readstat = strstr (header->eye_catcher,
956 "https://github.com/WizardMac/ReadStat");
958 if (!strcmp (ASCII_MAGIC, header->magic)
959 || !strcmp (EBCDIC_MAGIC, header->magic))
961 else if (!strcmp (ASCII_ZMAGIC, header->magic))
965 sys_error (r, 0, _("This is not an SPSS system file."));
969 /* Identify integer format. */
970 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
972 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
974 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
976 || (r->integer_format != INTEGER_MSB_FIRST
977 && r->integer_format != INTEGER_LSB_FIRST))
979 sys_error (r, 64, _("This is not an SPSS system file."));
983 if (!read_int (r, &header->nominal_case_size))
986 if (header->nominal_case_size < 0
987 || header->nominal_case_size > INT_MAX / 16)
988 header->nominal_case_size = -1;
990 if (!read_int (r, &compressed))
995 r->compression = ANY_COMP_NONE;
996 else if (compressed == 1)
997 r->compression = ANY_COMP_SIMPLE;
1000 sys_error (r, 0, "System file header has invalid compression "
1001 "value %d.", compressed);
1007 if (compressed == 2)
1008 r->compression = ANY_COMP_ZLIB;
1011 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1012 "compression value %d.", compressed);
1017 if (!read_int (r, &header->weight_idx))
1020 if (!read_int (r, &r->case_cnt))
1022 if (r->case_cnt > INT_MAX / 2)
1025 /* Identify floating-point format and obtain compression bias. */
1026 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1028 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1030 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1032 if (memcmp (raw_bias, zero_bias, 8))
1033 sys_warn (r, r->pos - 8,
1034 _("Compression bias is not the usual "
1035 "value of 100, or system file uses unrecognized "
1036 "floating-point format."));
1039 /* Some software is known to write all-zeros to this
1040 field. Such software also writes floating-point
1041 numbers in the format that we expect by default
1042 (it seems that all software most likely does, in
1043 reality), so don't warn in this case. */
1046 if (r->integer_format == INTEGER_MSB_FIRST)
1047 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1049 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1051 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1053 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1054 || !read_string (r, header->creation_time, sizeof header->creation_time)
1055 || !read_string (r, header->file_label, sizeof header->file_label)
1056 || !skip_bytes (r, 3))
1059 info->integer_format = r->integer_format;
1060 info->float_format = r->float_format;
1061 info->compression = r->compression;
1062 info->case_cnt = r->case_cnt;
1067 /* Reads a variable (type 2) record from R into RECORD. */
1069 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1071 int has_variable_label;
1073 memset (record, 0, sizeof *record);
1075 record->pos = r->pos;
1076 if (!read_int (r, &record->width)
1077 || !read_int (r, &has_variable_label)
1078 || !read_int (r, &record->missing_value_code)
1079 || !read_int (r, &record->print_format)
1080 || !read_int (r, &record->write_format)
1081 || !read_string (r, record->name, sizeof record->name))
1084 if (has_variable_label == 1)
1086 enum { MAX_LABEL_LEN = 65536 };
1087 unsigned int len, read_len;
1089 if (!read_uint (r, &len))
1092 /* Read up to MAX_LABEL_LEN bytes of label. */
1093 read_len = MIN (MAX_LABEL_LEN, len);
1094 record->label = pool_malloc (r->pool, read_len + 1);
1095 if (!read_string (r, record->label, read_len + 1))
1098 /* Skip unread label bytes. */
1099 if (!skip_bytes (r, len - read_len))
1102 /* Skip label padding up to multiple of 4 bytes. */
1103 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1106 else if (has_variable_label != 0)
1108 sys_error (r, record->pos,
1109 _("Variable label indicator field is not 0 or 1."));
1113 /* Set missing values. */
1114 if (record->missing_value_code != 0)
1116 int code = record->missing_value_code;
1117 if (record->width == 0)
1119 if (code < -3 || code > 3 || code == -1)
1121 sys_error (r, record->pos,
1122 _("Numeric missing value indicator field is not "
1123 "-3, -2, 0, 1, 2, or 3."));
1129 if (code < 1 || code > 3)
1131 sys_error (r, record->pos,
1132 _("String missing value indicator field is not "
1138 if (!read_bytes (r, record->missing, 8 * abs (code)))
/* Layout consumed here: a count of labels; for each label a fixed-size raw
   value, a one-byte label length, and the label text with padding; then the
   record type of the record that follows, which the error message below
   requires to be a variable index record (type 4); then the number of
   variables and that many variable indexes.  All storage comes from the
   pool of R. */
1145 /* Reads value labels from R into RECORD. */
1147 read_value_label_record (struct sfm_reader *r,
1148 struct sfm_value_label_record *record)
1153 /* Read type 3 record. */
1154 record->pos = r->pos;
1155 if (!read_uint (r, &record->n_labels))
/* Reject a count that is too large for the array allocation below. */
1157 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1159 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1163 record->labels = pool_nmalloc (r->pool, record->n_labels,
1164 sizeof *record->labels);
1165 for (i = 0; i < record->n_labels; i++)
1167 struct sfm_value_label *label = &record->labels[i];
1168 unsigned char label_len;
1171 if (!read_bytes (r, label->value, sizeof label->value))
1174 /* Read label length. */
1175 if (!read_bytes (r, &label_len, sizeof label_len))
/* The length byte and the label text together are padded to a multiple of
   8 bytes.  The length byte has already been consumed, so padded_len - 1
   bytes remain to be read; the label is then cut at its real length. */
1177 padded_len = ROUND_UP (label_len + 1, 8);
1179 /* Read label, padding. */
1180 label->label = pool_malloc (r->pool, padded_len + 1);
1181 if (!read_bytes (r, label->label, padded_len - 1))
1183 label->label[label_len] = '\0';
1186 /* Read record type of type 4 record. */
1187 if (!read_int (r, &type))
1191 sys_error (r, r->pos - 4,
1192 _("Variable index record (type 4) does not immediately "
1193 "follow value label record (type 3) as it should."));
1197 /* Read number of variables associated with value label from type 4
1199 if (!read_uint (r, &record->n_vars))
1201 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1203 sys_error (r, r->pos - 4,
1204 _("Number of variables associated with a value label (%u) "
1205 "is not between 1 and the number of variables (%zu)."),
1206 record->n_vars, r->n_vars);
1210 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1211 for (i = 0; i < record->n_vars; i++)
1212 if (!read_int (r, &record->vars[i]))
/* Reads the number of document lines and validates it.  A count of zero has
   its own branch; a negative count, or one for which
   DOC_LINE_LENGTH * n_lines could not be held in an int, is reported as an
   error.  Otherwise a struct sfm_document_record is allocated from the pool
   of R, DOC_LINE_LENGTH * n_lines bytes of text are read into it, and it is
   stored in R->document. */
1218 /* Reads a document record from R. Returns true if successful, false on
1221 read_document_record (struct sfm_reader *r)
1224 if (!read_int (r, &n_lines))
1226 else if (n_lines == 0)
1228 else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1230 sys_error (r, r->pos,
1231 _("Number of document lines (%d) "
1232 "must be greater than 0 and less than %d."),
1233 n_lines, INT_MAX / DOC_LINE_LENGTH);
1237 struct sfm_document_record *record;
1238 record = pool_malloc (r->pool, sizeof *record);
1239 record->pos = r->pos;
1240 record->n_lines = n_lines;
1241 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1242 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1245 r->document = record;
/* Begins reading an extension record: stores SUBTYPE and the current
   position of R in RECORD, then reads the size and count fields that
   follow.  A record whose size times count, plus one byte, reaches UINT_MAX
   is reported as too large. */
1250 read_extension_record_header (struct sfm_reader *r, int subtype,
1251 struct sfm_extension_record *record)
1253 record->subtype = subtype;
1254 record->pos = r->pos;
1255 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1258 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1259 allows an extra byte for a null terminator, used by some
1260 extension processing routines. */
1261 if (record->size != 0
1262 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1264 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
/* The TYPES table below lists each known subtype together with the size and
   count it is expected to have.  A 0 in either column means that field is
   not checked, and an entry with both columns 0 marks a subtype whose
   contents are ignored.  A mismatch in a checked field produces a warning.
   For a known subtype that passes the checks, the payload is read into
   pool memory with one extra trailing null byte.  An unknown subtype
   produces a warning that asks the user to report the file. */
1272 /* Reads an extension record from R into RECORD. */
1274 read_extension_record (struct sfm_reader *r, int subtype,
1275 struct sfm_extension_record **recordp)
1277 struct extension_record_type
1284 static const struct extension_record_type types[] =
1286 /* Implemented record types. */
1287 { EXT_INTEGER, 4, 8 },
1288 { EXT_FLOAT, 8, 3 },
1289 { EXT_MRSETS, 1, 0 },
1290 { EXT_PRODUCT_INFO, 1, 0 },
1291 { EXT_DISPLAY, 4, 0 },
1292 { EXT_LONG_NAMES, 1, 0 },
1293 { EXT_LONG_STRINGS, 1, 0 },
1294 { EXT_NCASES, 8, 2 },
1295 { EXT_FILE_ATTRS, 1, 0 },
1296 { EXT_VAR_ATTRS, 1, 0 },
1297 { EXT_MRSETS2, 1, 0 },
1298 { EXT_ENCODING, 1, 0 },
1299 { EXT_LONG_LABELS, 1, 0 },
1300 { EXT_LONG_MISSING, 1, 0 },
1302 /* Ignored record types. */
1303 { EXT_VAR_SETS, 0, 0 },
1305 { EXT_DATA_ENTRY, 0, 0 },
1306 { EXT_DATAVIEW, 0, 0 },
1309 const struct extension_record_type *type;
1310 struct sfm_extension_record *record;
1314 record = pool_malloc (r->pool, sizeof *record);
1315 if (!read_extension_record_header (r, subtype, record))
/* read_extension_record_header() has already rejected records for which
   this product, plus one, would reach UINT_MAX. */
1317 n_bytes = record->count * record->size;
1319 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1320 if (subtype == type->subtype)
1322 if (type->size > 0 && record->size != type->size)
1323 sys_warn (r, record->pos,
1324 _("Record type 7, subtype %d has bad size %u "
1325 "(expected %d)."), subtype, record->size, type->size);
1326 else if (type->count > 0 && record->count != type->count)
1327 sys_warn (r, record->pos,
1328 _("Record type 7, subtype %d has bad count %u "
1329 "(expected %d)."), subtype, record->count, type->count);
1330 else if (type->count == 0 && type->size == 0)
1332 /* Ignore this record. */
/* One byte beyond the payload is allocated and set to zero, so the data
   can be treated as a null-terminated string. */
1336 char *data = pool_malloc (r->pool, n_bytes + 1);
1337 data[n_bytes] = '\0';
1339 record->data = data;
1340 if (!read_bytes (r, record->data, n_bytes))
1349 sys_warn (r, record->pos,
1350 _("Unrecognized record type 7, subtype %d. For help, please "
1351 "send this file to %s and mention that you were using %s."),
1352 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
/* Pass over the N_BYTES of payload without storing them. */
1355 return skip_bytes (r, n_bytes);
/* Reads the header of an extension record of the given SUBTYPE into a local
   record and then skips count * size bytes of payload.  The result is the
   conjunction of the header read and the skip. */
1359 skip_extension_record (struct sfm_reader *r, int subtype)
1361 struct sfm_extension_record record;
1363 return (read_extension_record_header (r, subtype, &record)
1364 && skip_bytes (r, record.count * record.size));
/* Transfers the textual fields of HEADER into DICT and INFO.  Each field is
   recoded from the encoding of DICT into UTF-8: the file label goes into
   DICT, and the creation date, creation time, and product name go into
   INFO.  The label and product strings are recoded into the pool of R;
   the date and time are recoded with recode_string(). */
1368 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1369 struct any_read_info *info, struct dictionary *dict)
1371 const char *dict_encoding = dict_get_encoding (dict);
1372 struct substring product;
1373 struct substring label;
1376 /* Convert file label to UTF-8 and put it into DICT. */
1377 label = recode_substring_pool ("UTF-8", dict_encoding,
1378 ss_cstr (header->file_label), r->pool);
1379 ss_trim (&label, ss_cstr (" "));
/* Terminate the label at its trimmed length so that label.string can be
   passed on as a C string. */
1380 label.string[label.length] = '\0';
1381 fixed_label = fix_line_ends (label.string);
1382 dict_set_label (dict, fixed_label);
1385 /* Put creation date and time in UTF-8 into INFO. */
1386 info->creation_date = recode_string ("UTF-8", dict_encoding,
1387 header->creation_date, -1);
1388 info->creation_time = recode_string ("UTF-8", dict_encoding,
1389 header->creation_time, -1);
1391 /* Put product name into INFO, dropping eye-catcher string if present. */
1392 product = recode_substring_pool ("UTF-8", dict_encoding,
1393 ss_cstr (header->eye_catcher), r->pool);
1394 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1395 ss_trim (&product, ss_cstr (" "));
/* INFO receives its own copy of the product string. */
1396 info->product = ss_xstrdup (product);
/* Creates a variable of the given WIDTH in DICT under a name obtained from
   dict_make_unique_var_name(), using the asserting form of variable
   creation.
   NOTE(review): presumably the new variable is the return value -- the
   return statement is not visible here; confirm. */
1399 static struct variable *
1400 add_var_with_generated_name (struct dictionary *dict, int width)
1402 char *name = dict_make_unique_var_name (dict, NULL, NULL);
1403 struct variable *var = dict_create_var_assert (dict, name, width);
/* Walks the N_VAR_RECS entries of VAR_RECS and creates a variable in DICT
   for each one.  For every record this function:
   - recodes the name from the encoding of DICT into UTF-8 and cuts it at
     the first space;
   - rejects a width outside 0...255 with an error;
   - substitutes a generated name, with a warning, when the name is invalid,
     starts with $ or #, or duplicates an existing name;
   - sets the first short name to the final variable name;
   - applies the recoded variable label, the missing values, and the print
     and write formats;
   - checks that a string wider than 8 bytes is followed by the expected
     number of continuation records, which are recognized by a width of -1. */
1408 /* Reads a variable (type 2) record from R and adds the
1409 corresponding variable to DICT.
1410 Also skips past additional variable records for long string
1413 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1414 struct sfm_var_record *var_recs, size_t n_var_recs)
1416 const char *dict_encoding = dict_get_encoding (dict);
1417 struct sfm_var_record *rec;
1420 for (rec = var_recs; rec < &var_recs[n_var_recs];)
1426 name = recode_string_pool ("UTF-8", dict_encoding,
1427 rec->name, -1, r->pool);
1428 name[strcspn (name, " ")] = '\0';
1430 if (rec->width < 0 || rec->width > 255)
1432 sys_error (r, rec->pos,
1433 _("Bad width %d for variable %s."), rec->width, name);
1437 struct variable *var;
1438 if (!dict_id_is_valid (dict, name, false)
1439 || name[0] == '$' || name[0] == '#')
1441 var = add_var_with_generated_name (dict, rec->width);
1442 sys_warn (r, rec->pos, _("Renaming variable with invalid name "
1443 "`%s' to `%s'."), name, var_get_name (var));
1447 var = dict_create_var (dict, name, rec->width);
1450 var = add_var_with_generated_name (dict, rec->width);
1451 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1453 name, var_get_name (var));
1458 /* Set the short name the same as the long name (even if we renamed
1460 var_set_short_name (var, 0, var_get_name (var));
1462 /* Get variable label, if any. */
1467 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1468 rec->label, -1, r->pool);
1469 var_set_label (var, utf8_label);
1472 /* Set missing values. */
1473 if (rec->missing_value_code != 0)
1475 int width = var_get_width (var);
1476 struct missing_values mv;
1478 mv_init_pool (r->pool, &mv, width);
1479 if (var_is_numeric (var))
1481 bool has_range = rec->missing_value_code < 0;
1482 int n_discrete = (has_range
1483 ? rec->missing_value_code == -3
1484 : rec->missing_value_code);
1489 double low = parse_float (r, rec->missing, 0);
1490 double high = parse_float (r, rec->missing, 8);
1492 /* Deal with SPSS 21 change in representation. */
1496 mv_add_range (&mv, low, high);
1500 for (i = 0; i < n_discrete; i++)
1502 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1507 for (i = 0; i < rec->missing_value_code; i++)
1508 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1509 var_set_missing_values (var, &mv);
1513 parse_format_spec (r, rec->pos + 12, rec->print_format,
1514 PRINT_FORMAT, var, &n_warnings);
1515 parse_format_spec (r, rec->pos + 16, rec->write_format,
1516 WRITE_FORMAT, var, &n_warnings);
1518 /* Account for values.
1519 Skip long string continuation records, if any. */
1520 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1521 for (i = 1; i < n_values; i++)
1522 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1524 sys_error (r, rec->pos, _("Missing string continuation record."));
/* Decodes FORMAT with fmt_from_u32(), using the width of V, and on success
   installs the result as the print or write format of V according to WHICH.
   A FORMAT of 0 is ignored without comment.  Any other undecodable value
   increments *N_WARNINGS and, for the first max_warnings (8) occurrences,
   produces a warning at file position POS; the occurrence that reaches the
   limit is followed by a notice that further warnings are suppressed. */
1533 /* Translates the format spec from sysfile format to internal
1536 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1537 enum which_format which, struct variable *v,
1540 const int max_warnings = 8;
1543 if (fmt_from_u32 (format, var_get_width (v), false, &f))
1545 if (which == PRINT_FORMAT)
1546 var_set_print_format (v, &f);
1548 var_set_write_format (v, &f);
1550 else if (format == 0)
1552 /* Actually observed in the wild. No point in warning about it. */
1554 else if (++*n_warnings <= max_warnings)
1556 if (which == PRINT_FORMAT)
1557 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1559 var_get_name (v), var_get_width (v), format);
1561 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1563 var_get_name (v), var_get_width (v), format);
1565 if (*n_warnings == max_warnings)
1566 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
/* Adds the document text held in RECORD to DICT.  RECORD->documents is
   treated as RECORD->n_lines consecutive lines of DOC_LINE_LENGTH bytes
   each.  Every line is recoded from the encoding of DICT into UTF-8,
   stripped of trailing spaces, null-terminated, and appended to DICT. */
1571 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1575 for (p = record->documents;
1576 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1577 p += DOC_LINE_LENGTH)
1579 struct substring line;
/* NOTE(review): a null pool is passed here, unlike the other recoding calls
   in this file; how the result is released is not visible -- confirm. */
1581 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1582 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1583 ss_rtrim (&line, ss_cstr (" "));
1584 line.string[line.length] = '\0';
1586 dict_add_document_line (dict, line.string, false);
/* Fields are taken from RECORD->data at fixed byte offsets: the three
   version numbers at 0, 4, and 8, the floating-point representation code at
   16, and the integer representation code at 24.  The version numbers are
   stored in INFO.  A floating-point code that disagrees with the format R
   is using is an error; a disagreeing integer code is only a warning. */
1592 /* Parses record type 7, subtype 3. */
1594 parse_machine_integer_info (struct sfm_reader *r,
1595 const struct sfm_extension_record *record,
1596 struct any_read_info *info)
1598 int float_representation, expected_float_format;
1599 int integer_representation, expected_integer_format;
1601 /* Save version info. */
1602 info->version_major = parse_int (r, record->data, 0);
1603 info->version_minor = parse_int (r, record->data, 4);
1604 info->version_revision = parse_int (r, record->data, 8);
1606 /* Check floating point format. */
1607 float_representation = parse_int (r, record->data, 16);
/* Translate R->float_format into the code this record uses: 1 for either
   IEEE double byte order, 2 for FLOAT_Z_LONG, 3 for either VAX format. */
1608 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1609 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1610 expected_float_format = 1;
1611 else if (r->float_format == FLOAT_Z_LONG)
1612 expected_float_format = 2;
1613 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1614 expected_float_format = 3;
1617 if (float_representation != expected_float_format)
1619 sys_error (r, record->pos,
1620 _("Floating-point representation indicated by "
1621 "system file (%d) differs from expected (%d)."),
1622 float_representation, expected_float_format);
1626 /* Check integer format. */
1627 integer_representation = parse_int (r, record->data, 24);
/* Code 1 corresponds to INTEGER_MSB_FIRST and code 2 to
   INTEGER_LSB_FIRST. */
1628 if (r->integer_format == INTEGER_MSB_FIRST)
1629 expected_integer_format = 1;
1630 else if (r->integer_format == INTEGER_LSB_FIRST)
1631 expected_integer_format = 2;
1634 if (integer_representation != expected_integer_format)
1635 sys_warn (r, record->pos,
1636 _("Integer format indicated by system file (%d) "
1637 "differs from expected (%d)."),
1638 integer_representation, expected_integer_format);
/* Reads three floating-point values from RECORD->data at byte offsets 0, 8,
   and 16 and compares them with the SYSMIS, HIGHEST, and LOWEST constants.
   Each disagreement produces a warning that shows both values in %g and %a
   form.  Nothing is stored; the function only reports. */
1643 /* Parses record type 7, subtype 4. */
1645 parse_machine_float_info (struct sfm_reader *r,
1646 const struct sfm_extension_record *record)
1648 double sysmis = parse_float (r, record->data, 0);
1649 double highest = parse_float (r, record->data, 8);
1650 double lowest = parse_float (r, record->data, 16);
1652 if (sysmis != SYSMIS)
1653 sys_warn (r, record->pos,
1654 _("File specifies unexpected value %g (%a) as %s, "
1655 "instead of %g (%a)."),
1656 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1658 if (highest != HIGHEST)
1659 sys_warn (r, record->pos,
1660 _("File specifies unexpected value %g (%a) as %s, "
1661 "instead of %g (%a)."),
1662 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1664 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1665 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1666 appears in a context (missing values) where SYSMIS cannot. */
1667 if (lowest != LOWEST && lowest != SYSMIS)
1668 sys_warn (r, record->pos,
1669 _("File specifies unexpected value %g (%a) as %s, "
1670 "instead of %g (%a) or %g (%a)."),
1671 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
/* Opens RECORD as a text record, passes its entire text through
   fix_line_ends(), stores the result in INFO->product_ext, and closes the
   text record again. */
1674 /* Parses record type 7, subtype 10. */
1676 parse_extra_product_info (struct sfm_reader *r,
1677 const struct sfm_extension_record *record,
1678 struct any_read_info *info)
1680 struct text_record *text;
1682 text = open_text_record (r, record, true);
1683 info->product_ext = fix_line_ends (text_get_all (text));
1684 close_text_record (r, text);
/* Parses the text of RECORD into raw multiple response set descriptions,
   appending each one to R->mrsets; *ALLOCATED_MRSETS tracks the capacity of
   that array.  Each set is written as a name ending at an equals sign, a
   type letter (C, D, or E), type-specific fields, a counted label string,
   and a list of variable names separated by spaces and ended by a new-line.
   Malformed input is reported with warnings that give the text offset.
   Names are stored as found; decode_mrsets() later converts them. */
1687 /* Parses record type 7, subtype 7 or 19. */
1689 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1690 size_t *allocated_mrsets)
1692 struct text_record *text;
1694 text = open_text_record (r, record, false);
1697 struct sfm_mrset *mrset = NULL;
1698 size_t allocated_vars = 0;
1699 char delimiter = '4';
1701 /* Skip extra line feeds if present. */
1702 while (text_match (text, '\n'))
/* Make room for one more set and start from a zeroed description. */
1705 if (r->n_mrsets >= *allocated_mrsets)
1706 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1708 mrset = &r->mrsets[r->n_mrsets];
1709 memset(mrset, 0, sizeof *mrset);
1711 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1712 if (mrset->name == NULL)
/* The letter after the name selects the kind of set: C gives MRSET_MC, D
   gives MRSET_MD with categories from variable labels, and E gives MRSET_MD
   with categories from counted values. */
1715 if (text_match (text, 'C'))
1717 mrset->type = MRSET_MC;
1718 if (!text_match (text, ' '))
1720 sys_warn (r, record->pos,
1721 _("Missing space following `%c' at offset %zu "
1722 "in MRSETS record."), 'C', text_pos (text));
1726 else if (text_match (text, 'D'))
1728 mrset->type = MRSET_MD;
1729 mrset->cat_source = MRSET_VARLABELS;
1731 else if (text_match (text, 'E'))
1735 mrset->type = MRSET_MD;
1736 mrset->cat_source = MRSET_COUNTEDVALUES;
1737 if (!text_match (text, ' '))
1739 sys_warn (r, record->pos,
1740 _("Missing space following `%c' at offset %zu "
1741 "in MRSETS record."), 'E', text_pos (text));
/* For E sets a label source number follows: 11 requests that the label be
   taken from a variable label, 1 is accepted as is, and anything else is
   warned about. */
1745 number = text_get_token (text, ss_cstr (" "), NULL);
1747 sys_warn (r, record->pos,
1748 _("Missing label source value "
1749 "following `E' at offset %zu in MRSETS record."),
1751 else if (!strcmp (number, "11"))
1752 mrset->label_from_var_label = true;
1753 else if (strcmp (number, "1"))
1754 sys_warn (r, record->pos,
1755 _("Unexpected label source value following `E' "
1756 "at offset %zu in MRSETS record."),
1761 sys_warn (r, record->pos,
1762 _("Missing `C', `D', or `E' at offset %zu "
1763 "in MRSETS record."),
/* Only MD sets carry a counted value; every set carries a label. */
1768 if (mrset->type == MRSET_MD)
1770 mrset->counted = text_parse_counted_string (r, text);
1771 if (mrset->counted == NULL)
1775 mrset->label = text_parse_counted_string (r, text);
1776 if (mrset->label == NULL)
/* Collect variable names until the token that ends with a new-line. */
1784 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1787 if (delimiter != '\n')
1788 sys_warn (r, record->pos,
1789 _("Missing new-line parsing variable names "
1790 "at offset %zu in MRSETS record."),
1795 if (mrset->n_vars >= allocated_vars)
1796 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1798 sizeof *mrset->vars);
1799 mrset->vars[mrset->n_vars++] = var;
1801 while (delimiter != '\n');
1805 close_text_record (r, text);
/* Converts each raw set in R->mrsets into a struct mrset and adds it to
   DICT.  Set names, labels, and variable names are recoded from R->encoding
   into UTF-8.  A set with an invalid name is warned about.  Variable names
   are looked up in DICT; a name repeated within a set, or a variable whose
   type differs from the first accepted variable, is warned about.  A set
   that ends up with fewer than two variables is destroyed with a warning
   instead of being added. */
1809 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1811 const struct sfm_mrset *s;
1813 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1815 struct stringi_set var_names;
1816 struct mrset *mrset;
1821 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1822 if (!mrset_is_valid_name (name, dict_get_encoding (dict), false))
1824 sys_warn (r, -1, _("Invalid multiple response set name `%s'."),
1830 mrset = xzalloc (sizeof *mrset);
1832 mrset->type = s->type;
1833 mrset->cat_source = s->cat_source;
1834 mrset->label_from_var_label = s->label_from_var_label;
/* An empty label in the file leaves mrset->label null. */
1835 if (s->label[0] != '\0')
1836 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
/* VAR_NAMES records the names seen so far so that a repeated name can be
   detected.  The array is sized for every listed name, although fewer may
   be accepted. */
1838 stringi_set_init (&var_names);
1839 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1841 for (i = 0; i < s->n_vars; i++)
1843 struct variable *var;
1846 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1848 var = dict_lookup_var (dict, var_name);
1854 if (!stringi_set_insert (&var_names, var_name))
1857 _("MRSET %s contains duplicate variable name %s."),
1858 mrset->name, var_name);
/* When the set asks for its label to come from a variable label and has no
   label yet, the first labelled variable supplies it. */
1864 if (mrset->label == NULL && mrset->label_from_var_label
1865 && var_has_label (var))
1866 mrset->label = xstrdup (var_get_label (var));
1869 && var_get_type (var) != var_get_type (mrset->vars[0]))
1872 _("MRSET %s contains both string and "
1873 "numeric variables."), mrset->name);
/* Track the narrowest accepted variable; for MD sets this becomes the width
   of the counted value below. */
1876 width = MIN (width, var_get_width (var));
1878 mrset->vars[mrset->n_vars++] = var;
1881 if (mrset->n_vars < 2)
1883 if (mrset->n_vars == 0)
1884 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1886 sys_warn (r, -1, _("MRSET %s has only one variable."),
1888 mrset_destroy (mrset);
1889 stringi_set_destroy (&var_names);
/* For MD sets the counted value is either converted to a number with
   c_strtod() or copied as a space-padded string of WIDTH bytes. */
1893 if (mrset->type == MRSET_MD)
1895 mrset->width = width;
1896 value_init (&mrset->counted, width);
1898 mrset->counted.f = c_strtod (s->counted, NULL);
1900 value_copy_str_rpad (&mrset->counted, width,
1901 (const uint8_t *) s->counted, ' ');
1904 dict_add_mrset (dict, mrset);
1905 stringi_set_destroy (&var_names);
/* The record is accepted only when its count is exactly two or three times
   the number of variables in DICT; any other count produces a warning.
   Values are then read per variable, in dictionary order, as a measurement
   level, a display width when three values per variable are present, and an
   alignment.  A measure outside 1...3 or an alignment outside 0...2 produces
   a warning that default parameters are substituted. */
1909 /* Read record type 7, subtype 11, which specifies how variables
1910 should be displayed in GUI environments. */
1912 parse_display_parameters (struct sfm_reader *r,
1913 const struct sfm_extension_record *record,
1914 struct dictionary *dict)
1916 bool includes_width;
1917 bool warned = false;
1922 n_vars = dict_get_var_cnt (dict);
/* Three values per variable means that a display width is included; two
   means that it is not. */
1923 if (record->count == 3 * n_vars)
1924 includes_width = true;
1925 else if (record->count == 2 * n_vars)
1926 includes_width = false;
1929 sys_warn (r, record->pos,
1930 _("Extension 11 has bad count %u (for %zu variables)."),
1931 record->count, n_vars);
1936 for (i = 0; i < n_vars; ++i)
1938 struct variable *v = dict_get_var (dict, i);
1939 int measure, width, align;
1941 measure = parse_int (r, record->data, ofs);
1946 width = parse_int (r, record->data, ofs);
1952 align = parse_int (r, record->data, ofs);
1955 /* SPSS sometimes seems to set variables' measure to zero. */
1959 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1962 sys_warn (r, record->pos,
1963 _("Invalid variable display parameters for variable "
1964 "%zu (%s). Default parameters substituted."),
1965 i, var_get_name (v));
/* Measure 1 is nominal and 2 is ordinal; alignment 0 is left and 1 is
   right.  The remaining alternatives of both expressions are not visible
   here. */
1970 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1971 : measure == 2 ? MEASURE_ORDINAL
1973 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1974 : align == 1 ? ALIGN_RIGHT
1977 /* Older versions (SPSS 9.0) sometimes set the display
1978 width to zero. This causes confusion in the GUI, so
1979 only set the width if it is nonzero. */
1981 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping its short names.  The short
   names are first copied into a temporary array, with null entries kept as
   null; the rename is then attempted with dict_try_rename_var(), and a
   failure is reported as a duplicate long variable name at file position
   POS.  Finally each saved short name is set on VAR again and its copy is
   freed.  The short names are restored whether or not the rename
   succeeded. */
1986 rename_var_and_save_short_names (struct sfm_reader *r, off_t pos,
1987 struct dictionary *dict,
1988 struct variable *var, const char *new_name)
1990 size_t n_short_names;
1994 /* Renaming a variable may clear its short names, but we
1995 want to retain them, so we save them and re-set them
1997 n_short_names = var_get_short_name_cnt (var);
1998 short_names = xnmalloc (n_short_names, sizeof *short_names);
1999 for (i = 0; i < n_short_names; i++)
2001 const char *s = var_get_short_name (var, i);
2002 short_names[i] = xstrdup_if_nonnull (s);
2005 /* Set long name. */
2006 if (!dict_try_rename_var (dict, var, new_name))
2007 sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name);
2009 /* Restore short names. */
2010 for (i = 0; i < n_short_names; i++)
2012 var_set_short_name (var, i, short_names[i]);
2013 free (short_names[i]);
/* Two paths are visible below.  When there are no long variable names,
   every variable in DICT is renamed to the lowercase form of its current
   name.  Otherwise RECORD is read as a text record of pairs, each giving a
   variable and its long name.  A long name that fails dict_id_is_valid() or
   begins with $ or # is warned about; the rename itself goes through
   rename_var_and_save_short_names(), so that short names survive.
   NOTE(review): the test that selects the first path is not visible here. */
2018 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2019 to each short name. Modifies variable names in DICT accordingly. */
2021 parse_long_var_name_map (struct sfm_reader *r,
2022 const struct sfm_extension_record *record,
2023 struct dictionary *dict)
2025 struct text_record *text;
2026 struct variable *var;
2031 /* There are no long variable names. Use the short variable names,
2032 converted to lowercase, as the long variable names. */
2035 for (i = 0; i < dict_get_var_cnt (dict); i++)
2037 struct variable *var = dict_get_var (dict, i);
2040 new_name = utf8_to_lower (var_get_name (var));
2041 rename_var_and_save_short_names (r, -1, dict, var, new_name);
2048 /* Rename each of the variables, one by one. (In a correctly constructed
2049 system file, this cannot create any intermediate duplicate variable names,
2050 because all of the new variable names are longer than any of the old
2051 variable names and thus there cannot be any overlaps.) */
2052 text = open_text_record (r, record, true);
2053 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2055 /* Validate long name. */
2056 if (!dict_id_is_valid (dict, long_name, false)
2057 || long_name[0] == '$' || long_name[0] == '#')
2059 sys_warn (r, record->pos,
2060 _("Long variable mapping from %s to invalid "
2061 "variable name `%s'."),
2062 var_get_name (var), long_name);
2066 rename_var_and_save_short_names (r, record->pos, dict, var, long_name);
2068 close_text_record (r, text);
/* RECORD is read as a text record of pairs, each giving a variable and its
   real length as a decimal string.  A length outside 1...MAX_STRING is
   warned about, as is a length that needs only one segment.  A string
   whose segments would extend past the end of DICT is an error.  Each
   segment contributes its first short name to the variable and must have a
   width that matches the expected allocation width after both are rounded
   up to a multiple of 8.  The extra segment variables are then deleted and
   the first variable is widened to the full length.  Once all pairs are
   processed, the values of DICT are compacted. */
2071 /* Reads record type 7, subtype 14, which gives the real length
2072 of each very long string. Rearranges DICT accordingly. */
2074 parse_long_string_map (struct sfm_reader *r,
2075 const struct sfm_extension_record *record,
2076 struct dictionary *dict)
2078 struct text_record *text;
2079 struct variable *var;
2082 text = open_text_record (r, record, true);
2083 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2085 size_t idx = var_get_dict_index (var);
/* NOTE(review): strtol() is called without an end pointer, so trailing
   characters after the digits are not detected here. */
2091 length = strtol (length_s, NULL, 10);
2092 if (length < 1 || length > MAX_STRING)
2094 sys_warn (r, record->pos,
2095 _("%s listed as string of invalid length %s "
2096 "in very long string record."),
2097 var_get_name (var), length_s);
2101 /* Check segments. */
2102 segment_cnt = sfm_width_to_segments (length);
2103 if (segment_cnt == 1)
2105 sys_warn (r, record->pos,
2106 _("%s listed in very long string record with width %s, "
2107 "which requires only one segment."),
2108 var_get_name (var), length_s);
/* The segments are the variables at dictionary indexes IDX through
   IDX + SEGMENT_CNT - 1, so all of them must exist. */
2111 if (idx + segment_cnt > dict_get_var_cnt (dict))
2113 sys_error (r, record->pos,
2114 _("Very long string %s overflows dictionary."),
2115 var_get_name (var));
2119 /* Get the short names from the segments and check their
2121 for (i = 0; i < segment_cnt; i++)
2123 struct variable *seg = dict_get_var (dict, idx + i);
2124 int alloc_width = sfm_segment_alloc_width (length, i);
2125 int width = var_get_width (seg);
2128 var_set_short_name (var, i, var_get_short_name (seg, 0));
2129 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2131 sys_error (r, record->pos,
2132 _("Very long string with width %ld has segment %d "
2133 "of width %d (expected %d)."),
2134 length, i, width, alloc_width);
2138 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2139 var_set_width (var, length);
2141 close_text_record (r, text);
2142 dict_compact_values (dict);
/* Limit on the number of value label warnings, tested against the counter
   that value_label_warning() increments. */
2147 #define MAX_LABEL_WARNINGS 5
/* *N_LABEL_WARNINGS is incremented on every call, including calls made
   after the limit has been passed, so the caller can work out how many
   warnings went over MAX_LABEL_WARNINGS.  FORMAT and the variable arguments
   that follow it are passed to sys_msg() with message class MW.
   NOTE(review): presumably nothing is printed once the counter exceeds the
   limit -- the statement under that test is not visible here; confirm. */
2149 /* Displays a warning for offset OFFSET in the file. */
2151 value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings,
2152 const char *format, ...)
2154 if (++*n_label_warnings > MAX_LABEL_WARNINGS)
2159 va_start (args, format);
2160 sys_msg (r, offset, MW, format, args);
/* NOTE(review): this repeats, with the same value, the definition of
   MAX_LABEL_WARNINGS that precedes value_label_warning(); one of the two is
   redundant. */
2164 #define MAX_LABEL_WARNINGS 5
/* Applies one value label record to the variables that it names.
   The steps are:
   1. Recode every label of RECORD from the encoding of DICT into UTF-8.
   2. Translate the 1-based variable indexes of RECORD into variables
      through VAR_RECS, warning about an index outside 1...N_VAR_RECS or
      one whose record has no variable (a long string continuation).
   3. Warn if the variables are not all of the same type as the first one.
   4. For each variable, add every label.  A numeric value is decoded with
      parse_float(); a string value is copied for the width of the
      variable.  A label that cannot be added is reported as a duplicate,
      unless R->written_by_readstat is set.
   Warnings go through value_label_warning(), which counts them in
   *N_LABEL_WARNINGS.  The temporary arrays are returned to the pool of R
   at the end. */
2167 parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict,
2168 const struct sfm_var_record *var_recs,
2170 const struct sfm_value_label_record *record,
2171 int *n_label_warnings)
2174 = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2175 for (size_t i = 0; i < record->n_labels; i++)
2176 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2177 record->labels[i].label, -1,
/* VARS receives only the indexes that resolve to a real variable, so N_VARS
   may end up smaller than RECORD->n_vars. */
2180 struct variable **vars = pool_nmalloc (r->pool,
2181 record->n_vars, sizeof *vars);
2182 unsigned int n_vars = 0;
2183 for (size_t i = 0; i < record->n_vars; i++)
2185 int idx = record->vars[i];
2186 if (idx < 1 || idx > n_var_recs)
2188 value_label_warning (
2189 r, record->pos, n_label_warnings,
2190 _("Value label variable index %d not in valid range 1...%zu."),
2195 const struct sfm_var_record *rec = &var_recs[idx - 1];
2196 if (rec->var == NULL)
2198 value_label_warning (
2199 r, record->pos, n_label_warnings,
2200 _("Value label variable index %d "
2201 "refers to long string continuation."), idx);
2205 vars[n_vars++] = rec->var;
/* Every remaining variable is compared against the first one. */
2210 for (size_t i = 1; i < n_vars; i++)
2211 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2213 value_label_warning (
2214 r, record->pos, n_label_warnings,
2215 _("Variables associated with value label are not all of "
2216 "identical type. Variable %s is %s, but variable "
2218 var_get_name (vars[0]),
2219 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2220 var_get_name (vars[i]),
2221 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2225 for (size_t i = 0; i < n_vars; i++)
2227 struct variable *var = vars[i];
2228 int width = var_get_width (var);
2231 value_label_warning (
2232 r, record->pos, n_label_warnings,
2233 _("Value labels may not be added to long string "
2234 "variables (e.g. %s) using records types 3 and 4."),
2235 var_get_name (var));
2239 for (size_t j = 0; j < record->n_labels; j++)
2241 struct sfm_value_label *label = &record->labels[j];
2244 value_init (&value, width);
2246 value.f = parse_float (r, label->value, 0);
2248 memcpy (value.s, label->value, width);
2250 if (!var_add_value_label (var, &value, utf8_labels[j]))
2252 if (r->written_by_readstat)
2254 /* Ignore the problem. ReadStat is buggy and emits value
2255 labels whose values are longer than string variables'
2256 widths, that are identical in the actual width of the
2257 variable, e.g. both values "ABC123" and "ABC456" for a
2258 string variable with width 3. */
2260 else if (var_is_numeric (var))
2261 value_label_warning (r, record->pos, n_label_warnings,
2262 _("Duplicate value label for %g on %s."),
2263 value.f, var_get_name (var));
2265 value_label_warning (
2266 r, record->pos, n_label_warnings,
2267 _("Duplicate value label for `%.*s' on %s."),
2268 width, value.s, var_get_name (var));
2271 value_destroy (&value, width);
/* Return the temporary arrays and the recoded labels to the pool. */
2275 pool_free (r->pool, vars);
2276 for (size_t i = 0; i < record->n_labels; i++)
2277 pool_free (r->pool, utf8_labels[i]);
2278 pool_free (r->pool, utf8_labels);
/* Applies every value label record in R->labels to DICT through
   parse_one_value_label_set(), sharing a single warning counter between
   all of them.  When the counter ends above MAX_LABEL_WARNINGS, one final
   message reports how many warnings went over the limit. */
2282 parse_value_labels (struct sfm_reader *r, struct dictionary *dict)
2284 int n_label_warnings = 0;
2285 for (size_t i = 0; i < r->n_labels; i++)
2286 parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i],
2288 if (n_label_warnings > MAX_LABEL_WARNINGS)
2290 _("Suppressed %d additional warnings for value labels."),
2291 n_label_warnings - MAX_LABEL_WARNINGS);
/* Resolves the 1-based weight variable index IDX against VAR_RECS.  Three
   problems are warned about, all at header offset 76: an index outside
   1...N_VAR_RECS, an index whose record has no variable (a long string
   continuation), and a weight variable that is a string variable.  The
   first two warnings state that the file is treated as unweighted.
   NOTE(review): the return statements are not visible here; presumably the
   weight variable is returned on success and a null pointer otherwise --
   confirm. */
2294 static struct variable *
2295 parse_weight_var (struct sfm_reader *r,
2296 const struct sfm_var_record *var_recs, size_t n_var_recs,
2299 off_t offset = 76; /* Offset to variable index in header. */
2301 if (idx < 1 || idx > n_var_recs)
2303 sys_warn (r, offset,
2304 _("Weight variable index %d not in valid range 1...%zu. "
2305 "Treating file as unweighted."),
2310 const struct sfm_var_record *rec = &var_recs[idx - 1];
2311 if (rec->var == NULL)
2313 sys_warn (r, offset,
2314 _("Weight variable index %d refers to long string "
2315 "continuation. Treating file as unweighted."), idx);
2319 struct variable *weight_var = rec->var;
2320 if (!var_is_numeric (weight_var))
2322 sys_warn (r, offset, _("Ignoring string variable `%s' set "
2323 "as weighting variable."),
2324 var_get_name (weight_var));
/* Text layout handled here: each attribute is a key that ends at an opening
   parenthesis, followed by one or more values that each end at a new-line;
   a closing parenthesis after a value ends the attribute, and a slash after
   an attribute ends the whole set.  A value enclosed in single quotes is
   added without the quotes; an unquoted value is warned about and added
   unchanged.  An attribute is added to ATTRS only if it has at least one
   value, and a duplicate is warned about and destroyed. */
2331 /* Parses a set of custom attributes from TEXT into ATTRS.
2332 ATTRS may be a null pointer, in which case the attributes are
2333 read but discarded. */
2335 parse_attributes (struct sfm_reader *r, struct text_record *text,
2336 struct attrset *attrs)
2340 struct attribute *attr;
2344 /* Parse the key. */
2345 key = text_get_token (text, ss_cstr ("("), NULL);
2349 attr = attribute_create (key);
/* INDEX is the 1-based position of the value within the attribute and is
   used only in the messages below. */
2350 for (index = 1; ; index++)
2352 /* Parse the value. */
2356 value = text_get_token (text, ss_cstr ("\n"), NULL);
2359 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2364 length = strlen (value);
/* Drop the closing quote by overwriting it and the opening quote by
   starting one byte later. */
2365 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
2367 value[length - 1] = '\0';
2368 attribute_add_value (attr, value + 1);
2373 _("Attribute value %s[%d] is not quoted: %s."),
2375 attribute_add_value (attr, value);
2378 /* Was this the last value for this attribute? */
2379 if (text_match (text, ')'))
2382 if (attrs != NULL && attribute_get_n_values (attr) > 0)
2384 if (!attrset_try_add (attrs, attr))
2386 text_warn (r, text, _("Duplicate attribute %s."),
2387 attribute_get_name (attr));
2388 attribute_destroy (attr);
2392 attribute_destroy (attr);
2394 while (!text_match (text, '/'));
/* Opens RECORD as a text record, parses its attributes directly into the
   attribute set of DICT with parse_attributes(), and closes the text
   record. */
2397 /* Reads record type 7, subtype 17, which lists custom
2398 attributes on the data file. */
2400 parse_data_file_attributes (struct sfm_reader *r,
2401 const struct sfm_extension_record *record,
2402 struct dictionary *dict)
2404 struct text_record *text = open_text_record (r, record, true);
2405 parse_attributes (r, text, dict_get_attributes (dict));
2406 close_text_record (r, text);
/* The text of RECORD is a sequence of variable names, each ending at a
   colon and followed by an attribute set.  When the name lookup leaves VAR
   null, the attribute set is still parsed, but a null set is passed to
   parse_attributes(), so its contents are discarded. */
2409 /* Parses record type 7, subtype 18, which lists custom
2410 attributes on individual variables. */
2412 parse_variable_attributes (struct sfm_reader *r,
2413 const struct sfm_extension_record *record,
2414 struct dictionary *dict)
2416 struct text_record *text;
2417 struct variable *var;
2419 text = open_text_record (r, record, true);
2420 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2421 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2422 close_text_record (r, text);
/* Sets the role of each variable in DICT from its $@Role attribute.  For a
   variable that has the attribute with at least one value, the first value
   is converted with atoi() and translated into a role, which is then set
   on the variable.  The first invalid role is warned about by variable
   name; later ones are only counted in N_WARNINGS and summarized in a
   single closing message.
   NOTE(review): only the ROLE_PARTITION assignment of the translation is
   visible here; atoi() also cannot tell a non-numeric value from 0. */
2426 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2428 size_t n_warnings = 0;
2431 for (i = 0; i < dict_get_var_cnt (dict); i++)
2433 struct variable *var = dict_get_var (dict, i);
2434 struct attrset *attrs = var_get_attributes (var);
2435 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2436 if (attr != NULL && attribute_get_n_values (attr) > 0)
2438 int value = atoi (attribute_get_value (attr, 0));
2460 role = ROLE_PARTITION;
/* The post-increment means only the first invalid role is named. */
2469 if (n_warnings++ == 0)
2470 sys_warn (r, -1, _("Invalid role for variable %s."),
2471 var_get_name (var));
2474 var_set_role (var, role);
2479 sys_warn (r, -1, _("%zu other variables had invalid roles."),
/* Checks whether LENGTH bytes starting at offset OFS lie within the data of
   RECORD, whose total size is size * count bytes.  When they do not, a
   warning that the extension record ends unexpectedly is issued at the
   position just past the end of the record.  The test on LENGTH alone also
   rejects a LENGTH equal to the whole record.
   NOTE(review): presumably that first test also guards against OFS + LENGTH
   wrapping around for a huge LENGTH -- confirm. */
2484 check_overflow (struct sfm_reader *r,
2485 const struct sfm_extension_record *record,
2486 size_t ofs, size_t length)
2488 size_t end = record->size * record->count;
2489 if (length >= end || ofs + length > end)
2491 sys_warn (r, record->pos + end,
2492 _("Extension record subtype %d ends unexpectedly."),
2500 parse_long_string_value_labels (struct sfm_reader *r,
2501 const struct sfm_extension_record *record,
2502 struct dictionary *dict)
2504 const char *dict_encoding = dict_get_encoding (dict);
2505 size_t end = record->size * record->count;
2512 struct variable *var;
2517 /* Parse variable name length. */
2518 if (!check_overflow (r, record, ofs, 4))
2520 var_name_len = parse_int (r, record->data, ofs);
2523 /* Parse variable name, width, and number of labels. */
2524 if (!check_overflow (r, record, ofs, var_name_len)
2525 || !check_overflow (r, record, ofs, var_name_len + 8))
2527 var_name = recode_string_pool ("UTF-8", dict_encoding,
2528 (const char *) record->data + ofs,
2529 var_name_len, r->pool);
2530 width = parse_int (r, record->data, ofs + var_name_len);
2531 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2532 ofs += var_name_len + 8;
2534 /* Look up 'var' and validate. */
2535 var = dict_lookup_var (dict, var_name);
2537 sys_warn (r, record->pos + ofs,
2538 _("Ignoring long string value label record for "
2539 "unknown variable %s."), var_name);
2540 else if (var_is_numeric (var))
2542 sys_warn (r, record->pos + ofs,
2543 _("Ignoring long string value label record for "
2544 "numeric variable %s."), var_name);
2547 else if (width != var_get_width (var))
2549 sys_warn (r, record->pos + ofs,
2550 _("Ignoring long string value label record for variable "
2551 "%s because the record's width (%d) does not match the "
2552 "variable's width (%d)."),
2553 var_name, width, var_get_width (var));
2558 value_init_pool (r->pool, &value, width);
2559 for (i = 0; i < n_labels; i++)
2561 size_t value_length, label_length;
2562 bool skip = var == NULL;
2564 /* Parse value length. */
2565 if (!check_overflow (r, record, ofs, 4))
2567 value_length = parse_int (r, record->data, ofs);
2571 if (!check_overflow (r, record, ofs, value_length))
2575 if (value_length == width)
2576 memcpy (value.s, (const uint8_t *) record->data + ofs, width);
2579 sys_warn (r, record->pos + ofs,
2580 _("Ignoring long string value label %zu for "
2581 "variable %s, with width %d, that has bad value "
2583 i, var_get_name (var), width, value_length);
2587 ofs += value_length;
2589 /* Parse label length. */
2590 if (!check_overflow (r, record, ofs, 4))
2592 label_length = parse_int (r, record->data, ofs);
2596 if (!check_overflow (r, record, ofs, label_length))
2602 label = recode_string_pool ("UTF-8", dict_encoding,
2603 (const char *) record->data + ofs,
2604 label_length, r->pool);
2605 if (!var_add_value_label (var, &value, label))
2606 sys_warn (r, record->pos + ofs,
2607 _("Duplicate value label for `%.*s' on %s."),
2608 width, value.s, var_get_name (var));
2609 pool_free (r->pool, label);
2611 ofs += label_length;
2617 parse_long_string_missing_values (struct sfm_reader *r,
2618 const struct sfm_extension_record *record,
2619 struct dictionary *dict)
2621 const char *dict_encoding = dict_get_encoding (dict);
2622 size_t end = record->size * record->count;
2627 struct missing_values mv;
2629 struct variable *var;
2630 int n_missing_values;
2634 /* Parse variable name length. */
2635 if (!check_overflow (r, record, ofs, 4))
2637 var_name_len = parse_int (r, record->data, ofs);
2640 /* Parse variable name. */
2641 if (!check_overflow (r, record, ofs, var_name_len)
2642 || !check_overflow (r, record, ofs, var_name_len + 1))
2644 var_name = recode_string_pool ("UTF-8", dict_encoding,
2645 (const char *) record->data + ofs,
2646 var_name_len, r->pool);
2647 ofs += var_name_len;
2649 /* Parse number of missing values. */
2650 n_missing_values = ((const uint8_t *) record->data)[ofs];
2651 if (n_missing_values < 1 || n_missing_values > 3)
2652 sys_warn (r, record->pos + ofs,
2653 _("Long string missing values record says variable %s "
2654 "has %d missing values, but only 1 to 3 missing values "
2656 var_name, n_missing_values);
2659 /* Look up 'var' and validate. */
2660 var = dict_lookup_var (dict, var_name);
2662 sys_warn (r, record->pos + ofs,
2663 _("Ignoring long string missing value record for "
2664 "unknown variable %s."), var_name);
2665 else if (var_is_numeric (var))
2667 sys_warn (r, record->pos + ofs,
2668 _("Ignoring long string missing value record for "
2669 "numeric variable %s."), var_name);
2674 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2675 for (i = 0; i < n_missing_values; i++)
2677 size_t value_length;
2679 /* Parse value length. */
2680 if (!check_overflow (r, record, ofs, 4))
2682 value_length = parse_int (r, record->data, ofs);
2686 if (!check_overflow (r, record, ofs, value_length))
2690 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2692 sys_warn (r, record->pos + ofs,
2693 _("Ignoring long string missing value %zu for variable "
2694 "%s, with width %d, that has bad value width %zu."),
2695 i, var_get_name (var), var_get_width (var),
2697 ofs += value_length;
2700 var_set_missing_values (var, &mv);
/* Helpers used while reading cases, declared ahead of their
   definitions. */
static void partial_record (struct sfm_reader *r);

static void read_error (struct casereader *reader,
                        const struct sfm_reader *sfm);

static bool read_case_number (struct sfm_reader *r, double *d);
static int read_case_string (struct sfm_reader *r, uint8_t *s, size_t length);
static int read_opcode (struct sfm_reader *r);
static bool read_compressed_number (struct sfm_reader *r, double *d);
static int read_compressed_string (struct sfm_reader *r, uint8_t *dst);
static int read_whole_strings (struct sfm_reader *r, uint8_t *s,
                               size_t length);
static bool skip_whole_strings (struct sfm_reader *r, size_t length);
2718 /* Reads and returns one case from READER's file. Returns a null
2719 pointer if not successful. */
2720 static struct ccase *
2721 sys_file_casereader_read (struct casereader *reader, void *r_)
2723 struct sfm_reader *r = r_;
2728 if (r->error || !r->sfm_var_cnt)
2731 c = case_create (r->proto);
2733 for (i = 0; i < r->sfm_var_cnt; i++)
2735 struct sfm_var *sv = &r->sfm_vars[i];
2736 union value *v = case_data_rw_idx (c, sv->case_index);
2738 if (sv->var_width == 0)
2739 retval = read_case_number (r, &v->f);
2742 retval = read_case_string (r, v->s + sv->offset, sv->segment_width);
2745 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2747 sys_error (r, r->pos, _("File ends in partial string value."));
2759 if (r->case_cnt != -1)
2760 read_error (reader, r);
2765 /* Issues an error that R ends in a partial record. */
2767 partial_record (struct sfm_reader *r)
2769 sys_error (r, r->pos, _("File ends in partial case."));
2772 /* Issues an error that an unspecified error occurred SFM, and
2775 read_error (struct casereader *r, const struct sfm_reader *sfm)
2777 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2778 casereader_force_error (r);
2781 /* Reads a number from R and stores its value in *D.
2782 If R is compressed, reads a compressed number;
2783 otherwise, reads a number in the regular way.
2784 Returns true if successful, false if end of file is
2785 reached immediately. */
2787 read_case_number (struct sfm_reader *r, double *d)
2789 if (r->compression == ANY_COMP_NONE)
2792 if (!try_read_bytes (r, number, sizeof number))
2794 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2798 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.  Always reads a multiple of 8
   bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
   discarded without being written to S.  Reads compressed strings if R is
   compressed.  Returns 1 if successful, 0 if end of file is reached
   immediately, or -1 for some kind of error. */
static int
read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
{
  size_t whole = ROUND_DOWN (length, 8);
  size_t partial = length % 8;

  if (whole)
    {
      int retval = read_whole_strings (r, s, whole);
      if (retval != 1)
        return retval;
    }

  if (partial)
    {
      /* The final, incomplete 8-byte unit is read into a bounce buffer so
         that only its first PARTIAL bytes reach S. */
      uint8_t bounce[8];
      int retval = read_whole_strings (r, bounce, sizeof bounce);
      if (retval == -1)
        return -1;
      else if (!retval)
        {
          /* End of file here is "immediate" only if nothing was read
             before it. */
          if (whole)
            {
              partial_record (r);
              return -1;
            }
          return 0;
        }
      memcpy (s + whole, bounce, partial);
    }

  return 1;
}
2840 /* Reads and returns the next compression opcode from R. */
2842 read_opcode (struct sfm_reader *r)
2844 assert (r->compression != ANY_COMP_NONE);
2848 if (r->opcode_idx >= sizeof r->opcodes)
2851 int retval = try_read_compressed_bytes (r, r->opcodes,
2857 opcode = r->opcodes[r->opcode_idx++];
2864 /* Reads a compressed number from R and stores its value in D.
2865 Returns true if successful, false if end of file is
2866 reached immediately. */
2868 read_compressed_number (struct sfm_reader *r, double *d)
2870 int opcode = read_opcode (r);
2878 return read_compressed_float (r, d);
2881 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2882 if (!r->corruption_warning)
2884 r->corruption_warning = true;
2885 sys_warn (r, r->pos,
2886 _("Possible compressed data corruption: "
2887 "compressed spaces appear in numeric field."));
2896 *d = opcode - r->bias;
2903 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2905 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2910 opcode = read_opcode (r);
2918 retval = read_compressed_bytes (r, dst, 8);
2919 return retval == 1 ? 1 : -1;
2922 memset (dst, ' ', 8);
2927 double value = opcode - r->bias;
2928 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2931 /* This has actually been seen "in the wild". The submitter of the
2932 file that showed that the contents decoded as spaces, but they
2933 were at the end of the field so it's possible that the null
2934 bytes just acted as null terminators. */
2936 else if (!r->corruption_warning)
2938 r->corruption_warning = true;
2939 sys_warn (r, r->pos,
2940 _("Possible compressed data corruption: "
2941 "string contains compressed integer (opcode %d)."),
2949 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2950 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2951 end of file is reached immediately, or -1 for some kind of error. */
2953 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2955 assert (length % 8 == 0);
2956 if (r->compression == ANY_COMP_NONE)
2957 return try_read_bytes (r, s, length);
2962 for (ofs = 0; ofs < length; ofs += 8)
2964 int retval = read_compressed_string (r, s + ofs);
/* Skips LENGTH string bytes from R.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, but that's only because the
   current caller never needs more than that many bytes.)
   Returns true if successful, false if end of file is
   reached immediately or if reading fails. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];

  /* The buffer holds exactly the documented maximum, so LENGTH equal to
     its size is acceptable. */
  assert (length <= sizeof buffer);

  /* read_whole_strings() returns 1, 0 or -1.  Compare against 1 explicitly:
     converting the int straight to bool would turn the -1 error result
     into "true". */
  return read_whole_strings (r, buffer, length) == 1;
}
2993 /* Helpers for reading records that contain structured text
2996 /* Maximum number of warnings to issue for a single text
2998 #define MAX_TEXT_WARNINGS 5
3003 struct substring buffer; /* Record contents. */
3004 off_t start; /* Starting offset in file. */
3005 size_t pos; /* Current position in buffer. */
3006 int n_warnings; /* Number of warnings issued or suppressed. */
3007 bool recoded; /* Recoded into UTF-8? */
3010 static struct text_record *
3011 open_text_record (struct sfm_reader *r,
3012 const struct sfm_extension_record *record,
3013 bool recode_to_utf8)
3015 struct text_record *text;
3016 struct substring raw;
3018 text = pool_alloc (r->pool, sizeof *text);
3019 raw = ss_buffer (record->data, record->size * record->count);
3020 text->start = record->pos;
3021 text->buffer = (recode_to_utf8
3022 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
3025 text->n_warnings = 0;
3026 text->recoded = recode_to_utf8;
3031 /* Closes TEXT, frees its storage, and issues a final warning
3032 about suppressed warnings if necessary. */
3034 close_text_record (struct sfm_reader *r, struct text_record *text)
3036 if (text->n_warnings > MAX_TEXT_WARNINGS)
3037 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
3038 text->n_warnings - MAX_TEXT_WARNINGS);
3040 pool_free (r->pool, ss_data (text->buffer));
3043 /* Reads a variable=value pair from TEXT.
3044 Looks up the variable in DICT and stores it into *VAR.
3045 Stores a null-terminated value into *VALUE. */
3047 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
3048 struct text_record *text,
3049 struct variable **var, char **value)
3053 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
3056 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
3060 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
3061 ss_buffer ("\t\0", 2));
3069 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3070 struct text_record *text, struct substring delimiters,
3071 struct variable **var)
3075 name = text_get_token (text, delimiters, NULL);
3079 *var = dict_lookup_var (dict, name);
3083 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3090 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3091 struct text_record *text, struct substring delimiters,
3092 struct variable **var)
3094 char *short_name = text_get_token (text, delimiters, NULL);
3095 if (short_name == NULL)
3098 *var = dict_lookup_var (dict, short_name);
3100 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3105 /* Displays a warning for the current file position, limiting the
3106 number to MAX_TEXT_WARNINGS for TEXT. */
3108 text_warn (struct sfm_reader *r, struct text_record *text,
3109 const char *format, ...)
3111 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3115 va_start (args, format);
3116 sys_msg (r, text->start + text->pos, MW, format, args);
3122 text_get_token (struct text_record *text, struct substring delimiters,
3125 struct substring token;
3128 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3130 if (delimiter != NULL)
3131 *delimiter = ss_data (text->buffer)[text->pos-1];
3135 end = &ss_data (token)[ss_length (token)];
3136 if (delimiter != NULL)
3139 return ss_data (token);
3142 /* Reads a integer value expressed in decimal, then a space, then a string that
3143 consists of exactly as many bytes as specified by the integer, then a space,
3144 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3145 buffer (so the caller should not free the string). */
3147 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3155 while (text->pos < text->buffer.length)
3157 int c = text->buffer.string[text->pos];
3158 if (c < '0' || c > '9')
3160 n = (n * 10) + (c - '0');
3163 if (text->pos >= text->buffer.length || start == text->pos)
3165 sys_warn (r, text->start,
3166 _("Expecting digit at offset %zu in MRSETS record."),
3171 if (!text_match (text, ' '))
3173 sys_warn (r, text->start,
3174 _("Expecting space at offset %zu in MRSETS record."),
3179 if (text->pos + n > text->buffer.length)
3181 sys_warn (r, text->start,
3182 _("%zu-byte string starting at offset %zu "
3183 "exceeds record length %zu."),
3184 n, text->pos, text->buffer.length);
3188 s = &text->buffer.string[text->pos];
3191 sys_warn (r, text->start,
3192 _("Expecting space at offset %zu following %zu-byte string."),
3202 text_match (struct text_record *text, char c)
3204 if (text->pos >= text->buffer.length)
3207 if (text->buffer.string[text->pos] == c)
3216 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3217 inside the TEXT's string. */
3219 text_pos (const struct text_record *text)
3225 text_get_all (const struct text_record *text)
3227 return text->buffer.string;
3232 /* Displays a corruption message. */
3234 sys_msg (struct sfm_reader *r, off_t offset,
3235 int class, const char *format, va_list args)
3239 ds_init_empty (&text);
3241 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3242 fh_get_file_name (r->fh), (long long int) offset);
3244 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3245 ds_put_vformat (&text, format, args);
3248 .category = msg_class_to_category (class),
3249 .severity = msg_class_to_severity (class),
3250 .text = ds_cstr (&text),
3255 /* Displays a warning for offset OFFSET in the file. */
3257 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3261 va_start (args, format);
3262 sys_msg (r, offset, MW, format, args);
3266 /* Displays an error for the current file position and marks it as in an error
3269 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3273 va_start (args, format);
3274 sys_msg (r, offset, ME, format, args);
3280 /* Reads BYTE_CNT bytes into BUF.
3281 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3282 Returns -1 if an I/O error or a partial read occurs.
3283 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3286 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3287 void *buf, size_t byte_cnt)
3289 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3290 r->pos += bytes_read;
3291 if (bytes_read == byte_cnt)
3293 else if (ferror (r->file))
3295 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3298 else if (!eof_is_ok || bytes_read != 0)
3300 sys_error (r, r->pos, _("Unexpected end of file."));
/* Reads BYTE_CNT bytes into BUF.
   Returns true if successful.
   Returns false upon I/O error or if end-of-file is encountered; an error
   is reported in either case. */
static bool
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  return read_bytes_internal (r, false, buf, byte_cnt) == 1;
}
/* Reads BYTE_CNT bytes into BUF.
   Returns 1 if exactly BYTE_CNT bytes are successfully read.
   Returns 0 if an immediate end-of-file is encountered.
   Returns -1 if an I/O error or a partial read occurs.

   Callers must compare the result against 1: -1 is nonzero and so tests
   as "true". */
static int
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  return read_bytes_internal (r, true, buf, byte_cnt);
}
3326 /* Reads a 32-bit signed integer from R and stores its value in host format in
3327 *X. Returns true if successful, otherwise false. */
3329 read_int (struct sfm_reader *r, int *x)
3332 if (read_bytes (r, integer, sizeof integer) != 1)
3334 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Reads a 32-bit integer from R and stores its value, converted to
   unsigned, in *X.  Returns true if successful, otherwise false (in which
   case *X is set to 0). */
static bool
read_uint (struct sfm_reader *r, unsigned int *x)
{
  /* Initialized because read_int() leaves Y untouched on failure, and Y is
     copied to *X unconditionally. */
  int y = 0;
  bool ok;

  ok = read_int (r, &y);
  *x = y;
  return ok;
}
3349 /* Reads a 64-bit signed integer from R and returns its value in
3352 read_int64 (struct sfm_reader *r, long long int *x)
3355 if (read_bytes (r, integer, sizeof integer) != 1)
3357 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Reads a 64-bit integer from R and stores its value, converted to
   unsigned, in *X.  Returns true if successful, otherwise false (in which
   case *X is set to 0). */
static bool
read_uint64 (struct sfm_reader *r, unsigned long long int *x)
{
  /* Initialized because read_int64() leaves Y untouched on failure, and Y
     is copied to *X unconditionally. */
  long long int y = 0;
  bool ok;

  ok = read_int64 (r, &y);
  *x = y;
  return ok;
}
3375 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3377 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3381 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3383 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
/* Reads exactly SIZE - 1 bytes into BUFFER
   and stores a null byte into BUFFER[SIZE - 1].
   SIZE must be positive.  Returns true if successful; on failure the null
   terminator is not written. */
static bool
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  bool ok;

  assert (size > 0);
  ok = read_bytes (r, buffer, size - 1);
  if (ok)
    buffer[size - 1] = '\0';
  return ok;
}
/* Skips BYTES bytes forward in R by reading and discarding them in chunks
   of at most 1024 bytes.  Returns true if successful, false if a read
   fails. */
static bool
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  while (bytes > 0)
    {
      char buffer[1024];
      size_t chunk = MIN (sizeof buffer, bytes);
      if (!read_bytes (r, buffer, chunk))
        return false;
      bytes -= chunk;
    }

  return true;
}
/* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
   been replaced by LFs.

   (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
   files that use CR-only line ends in the file label and extra product
   info.)

   The copy is never longer than S, so a buffer of strlen (S) + 1 bytes
   always suffices. */
static char *
fix_line_ends (const char *s)
{
  char *dst, *d;

  d = dst = xmalloc (strlen (s) + 1);
  while (*s != '\0')
    {
      if (*s == '\r')
        {
          /* Emit one LF for the CR, swallowing a directly following LF. */
          s++;
          if (*s == '\n')
            s++;
          *d++ = '\n';
        }
      else
        *d++ = *s++;
    }
  *d = '\0';

  return dst;
}
/* Declared here because read_zheader() calls it ahead of its
   definition. */
static bool
read_ztrailer (struct sfm_reader *r,
               long long int zheader_ofs,
               long long int ztrailer_len);
3451 zalloc (voidpf pool_, uInt items, uInt size)
3453 struct pool *pool = pool_;
3455 return (!size || xalloc_oversized (items, size)
3457 : pool_malloc (pool, items * size));
3461 zfree (voidpf pool_, voidpf address)
3463 struct pool *pool = pool_;
3465 pool_free (pool, address);
3469 read_zheader (struct sfm_reader *r)
3472 long long int zheader_ofs;
3473 long long int ztrailer_ofs;
3474 long long int ztrailer_len;
3476 if (!read_int64 (r, &zheader_ofs)
3477 || !read_int64 (r, &ztrailer_ofs)
3478 || !read_int64 (r, &ztrailer_len))
3481 if (zheader_ofs != pos)
3483 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3484 "(expected %#llx)."),
3485 zheader_ofs, (long long int) pos);
3489 if (ztrailer_ofs < r->pos)
3491 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3496 if (ztrailer_len < 24 || ztrailer_len % 24)
3498 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3502 r->ztrailer_ofs = ztrailer_ofs;
3503 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3506 if (r->zin_buf == NULL)
3508 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3509 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3510 r->zstream.next_in = NULL;
3511 r->zstream.avail_in = 0;
3514 r->zstream.zalloc = zalloc;
3515 r->zstream.zfree = zfree;
3516 r->zstream.opaque = r->pool;
3518 return open_zstream (r);
3522 seek (struct sfm_reader *r, off_t offset)
3524 if (fseeko (r->file, offset, SEEK_SET))
3525 sys_error (r, 0, _("%s: seek failed (%s)."),
3526 fh_get_file_name (r->fh), strerror (errno));
3530 /* Performs some additional consistency checks on the ZLIB compressed data
3533 read_ztrailer (struct sfm_reader *r,
3534 long long int zheader_ofs,
3535 long long int ztrailer_len)
3537 long long int expected_uncmp_ofs;
3538 long long int expected_cmp_ofs;
3541 unsigned int block_size;
3542 unsigned int n_blocks;
3546 if (fstat (fileno (r->file), &s))
3548 sys_error (r, 0, _("%s: stat failed (%s)."),
3549 fh_get_file_name (r->fh), strerror (errno));
3553 if (!S_ISREG (s.st_mode))
3555 /* We can't seek to the trailer and then back to the data in this file,
3556 so skip doing extra checks. */
3560 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3561 sys_warn (r, r->pos,
3562 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3563 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3565 seek (r, r->ztrailer_ofs);
3567 /* Read fixed header from ZLIB data trailer. */
3568 if (!read_int64 (r, &bias))
3570 if (-bias != r->bias)
3572 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3573 "file header bias (%.2f)."),
3578 if (!read_int64 (r, &zero))
3581 sys_warn (r, r->pos,
3582 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3584 if (!read_uint (r, &block_size))
3586 if (block_size != ZBLOCK_SIZE)
3587 sys_warn (r, r->pos,
3588 _("ZLIB trailer specifies unexpected %u-byte block size."),
3591 if (!read_uint (r, &n_blocks))
3593 if (n_blocks != (ztrailer_len - 24) / 24)
3595 sys_error (r, r->pos,
3596 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3598 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3602 expected_uncmp_ofs = zheader_ofs;
3603 expected_cmp_ofs = zheader_ofs + 24;
3604 for (i = 0; i < n_blocks; i++)
3606 off_t desc_ofs = r->pos;
3607 unsigned long long int uncompressed_ofs;
3608 unsigned long long int compressed_ofs;
3609 unsigned int uncompressed_size;
3610 unsigned int compressed_size;
3612 if (!read_uint64 (r, &uncompressed_ofs)
3613 || !read_uint64 (r, &compressed_ofs)
3614 || !read_uint (r, &uncompressed_size)
3615 || !read_uint (r, &compressed_size))
3618 if (uncompressed_ofs != expected_uncmp_ofs)
3620 sys_error (r, desc_ofs,
3621 _("ZLIB block descriptor %u reported uncompressed data "
3622 "offset %#llx, when %#llx was expected."),
3623 i, uncompressed_ofs, expected_uncmp_ofs);
3627 if (compressed_ofs != expected_cmp_ofs)
3629 sys_error (r, desc_ofs,
3630 _("ZLIB block descriptor %u reported compressed data "
3631 "offset %#llx, when %#llx was expected."),
3632 i, compressed_ofs, expected_cmp_ofs);
3636 if (i < n_blocks - 1)
3638 if (uncompressed_size != block_size)
3639 sys_warn (r, desc_ofs,
3640 _("ZLIB block descriptor %u reported block size %#x, "
3641 "when %#x was expected."),
3642 i, uncompressed_size, block_size);
3646 if (uncompressed_size > block_size)
3647 sys_warn (r, desc_ofs,
3648 _("ZLIB block descriptor %u reported block size %#x, "
3649 "when at most %#x was expected."),
3650 i, uncompressed_size, block_size);
3653 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3654 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3655 This code checks for an expansion of more than 14.3% plus 11
3657 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3659 sys_error (r, desc_ofs,
3660 _("ZLIB block descriptor %u reports compressed size %u "
3661 "and uncompressed size %u."),
3662 i, compressed_size, uncompressed_size);
3666 expected_uncmp_ofs += uncompressed_size;
3667 expected_cmp_ofs += compressed_size;
3670 if (expected_cmp_ofs != r->ztrailer_ofs)
3672 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3673 "would be expected from block descriptors."),
3674 r->ztrailer_ofs, expected_cmp_ofs);
3678 seek (r, zheader_ofs + 24);
3683 open_zstream (struct sfm_reader *r)
3687 r->zout_pos = r->zout_end = 0;
3688 error = inflateInit (&r->zstream);
3691 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3699 close_zstream (struct sfm_reader *r)
3703 error = inflateEnd (&r->zstream);
3706 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3714 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3716 uint8_t *buf = buf_;
3725 /* Use already inflated data if there is any. */
3726 if (r->zout_pos < r->zout_end)
3728 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3729 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3738 /* We need to inflate some more data.
3739 Get some more input data if we don't have any. */
3740 if (r->zstream.avail_in == 0)
3742 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3747 int retval = try_read_bytes (r, r->zin_buf, n);
3750 r->zstream.avail_in = n;
3751 r->zstream.next_in = r->zin_buf;
3755 /* Inflate the (remaining) input data. */
3756 r->zstream.avail_out = ZOUT_BUF_SIZE;
3757 r->zstream.next_out = r->zout_buf;
3758 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3760 r->zout_end = r->zstream.next_out - r->zout_buf;
3761 if (r->zout_end == 0)
3763 if (error != Z_STREAM_END)
3765 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3769 else if (!close_zstream (r) || !open_zstream (r))
3774 /* Process the output data and ignore 'error' for now. ZLIB will
3775 present it to us again on the next inflate() call. */
3781 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3783 if (r->compression == ANY_COMP_SIMPLE)
3784 return read_bytes (r, buf, byte_cnt);
3787 int retval = read_bytes_zlib (r, buf, byte_cnt);
3789 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3795 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3797 if (r->compression == ANY_COMP_SIMPLE)
3798 return try_read_bytes (r, buf, byte_cnt);
3800 return read_bytes_zlib (r, buf, byte_cnt);
3803 /* Reads a 64-bit floating-point number from R and returns its
3804 value in host format. */
3806 read_compressed_float (struct sfm_reader *r, double *d)
3810 if (!read_compressed_bytes (r, number, sizeof number))
3813 *d = float_get_double (r->float_format, number);
3817 static const struct casereader_class sys_file_casereader_class =
3819 sys_file_casereader_read,
3820 sys_file_casereader_destroy,
3825 const struct any_reader_class sys_file_reader_class =
3827 N_("SPSS System File"),