1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/ll.h"
49 #include "libpspp/message.h"
50 #include "libpspp/misc.h"
51 #include "libpspp/pool.h"
52 #include "libpspp/str.h"
53 #include "libpspp/stringi-set.h"
55 #include "gl/c-strtod.h"
56 #include "gl/c-ctype.h"
57 #include "gl/inttostr.h"
58 #include "gl/localcharset.h"
59 #include "gl/minmax.h"
60 #include "gl/unlocked-io.h"
61 #include "gl/xalloc.h"
62 #include "gl/xalloc-oversized.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) (msgid)
71 /* subtypes 0-2 unknown */
72 EXT_INTEGER = 3, /* Machine integer info. */
73 EXT_FLOAT = 4, /* Machine floating-point info. */
74 EXT_VAR_SETS = 5, /* Variable sets. */
75 EXT_DATE = 6, /* DATE. */
76 EXT_MRSETS = 7, /* Multiple response sets. */
77 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
78 /* subtype 9 unknown */
79 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
80 EXT_DISPLAY = 11, /* Variable display parameters. */
81 /* subtype 12 unknown */
82 EXT_LONG_NAMES = 13, /* Long variable names. */
83 EXT_LONG_STRINGS = 14, /* Long strings. */
84 /* subtype 15 unknown */
85 EXT_NCASES = 16, /* Extended number of cases. */
86 EXT_FILE_ATTRS = 17, /* Data file attributes. */
87 EXT_VAR_ATTRS = 18, /* Variable attributes. */
88 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
89 EXT_ENCODING = 20, /* Character encoding. */
90 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
91 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
92 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
95 /* Fields from the top-level header record, as filled in by read_header(). */
96 struct sfm_header_record
98 char magic[5]; /* First 4 bytes of file, then null. */
99 int weight_idx; /* 0 if unweighted, otherwise a var index. */
100 int nominal_case_size; /* Number of var positions.  read_header() stores
   -1 here when the value in the file is negative or
   greater than INT_MAX / 16. */
102 /* These correspond to the members of struct any_read_info or a dictionary
103 but in the system file's encoding rather than ASCII. */
104 char creation_date[10]; /* "dd mmm yy". */
105 char creation_time[9]; /* "hh:mm:ss". */
106 char eye_catcher[61]; /* Eye-catcher string, then product name. */
107 char file_label[65]; /* File label. */
/* Raw contents of one variable (type 2) record, filled in by
   read_variable_record(). */
110 struct sfm_var_record
117 int missing_value_code; /* Missing value indicator from the record.
   0 means that no missing values follow it. */
120 struct variable *var;
/* One value together with its label text, as stored by
   read_value_label_record(). */
123 struct sfm_value_label
/* A value label (type 3) record together with the variable indexes from the
   type 4 record that follows it, filled in by read_value_label_record(). */
129 struct sfm_value_label_record
132 struct sfm_value_label *labels; /* n_labels entries from the reader pool. */
133 unsigned int n_labels; /* Number of labels, as read from the file. */
/* A document (type 6) record as stored by read_document_record(): its file
   position, its number of lines, and the raw bytes of those lines. */
139 struct sfm_document_record
148 const char *name; /* Name. */
149 const char *label; /* Human-readable label for group. */
150 enum mrset_type type; /* Group type. */
151 const char **vars; /* Constituent variables' names. */
152 size_t n_vars; /* Number of constituent variables. */
155 enum mrset_md_cat_source cat_source; /* Source of category labels. */
156 bool label_from_var_label; /* 'label' taken from variable label? */
157 const char *counted; /* Counted value, as string. */
/* One extension (type 7) record, as produced by read_extension_record().
   read_record() keeps these either in the extensions[] array of the reader,
   indexed by subtype, or (for subtype 18) on its var_attrs list. */
160 struct sfm_extension_record
162 struct ll ll; /* In struct sfm_reader 'var_attrs' list. */
163 int subtype; /* Record subtype. */
164 off_t pos; /* Starting offset in file. */
165 unsigned int size; /* Size of data elements. */
166 unsigned int count; /* Number of data elements. */
167 void *data; /* Contents. */
170 /* System file reader.  sfm_open() creates one, read_dictionary() fills in
   the raw records, and sfm_decode() turns them into a dictionary. */
173 struct any_reader any_reader; /* Embedded base; see sfm_reader_cast(). */
175 /* Resource tracking. */
176 struct pool *pool; /* All system file state. */
179 struct any_read_info info;
180 struct sfm_header_record header;
181 struct sfm_var_record *vars; /* Type 2 records, grown by read_record(). */
183 struct sfm_value_label_record *labels; /* Type 3 records, grown likewise. */
185 struct sfm_document_record *document; /* Null until a type 6 record is read. */
186 struct sfm_mrset *mrsets;
188 struct sfm_extension_record *extensions[32]; /* Indexed by record subtype. */
189 struct ll_list var_attrs; /* Contains "struct sfm_extension_record"s. */
192 struct file_handle *fh; /* File handle. */
193 struct fh_lock *lock; /* Mutual exclusion for file handle. */
194 FILE *file; /* File stream. */
195 off_t pos; /* Position in file. */
196 bool error; /* I/O or corruption error? */
197 struct caseproto *proto; /* Format of output cases. */
200 enum integer_format integer_format; /* On-disk integer format. */
201 enum float_format float_format; /* On-disk floating point format. */
202 struct sfm_var *sfm_vars; /* Variables. */
203 size_t sfm_var_cnt; /* Number of variables. */
204 int case_cnt; /* Number of cases.  sfm_decode() passes CASENUMBER_MAX to
   the case reader when this is -1. */
205 const char *encoding; /* String encoding. */
206 bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */
209 enum any_compression compression; /* Set by read_header(). */
210 double bias; /* Compression bias, usually 100.0. */
211 uint8_t opcodes[8]; /* Current block of opcodes. */
212 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
213 bool corruption_warning; /* Warned about possible corruption? */
215 /* ZLIB decompression. */
216 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
217 #define ZIN_BUF_SIZE 4096
218 uint8_t *zin_buf; /* Inflation input buffer. */
219 #define ZOUT_BUF_SIZE 16384
220 uint8_t *zout_buf; /* Inflation output buffer. */
221 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
222 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
223 z_stream zstream; /* ZLIB inflater. */
226 static const struct casereader_class sys_file_casereader_class;
/* Returns the struct sfm_reader that embeds R_ as its any_reader member.
   Asserts that R_ belongs to sys_file_reader_class. */
228 static struct sfm_reader *
229 sfm_reader_cast (const struct any_reader *r_)
231 assert (r_->klass == &sys_file_reader_class);
232 return UP_CAST (r_, struct sfm_reader, any_reader);
235 static bool sfm_close (struct any_reader *);
237 static void sys_msg (struct sfm_reader *r, off_t, int class,
238 const char *format, va_list args)
239 PRINTF_FORMAT (4, 0);
240 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
241 PRINTF_FORMAT (3, 4);
242 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
243 PRINTF_FORMAT (3, 4);
245 static bool read_bytes (struct sfm_reader *, void *, size_t)
247 static int try_read_bytes (struct sfm_reader *, void *, size_t)
249 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
250 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
251 static bool read_int64 (struct sfm_reader *, long long int *)
253 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
255 static bool read_string (struct sfm_reader *, char *, size_t)
257 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
259 /* ZLIB compressed data handling. */
260 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
261 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
262 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
263 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
265 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
267 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
269 static bool read_compressed_float (struct sfm_reader *, double *)
272 static char *fix_line_ends (const char *);
274 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
275 static double parse_float (const struct sfm_reader *,
276 const void *data, size_t ofs);
278 static bool read_variable_record (struct sfm_reader *,
279 struct sfm_var_record *);
280 static bool read_value_label_record (struct sfm_reader *,
281 struct sfm_value_label_record *);
282 static bool read_document_record (struct sfm_reader *);
283 static bool read_extension_record (struct sfm_reader *, int subtype,
284 struct sfm_extension_record **);
285 static bool skip_extension_record (struct sfm_reader *, int subtype);
287 static struct text_record *open_text_record (
288 struct sfm_reader *, const struct sfm_extension_record *,
289 bool recode_to_utf8);
290 static void close_text_record (struct sfm_reader *,
291 struct text_record *);
292 static bool read_variable_to_value_pair (struct sfm_reader *,
294 struct text_record *,
295 struct variable **var, char **value);
296 static void text_warn (struct sfm_reader *r, struct text_record *text,
297 const char *format, ...) PRINTF_FORMAT (3, 4);
298 static char *text_get_token (struct text_record *,
299 struct substring delimiters, char *delimiter);
300 static bool text_match (struct text_record *, char c);
301 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
302 struct text_record *,
303 struct substring delimiters,
305 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
306 struct text_record *,
307 struct substring delimiters,
309 static const char *text_parse_counted_string (struct sfm_reader *,
310 struct text_record *);
311 static size_t text_pos (const struct text_record *);
312 static const char *text_get_all (const struct text_record *);
314 /* Dictionary reader. */
322 static bool read_dictionary (struct sfm_reader *);
323 static bool read_record (struct sfm_reader *, int type,
324 size_t *allocated_vars, size_t *allocated_labels);
325 static bool read_header (struct sfm_reader *, struct any_read_info *,
326 struct sfm_header_record *);
327 static void parse_header (struct sfm_reader *,
328 const struct sfm_header_record *,
329 struct any_read_info *, struct dictionary *);
330 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
331 struct sfm_var_record *, size_t n);
332 static void parse_format_spec (struct sfm_reader *, off_t pos,
333 unsigned int format, enum which_format,
334 struct variable *, int *format_warning_cnt);
335 static void parse_document (struct dictionary *, struct sfm_document_record *);
336 static void parse_display_parameters (struct sfm_reader *,
337 const struct sfm_extension_record *,
338 struct dictionary *);
339 static bool parse_machine_integer_info (struct sfm_reader *,
340 const struct sfm_extension_record *,
341 struct any_read_info *);
342 static void parse_machine_float_info (struct sfm_reader *,
343 const struct sfm_extension_record *);
344 static void parse_extra_product_info (struct sfm_reader *,
345 const struct sfm_extension_record *,
346 struct any_read_info *);
347 static void parse_mrsets (struct sfm_reader *,
348 const struct sfm_extension_record *,
349 size_t *allocated_mrsets);
350 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
351 static void parse_long_var_name_map (struct sfm_reader *,
352 const struct sfm_extension_record *,
353 struct dictionary *);
354 static bool parse_long_string_map (struct sfm_reader *,
355 const struct sfm_extension_record *,
356 struct dictionary *);
357 static void parse_value_labels (struct sfm_reader *, struct dictionary *);
358 static struct variable *parse_weight_var (struct sfm_reader *,
359 const struct sfm_var_record *, size_t n_var_recs,
361 static void parse_data_file_attributes (struct sfm_reader *,
362 const struct sfm_extension_record *,
363 struct dictionary *);
364 static void parse_variable_attributes (struct sfm_reader *,
365 const struct sfm_extension_record *,
366 struct dictionary *);
367 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
368 static void parse_long_string_value_labels (struct sfm_reader *,
369 const struct sfm_extension_record *,
370 struct dictionary *);
371 static void parse_long_string_missing_values (
372 struct sfm_reader *, const struct sfm_extension_record *,
373 struct dictionary *);
375 /* Frees the strings inside INFO: its creation_date, creation_time, product,
   and product_ext members. */
377 any_read_info_destroy (struct any_read_info *info)
381 free (info->creation_date);
382 free (info->creation_time);
383 free (info->product);
384 free (info->product_ext);
388 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
389 successful, otherwise NULL.

   The reader is allocated zeroed and given its own pool.  After the file is
   locked and opened, read_dictionary() reads the raw dictionary records, and
   any multiple response set records (EXT_MRSETS, EXT_MRSETS2) are parsed
   with parse_mrsets(). */
390 static struct any_reader *
391 sfm_open (struct file_handle *fh)
393 size_t allocated_mrsets = 0;
394 struct sfm_reader *r;
396 /* Create and initialize reader. */
397 r = xzalloc (sizeof *r);
398 r->any_reader.klass = &sys_file_reader_class;
399 r->pool = pool_create ();
/* Register R with its own pool, presumably so that destroying the pool also
   frees R -- confirm against pool_register(). */
400 pool_register (r->pool, free, r);
/* No opcodes are buffered yet: an index equal to the array size means that
   none are left. */
402 r->opcode_idx = sizeof r->opcodes;
403 ll_init (&r->var_attrs);
405 /* TRANSLATORS: this fragment will be interpolated into
406 messages in fh_lock() that identify types of files. */
407 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
411 r->file = fn_open (fh, "rb");
414 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
415 fh_get_file_name (r->fh), strerror (errno));
419 if (!read_dictionary (r))
/* Both multiple response set subtypes share one allocation counter. */
422 if (r->extensions[EXT_MRSETS] != NULL)
423 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
425 if (r->extensions[EXT_MRSETS2] != NULL)
426 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
428 return &r->any_reader;
/* NOTE(review): presumably the shared failure path for the checks above --
   confirm. */
432 sfm_close (&r->any_reader);
/* Reads the raw dictionary of R.  First reads the file header with
   read_header(), then reads an integer record type and hands it to
   read_record() (presumably repeated for every dictionary record --
   confirm).  Finally skips 4 bytes and, for ZLIB-compressed files, reads the
   ZLIB header with read_zheader(). */
437 read_dictionary (struct sfm_reader *r)
439 size_t allocated_vars;
440 size_t allocated_labels;
442 if (!read_header (r, &r->info, &r->header))
446 allocated_labels = 0;
451 if (!read_int (r, &type))
455 if (!read_record (r, type, &allocated_vars, &allocated_labels))
459 if (!skip_bytes (r, 4))
462 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
/* Reads the body of one dictionary record from R, given its TYPE.

   Variable records are appended to r->vars and value label records to
   r->labels; those arrays grow through *ALLOCATED_VARS and *ALLOCATED_LABELS
   when full.  A second document record is an error.  Extension (type 7)
   records are dispatched on their subtype.  A type 4 record on its own and
   any unrecognized type are reported with sys_error(). */
469 read_record (struct sfm_reader *r, int type,
470 size_t *allocated_vars, size_t *allocated_labels)
477 if (r->n_vars >= *allocated_vars)
478 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
480 return read_variable_record (r, &r->vars[r->n_vars++]);
483 if (r->n_labels >= *allocated_labels)
484 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
486 return read_value_label_record (r, &r->labels[r->n_labels++]);
489 /* A Type 4 record is always immediately after a type 3 record,
490 so the code for type 3 records reads the type 4 record too. */
491 sys_error (r, r->pos, _("Misplaced type 4 record."));
495 if (r->document != NULL)
497 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
500 return read_document_record (r);
/* Extension record: the subtype must fit the r->extensions[] array. */
503 if (!read_int (r, &subtype))
506 || subtype >= sizeof r->extensions / sizeof *r->extensions)
509 _("Unrecognized record type 7, subtype %d. For help, "
510 "please send this file to %s and mention that you were "
512 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
513 return skip_extension_record (r, subtype);
515 else if (subtype == 18)
517 /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin"
518 put each variable attribute into a separate record with subtype
519 18. I'm surprised that SPSS puts up with this. */
520 struct sfm_extension_record *ext;
521 bool ok = read_extension_record (r, subtype, &ext);
523 ll_push_tail (&r->var_attrs, &ext->ll);
/* For every other subtype only one record is kept: a repeat is reported and
   then skipped. */
526 else if (r->extensions[subtype] != NULL)
529 _("Record type 7, subtype %d found here has the same "
530 "type as the record found near offset 0x%llx. For "
531 "help, please send this file to %s and mention that "
532 "you were using %s."),
533 subtype, (long long int) r->extensions[subtype]->pos,
534 PACKAGE_BUGREPORT, PACKAGE_STRING);
535 return skip_extension_record (r, subtype);
538 return read_extension_record (r, subtype, &r->extensions[subtype]);
541 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
548 /* Returns the character encoding obtained from R, or a null pointer if R
549 doesn't have an indication of its character encoding.

   Sources are consulted in this order: the EXT_ENCODING record, the code
   page stored in the EXT_INTEGER record, and finally the file magic number,
   which is compared against EBCDIC_MAGIC. */
551 sfm_get_encoding (const struct sfm_reader *r)
553 /* The EXT_ENCODING record is the best way to determine dictionary
555 if (r->extensions[EXT_ENCODING])
556 return r->extensions[EXT_ENCODING]->data;
558 /* But EXT_INTEGER is better than nothing as a fallback. */
559 if (r->extensions[EXT_INTEGER])
/* The code page is parsed from offset 7 * 4 of the record data. */
561 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
562 const char *encoding;
571 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
572 respectively. However, many files have character code 2 but data
573 which are clearly not ASCII. Therefore, ignore these values. */
580 encoding = sys_get_encoding_from_codepage (codepage);
581 if (encoding != NULL)
587 /* If the file magic number is EBCDIC then its character data is too. */
588 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
/* Accumulator for sfm_get_strings(): parallel arrays of titles, strings, and
   identifier flags that add_string__() grows from a pool. */
594 struct get_strings_aux
/* Stores STRING, TITLE, and ID in the next slot of the parallel arrays in
   AUX.  When the arrays are full they are first reallocated from the pool of
   AUX to 2 * (allocated + 1) elements.  STRING is copied into the pool;
   TITLE is stored as passed in. */
605 add_string__ (struct get_strings_aux *aux,
606 const char *string, bool id, char *title)
608 if (aux->n >= aux->allocated)
610 aux->allocated = 2 * (aux->allocated + 1);
611 aux->titles = pool_realloc (aux->pool, aux->titles,
612 aux->allocated * sizeof *aux->titles);
613 aux->strings = pool_realloc (aux->pool, aux->strings,
614 aux->allocated * sizeof *aux->strings);
615 aux->ids = pool_realloc (aux->pool, aux->ids,
616 aux->allocated * sizeof *aux->ids);
619 aux->titles[aux->n] = title;
620 aux->strings[aux->n] = pool_strdup (aux->pool, string);
621 aux->ids[aux->n] = id;
/* Adds free-form STRING to AUX (identifier flag false).  Its title is built
   by formatting TITLE and the following arguments, printf-style, into the
   pool of AUX. */
625 static void PRINTF_FORMAT (3, 4)
626 add_string (struct get_strings_aux *aux,
627 const char *string, const char *title, ...)
631 va_start (args, title);
632 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
/* Adds identifier ID to AUX (identifier flag true).  Its title is built by
   formatting TITLE and the following arguments, printf-style, into the pool
   of AUX. */
636 static void PRINTF_FORMAT (3, 4)
637 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
641 va_start (args, title);
642 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
646 /* Retrieves significant string data from R in its raw format, to allow the
647 caller to try to detect the encoding in use.

   The strings gathered are, in order: variable names, variable labels,
   value labels, the header strings (creation date, creation time, product,
   file label), the extra product info record if present, document lines,
   and the name, label, and counted value of multiple response sets.

649 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
650 and *STRINGSP to an array of N elements allocated from POOL. For each I in
651 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
652 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
653 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
656 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
657 char ***titlesp, bool **idsp, char ***stringsp)
659 struct sfm_reader *r = sfm_reader_cast (r_);
660 const struct sfm_mrset *mrset;
661 struct get_strings_aux aux;
673 for (i = 0; i < r->n_vars; i++)
674 if (r->vars[i].width != -1)
675 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
678 for (i = 0; i < r->n_vars; i++)
679 if (r->vars[i].width != -1)
682 if (r->vars[i].label)
683 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
688 for (i = 0; i < r->n_labels; i++)
689 for (j = 0; j < r->labels[i].n_labels; j++)
690 add_string (&aux, r->labels[i].labels[j].label,
691 _("Value Label %zu"), k++);
693 add_string (&aux, r->header.creation_date, _("Creation Date"));
694 add_string (&aux, r->header.creation_time, _("Creation Time"));
695 add_string (&aux, r->header.eye_catcher, _("Product"));
696 add_string (&aux, r->header.file_label, _("File Label"));
698 if (r->extensions[EXT_PRODUCT_INFO])
699 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
700 _("Extra Product Info"));
706 for (i = 0; i < r->document->n_lines; i++)
710 memcpy (line, r->document->documents + i * 80, 80);
713 add_string (&aux, line, _("Document Line %zu"), i + 1);
717 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
719 size_t mrset_idx = mrset - r->mrsets + 1;
721 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
723 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
725 /* Skip the variables because they ought to be duplicates. */
728 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
732 /* data file attributes */
733 /* variable attributes */
735 /* long string value labels */
736 /* long string missing values */
738 *titlesp = aux.titles;
740 *stringsp = aux.strings;
744 /* Decodes the dictionary read from R, saving it into *DICTP. Character
745 strings in R are decoded using ENCODING, or an encoding obtained from R if
746 ENCODING is null, or the locale encoding if R specifies no encoding.
748 If INFOP is non-null, then it receives additional info about the system
749 file, which the caller must eventually free with any_read_info_destroy()
750 when it is no longer needed.
752 This function consumes R. The caller must not use it again later, even to
753 destroy it with sfm_close().

   The order of the parsing steps below matters: value labels and the weight
   variable are resolved before very long strings are merged, records that
   use short names are decoded before variables are renamed, and records
   that use long names are parsed after the renaming. */
754 static struct casereader *
755 sfm_decode (struct any_reader *r_, const char *encoding,
756 struct dictionary **dictp, struct any_read_info *infop)
758 struct sfm_reader *r = sfm_reader_cast (r_);
759 struct dictionary *dict;
761 if (encoding == NULL)
763 encoding = sfm_get_encoding (r);
764 if (encoding == NULL)
766 sys_warn (r, -1, _("This system file does not indicate its own "
767 "character encoding. Using default encoding "
768 "%s. For best results, specify an encoding "
769 "explicitly. Use SYSFILE INFO with "
770 "ENCODING=\"DETECT\" to analyze the possible "
773 encoding = locale_charset ();
/* From here on r->encoding is the encoding string held by the dictionary. */
777 dict = dict_create (encoding);
778 r->encoding = dict_get_encoding (dict);
780 /* These records don't use variables at all. */
781 if (r->document != NULL)
782 parse_document (dict, r->document);
784 if (r->extensions[EXT_INTEGER] != NULL
785 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
788 if (r->extensions[EXT_FLOAT] != NULL)
789 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
791 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
792 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
794 if (r->extensions[EXT_FILE_ATTRS] != NULL)
795 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
797 parse_header (r, &r->header, &r->info, dict);
799 /* Parse the variable records, the basis of almost everything else. */
800 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
803 /* Parse value labels and the weight variable immediately after the variable
804 records. These records use indexes into var_recs[], so we must parse them
805 before those indexes become invalidated by very long string variables. */
806 parse_value_labels (r, dict);
807 if (r->header.weight_idx != 0)
808 dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars,
809 r->header.weight_idx));
811 if (r->extensions[EXT_DISPLAY] != NULL)
812 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
814 /* The following records use short names, so they need to be parsed before
815 parse_long_var_name_map() changes short names to long names. */
816 decode_mrsets (r, dict);
818 if (r->extensions[EXT_LONG_STRINGS] != NULL
819 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
822 /* Now rename variables to their long names. */
823 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
825 /* The following records use long names, so they need to follow renaming. */
826 if (!ll_is_empty (&r->var_attrs))
828 struct sfm_extension_record *ext;
829 ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs)
830 parse_variable_attributes (r, ext, dict);
832 /* Roles use the $@Role attribute. */
833 assign_variable_roles (r, dict);
835 if (r->extensions[EXT_LONG_LABELS] != NULL)
836 parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
837 if (r->extensions[EXT_LONG_MISSING] != NULL)
838 parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
841 /* Warn if the actual amount of data per case differs from the
842 amount that the header claims. SPSS version 13 gets this
843 wrong when very long strings are involved, so don't warn in
845 if (r->header.nominal_case_size > 0
846 && r->header.nominal_case_size != r->n_vars
847 && r->info.version_major != 13)
848 sys_warn (r, -1, _("File header claims %d variable positions but "
849 "%zu were read from file."),
850 r->header.nominal_case_size, r->n_vars);
852 /* Create an index of dictionary variable widths for
853 sfm_read_case to use. We cannot use the `struct variable's
854 from the dictionary we created, because the caller owns the
855 dictionary and may destroy or modify its variables. */
856 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
857 pool_register (r->pool, free, r->sfm_vars);
858 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
/* NOTE(review): presumably clears r->info after its contents were handed to
   *INFOP, so that sfm_close() does not free them again -- confirm. */
864 memset (&r->info, 0, sizeof r->info);
/* A case count of -1 is reported to the case reader as CASENUMBER_MAX. */
867 return casereader_create_sequential
869 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
870 &sys_file_casereader_class, r);
879 /* Closes R, which should have been returned by sfm_open() but not already
880 closed with sfm_decode() or this function.
   The file is closed with fn_close(), and a failure there is reported with
   msg().  The strings in r->info are then freed with
   any_read_info_destroy() and the pool of R is destroyed.
881 Returns true if an I/O error has occurred on READER, false
884 sfm_close (struct any_reader *r_)
886 struct sfm_reader *r = sfm_reader_cast (r_);
891 if (fn_close (r->fh, r->file) == EOF)
893 msg (ME, _("Error closing system file `%s': %s."),
894 fh_get_file_name (r->fh), strerror (errno));
900 any_read_info_destroy (&r->info);
905 pool_destroy (r->pool);
910 /* Destroys READER by closing R_, the struct sfm_reader behind it, with
   sfm_close().  READER itself is not used. */
912 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
914 struct sfm_reader *r = r_;
915 sfm_close (&r->any_reader);
918 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
919 a negative errno value if there is an error reading FILE.

   Seeks to the start of FILE and compares its first 4 bytes against
   ASCII_MAGIC, ASCII_ZMAGIC, and EBCDIC_MAGIC. */
921 sfm_detect (FILE *file)
925 if (fseek (file, 0, SEEK_SET) != 0)
927 if (fread (magic, 4, 1, file) != 1)
/* A short read without a stream error means FILE is not a system file. */
928 return ferror (file) ? -errno : 0;
931 return (!strcmp (ASCII_MAGIC, magic)
932 || !strcmp (ASCII_ZMAGIC, magic)
933 || !strcmp (EBCDIC_MAGIC, magic));
936 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
937 except for the string fields in *INFO, which parse_header() will initialize
938 later once the file's encoding is known.

   Along the way this identifies the integer and floating-point formats of
   the file, its compression type, and its compression bias, and stores them
   in R. */
940 read_header (struct sfm_reader *r, struct any_read_info *info,
941 struct sfm_header_record *header)
943 uint8_t raw_layout_code[4];
948 if (!read_string (r, header->magic, sizeof header->magic)
949 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
/* Remember whether the eye-catcher names the ReadStat library. */
951 r->written_by_readstat = strstr (header->eye_catcher,
952 "https://github.com/WizardMac/ReadStat");
954 if (!strcmp (ASCII_MAGIC, header->magic)
955 || !strcmp (EBCDIC_MAGIC, header->magic))
957 else if (!strcmp (ASCII_ZMAGIC, header->magic))
961 sys_error (r, 0, _("This is not an SPSS system file."));
965 /* Identify integer format. */
966 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
/* The layout code must decode as 2 or 3 in a big-endian or little-endian
   integer format. */
968 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
970 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
972 || (r->integer_format != INTEGER_MSB_FIRST
973 && r->integer_format != INTEGER_LSB_FIRST))
975 sys_error (r, 64, _("This is not an SPSS system file."));
979 if (!read_int (r, &header->nominal_case_size))
/* Treat a negative or implausibly large case size as unknown (-1). */
982 if (header->nominal_case_size < 0
983 || header->nominal_case_size > INT_MAX / 16)
984 header->nominal_case_size = -1;
986 if (!read_int (r, &compressed))
991 r->compression = ANY_COMP_NONE;
992 else if (compressed == 1)
993 r->compression = ANY_COMP_SIMPLE;
994 else if (compressed != 0)
/* NOTE(review): unlike the other messages in this function, the two
   compression messages below are not marked for translation with _(). */
996 sys_error (r, 0, "System file header has invalid compression "
997 "value %d.", compressed);
1003 if (compressed == 2)
1004 r->compression = ANY_COMP_ZLIB;
1007 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1008 "compression value %d.", compressed);
1013 if (!read_int (r, &header->weight_idx))
1016 if (!read_int (r, &r->case_cnt))
1018 if (r->case_cnt > INT_MAX / 2)
1021 /* Identify floating-point format and obtain compression bias. */
1022 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1024 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1026 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1028 if (memcmp (raw_bias, zero_bias, 8))
1029 sys_warn (r, r->pos - 8,
1030 _("Compression bias is not the usual "
1031 "value of 100, or system file uses unrecognized "
1032 "floating-point format."));
1035 /* Some software is known to write all-zeros to this
1036 field. Such software also writes floating-point
1037 numbers in the format that we expect by default
1038 (it seems that all software most likely does, in
1039 reality), so don't warn in this case. */
/* Fall back to IEEE double in the byte order of the integer format. */
1042 if (r->integer_format == INTEGER_MSB_FIRST)
1043 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1045 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1047 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1049 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1050 || !read_string (r, header->creation_time, sizeof header->creation_time)
1051 || !read_string (r, header->file_label, sizeof header->file_label)
1052 || !skip_bytes (r, 3))
/* Copy the non-string results into *INFO. */
1055 info->integer_format = r->integer_format;
1056 info->float_format = r->float_format;
1057 info->compression = r->compression;
1058 info->case_cnt = r->case_cnt;
1063 /* Reads a variable (type 2) record from R into RECORD.

   RECORD is zeroed first.  The fixed fields (width, label flag, missing
   value code, print and write formats, name) are read in that order,
   followed by the optional variable label and the optional missing values.
   The label is allocated from the pool of R. */
1065 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1067 int has_variable_label;
1069 memset (record, 0, sizeof *record);
1071 record->pos = r->pos;
1072 if (!read_int (r, &record->width)
1073 || !read_int (r, &has_variable_label)
1074 || !read_int (r, &record->missing_value_code)
1075 || !read_int (r, &record->print_format)
1076 || !read_int (r, &record->write_format)
1077 || !read_string (r, record->name, sizeof record->name))
1080 if (has_variable_label == 1)
1082 enum { MAX_LABEL_LEN = 65536 };
1083 unsigned int len, read_len;
1085 if (!read_uint (r, &len))
1088 /* Read up to MAX_LABEL_LEN bytes of label. */
1089 read_len = MIN (MAX_LABEL_LEN, len);
1090 record->label = pool_malloc (r->pool, read_len + 1);
1091 if (!read_string (r, record->label, read_len + 1))
1094 /* Skip unread label bytes. */
1095 if (!skip_bytes (r, len - read_len))
1098 /* Skip label padding up to multiple of 4 bytes. */
1099 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1102 else if (has_variable_label != 0)
1104 sys_error (r, record->pos,
1105 _("Variable label indicator field is not 0 or 1."));
1109 /* Set missing values. */
1110 if (record->missing_value_code != 0)
1112 int code = record->missing_value_code;
/* A width of 0 marks a numeric variable, for which negative codes are also
   accepted. */
1113 if (record->width == 0)
1115 if (code < -3 || code > 3 || code == -1)
1117 sys_error (r, record->pos,
1118 _("Numeric missing value indicator field is not "
1119 "-3, -2, 0, 1, 2, or 3."));
1125 if (code < 1 || code > 3)
1127 sys_error (r, record->pos,
1128 _("String missing value indicator field is not "
/* Each missing value occupies 8 bytes, and abs (code) of them follow. */
1134 if (!read_bytes (r, record->missing, 8 * abs (code)))
1141 /* Reads value labels from R into RECORD.

   This reads a type 3 record (a count followed by that many value and label
   pairs) and then the type 4 record that must follow it (a count followed by
   that many variable indexes).  The label and index arrays are allocated
   from the pool of R. */
1143 read_value_label_record (struct sfm_reader *r,
1144 struct sfm_value_label_record *record)
1149 /* Read type 3 record. */
1150 record->pos = r->pos;
1151 if (!read_uint (r, &record->n_labels))
/* Reject a label count above UINT_MAX / sizeof *record->labels. */
1153 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1155 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1159 record->labels = pool_nmalloc (r->pool, record->n_labels,
1160 sizeof *record->labels);
1161 for (i = 0; i < record->n_labels; i++)
1163 struct sfm_value_label *label = &record->labels[i];
1164 unsigned char label_len;
1167 if (!read_bytes (r, label->value, sizeof label->value))
1170 /* Read label length. */
1171 if (!read_bytes (r, &label_len, sizeof label_len))
/* The length byte plus the label text are padded to a multiple of 8 bytes.
   The length byte has already been consumed, hence padded_len - 1 below. */
1173 padded_len = ROUND_UP (label_len + 1, 8);
1175 /* Read label, padding. */
1176 label->label = pool_malloc (r->pool, padded_len + 1);
1177 if (!read_bytes (r, label->label, padded_len - 1))
1179 label->label[label_len] = '\0';
1182 /* Read record type of type 4 record. */
1183 if (!read_int (r, &type))
1187 sys_error (r, r->pos - 4,
1188 _("Variable index record (type 4) does not immediately "
1189 "follow value label record (type 3) as it should."));
1193 /* Read number of variables associated with value label from type 4
1195 if (!read_uint (r, &record->n_vars))
1197 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1199 sys_error (r, r->pos - 4,
1200 _("Number of variables associated with a value label (%u) "
1201 "is not between 1 and the number of variables (%zu)."),
1202 record->n_vars, r->n_vars);
1206 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1207 for (i = 0; i < record->n_vars; i++)
1208 if (!read_int (r, &record->vars[i]))
/* read_document_record: reads a document record from R.  The record starts
   with a signed line count.  A count of zero is handled by its own branch.
   A negative count, or one of at least INT_MAX / DOC_LINE_LENGTH, is
   reported with sys_error; the upper limit keeps DOC_LINE_LENGTH * n_lines
   within the range of an int.  Otherwise a record is allocated from R's
   pool, DOC_LINE_LENGTH * n_lines bytes of document text are read into it,
   and it is stored in R->document.
   NOTE(review): the header comment that follows is not terminated in this
   copy of the file; restore its closing line. */
1214 /* Reads a document record from R. Returns true if successful, false on
1217 read_document_record (struct sfm_reader *r)
1220 if (!read_int (r, &n_lines))
1222 else if (n_lines == 0)
1224 else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1226 sys_error (r, r->pos,
1227 _("Number of document lines (%d) "
1228 "must be greater than 0 and less than %d."),
1229 n_lines, INT_MAX / DOC_LINE_LENGTH);
1233 struct sfm_document_record *record;
1234 record = pool_malloc (r->pool, sizeof *record);
1235 record->pos = r->pos;
1236 record->n_lines = n_lines;
1237 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1238 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1241 r->document = record;
/* Stores SUBTYPE and R's current position in RECORD, then reads RECORD's
   size and count fields from R.  Reports an error with sys_error when the
   total data length SIZE * COUNT, plus one, is too large. */
1246 read_extension_record_header (struct sfm_reader *r, int subtype,
1247 struct sfm_extension_record *record)
1249 record->subtype = subtype;
1250 record->pos = r->pos;
1251 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1254 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1255 allows an extra byte for a null terminator, used by some
1256 extension processing routines. */
1257 if (record->size != 0
1258 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
/* NOTE(review): unlike the other messages in this file, this one is not
   wrapped in _(), so it is not marked for translation. */
1260 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1268 /* Reads an extension record (type 7) of the given SUBTYPE from R.  The
   record structure is allocated from R's pool and its header is read with
   read_extension_record_header.  A known subtype is checked against a table
   of expected element sizes and counts; when it passes, its SIZE * COUNT
   data bytes are read into a pool buffer with one extra null byte.  The
   function ends by skipping SIZE * COUNT bytes, which appears to be the
   path taken for records that fail the check, are listed as ignored, or
   have an unrecognized subtype.
   NOTE(review): RECORDP presumably receives the stored record -- confirm
   how and when it is set. */
1270 read_extension_record (struct sfm_reader *r, int subtype,
1271 struct sfm_extension_record **recordp)
1273 struct extension_record_type
/* Each entry below is presumably { subtype, expected size, expected count },
   matching how the fields are used in the loop further down.  A zero size or
   count is not checked; an entry with both zero marks a subtype that is
   ignored. */
1280 static const struct extension_record_type types[] =
1282 /* Implemented record types. */
1283 { EXT_INTEGER, 4, 8 },
1284 { EXT_FLOAT, 8, 3 },
1285 { EXT_MRSETS, 1, 0 },
1286 { EXT_PRODUCT_INFO, 1, 0 },
1287 { EXT_DISPLAY, 4, 0 },
1288 { EXT_LONG_NAMES, 1, 0 },
1289 { EXT_LONG_STRINGS, 1, 0 },
1290 { EXT_NCASES, 8, 2 },
1291 { EXT_FILE_ATTRS, 1, 0 },
1292 { EXT_VAR_ATTRS, 1, 0 },
1293 { EXT_MRSETS2, 1, 0 },
1294 { EXT_ENCODING, 1, 0 },
1295 { EXT_LONG_LABELS, 1, 0 },
1296 { EXT_LONG_MISSING, 1, 0 },
1298 /* Ignored record types. */
1299 { EXT_VAR_SETS, 0, 0 },
1301 { EXT_DATA_ENTRY, 0, 0 },
1302 { EXT_DATAVIEW, 0, 0 },
1305 const struct extension_record_type *type;
1306 struct sfm_extension_record *record;
1310 record = pool_malloc (r->pool, sizeof *record);
1311 if (!read_extension_record_header (r, subtype, record))
/* The header reader has already rejected records whose SIZE * COUNT + 1
   would be too large. */
1313 n_bytes = record->count * record->size;
/* Look SUBTYPE up in the table with a linear search. */
1315 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1316 if (subtype == type->subtype)
1318 if (type->size > 0 && record->size != type->size)
1319 sys_warn (r, record->pos,
1320 _("Record type 7, subtype %d has bad size %u "
1321 "(expected %d)."), subtype, record->size, type->size);
1322 else if (type->count > 0 && record->count != type->count)
1323 sys_warn (r, record->pos,
1324 _("Record type 7, subtype %d has bad count %u "
1325 "(expected %d)."), subtype, record->count, type->count);
1326 else if (type->count == 0 && type->size == 0)
1328 /* Ignore this record. */
/* Allocate one byte more than the data and null-terminate the buffer, so
   that routines that process the record can treat it as a string. */
1332 char *data = pool_malloc (r->pool, n_bytes + 1);
1333 data[n_bytes] = '\0';
1335 record->data = data;
1336 if (!read_bytes (r, record->data, n_bytes))
1345 sys_warn (r, record->pos,
1346 _("Unrecognized record type 7, subtype %d. For help, please "
1347 "send this file to %s and mention that you were using %s."),
1348 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
1351 return skip_bytes (r, n_bytes);
/* Reads the header of an extension record of the given SUBTYPE from R into
   a local record, then skips the record's COUNT * SIZE data bytes without
   storing them.  The result is true only if both the header read and the
   skip succeed. */
1355 skip_extension_record (struct sfm_reader *r, int subtype)
1357 struct sfm_extension_record record;
1359 return (read_extension_record_header (r, subtype, &record)
1360 && skip_bytes (r, record.count * record.size));
/* Transfers the contents of the file header record HEADER into DICT and
   INFO.  The file label goes into DICT; the creation date, creation time,
   and product name go into INFO.  Each string is recoded from DICT's
   encoding to UTF-8.  The label and product name are recoded into R's
   pool. */
1364 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1365 struct any_read_info *info, struct dictionary *dict)
1367 const char *dict_encoding = dict_get_encoding (dict);
1368 struct substring product;
1369 struct substring label;
1372 /* Convert file label to UTF-8 and put it into DICT. */
1373 label = recode_substring_pool ("UTF-8", dict_encoding,
1374 ss_cstr (header->file_label), r->pool);
1375 ss_trim (&label, ss_cstr (" "));
/* Null-terminate the trimmed label in place so that it can be passed on as
   a C string. */
1376 label.string[label.length] = '\0';
1377 fixed_label = fix_line_ends (label.string);
1378 dict_set_label (dict, fixed_label);
1381 /* Put creation date and time in UTF-8 into INFO. */
1382 info->creation_date = recode_string ("UTF-8", dict_encoding,
1383 header->creation_date, -1);
1384 info->creation_time = recode_string ("UTF-8", dict_encoding,
1385 header->creation_time, -1);
1387 /* Put product name into INFO, dropping eye-catcher string if present. */
1388 product = recode_substring_pool ("UTF-8", dict_encoding,
1389 ss_cstr (header->eye_catcher), r->pool);
1390 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1391 ss_trim (&product, ss_cstr (" "));
/* INFO gets its own copy of the product name rather than a pointer into
   R's pool. */
1392 info->product = ss_xstrdup (product);
/* Creates a new variable of the given WIDTH in DICT, under a name obtained
   from dict_make_unique_var_name.  Used when a name read from the file
   cannot be used as is.  Presumably returns the new variable (the return
   type is a variable pointer) -- confirm. */
1395 static struct variable *
1396 add_var_with_generated_name (struct dictionary *dict, int width)
1398 char *name = dict_make_unique_var_name (dict, NULL, NULL);
1399 struct variable *var = dict_create_var_assert (dict, name, width);
/* parse_variable_records: walks the N_VAR_RECS variable records in VAR_RECS
   and creates a variable in DICT for each one that starts a variable.  The
   name and label are recoded from DICT's encoding to UTF-8, and the name is
   cut at its first space.  A name that is invalid, begins with `$' or `#',
   or duplicates an existing name is replaced by a generated one, with a
   warning.  Missing values and the print and write formats are taken from
   the record.  sys_error is called for a width outside 0...255 and for a
   string whose continuation records are missing.
   NOTE(review): the header comment that follows is not terminated in this
   copy of the file; restore its closing line. */
1404 /* Reads a variable (type 2) record from R and adds the
1405 corresponding variable to DICT.
1406 Also skips past additional variable records for long string
1409 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1410 struct sfm_var_record *var_recs, size_t n_var_recs)
1412 const char *dict_encoding = dict_get_encoding (dict);
1413 struct sfm_var_record *rec;
1416 for (rec = var_recs; rec < &var_recs[n_var_recs];)
1422 name = recode_string_pool ("UTF-8", dict_encoding,
1423 rec->name, -1, r->pool);
1424 name[strcspn (name, " ")] = '\0';
1426 if (rec->width < 0 || rec->width > 255)
1428 sys_error (r, rec->pos,
1429 _("Bad width %d for variable %s."), rec->width, name);
1433 struct variable *var;
1434 if (!dict_id_is_valid (dict, name, false)
1435 || name[0] == '$' || name[0] == '#')
1437 var = add_var_with_generated_name (dict, rec->width);
1438 sys_warn (r, rec->pos, _("Renaming variable with invalid name "
1439 "`%s' to `%s'."), name, var_get_name (var));
1443 var = dict_create_var (dict, name, rec->width);
1446 var = add_var_with_generated_name (dict, rec->width);
1447 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1449 name, var_get_name (var));
1454 /* Set the short name the same as the long name (even if we renamed
1456 var_set_short_name (var, 0, var_get_name (var));
1458 /* Get variable label, if any. */
1463 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1464 rec->label, -1, r->pool);
1465 var_set_label (var, utf8_label);
1468 /* Set missing values. */
1469 if (rec->missing_value_code != 0)
1471 int width = var_get_width (var);
1472 struct missing_values mv;
1474 mv_init_pool (r->pool, &mv, width);
1475 if (var_is_numeric (var))
/* For a numeric variable a negative code means that a range is present.
   A code of -3 means a range plus one discrete value; a positive code is
   the number of discrete values. */
1477 bool has_range = rec->missing_value_code < 0;
1478 int n_discrete = (has_range
1479 ? rec->missing_value_code == -3
1480 : rec->missing_value_code);
/* The range bounds are the first two 8-byte values in rec->missing. */
1485 double low = parse_float (r, rec->missing, 0);
1486 double high = parse_float (r, rec->missing, 8);
1488 /* Deal with SPSS 21 change in representation. */
1492 mv_add_range (&mv, low, high);
1496 for (i = 0; i < n_discrete; i++)
1498 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
/* For a string variable each missing value occupies 8 bytes of rec->missing,
   of which at most the variable's width is used. */
1503 for (i = 0; i < rec->missing_value_code; i++)
1504 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1505 var_set_missing_values (var, &mv);
/* The position arguments are 12 and 16 bytes past the start of the record;
   presumably these are the offsets of the print and write format fields,
   used to locate any warning -- confirm. */
1509 parse_format_spec (r, rec->pos + 12, rec->print_format,
1510 PRINT_FORMAT, var, &n_warnings);
1511 parse_format_spec (r, rec->pos + 16, rec->write_format,
1512 WRITE_FORMAT, var, &n_warnings);
1514 /* Account for values.
1515 Skip long string continuation records, if any. */
/* A numeric variable takes one value; a string takes one per 8 bytes of
   width.  Every value after the first must be backed by a record whose
   width is -1. */
1516 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1517 for (i = 1; i < n_values; i++)
1518 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1520 sys_error (r, rec->pos, _("Missing string continuation record."));
/* parse_format_spec: converts FORMAT, a format specification as stored in
   the system file, with fmt_from_u32 for variable V's width.  On success
   the result becomes V's print or write format, as selected by WHICH.  A
   FORMAT of zero is silently accepted.  Any other invalid value is counted
   in *N_WARNINGS and reported with a warning at file position POS, until
   the limit of 8 warnings is reached.
   NOTE(review): the header comment that follows is not terminated in this
   copy of the file; restore its closing line. */
1529 /* Translates the format spec from sysfile format to internal
1532 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1533 enum which_format which, struct variable *v,
1536 const int max_warnings = 8;
1539 if (fmt_from_u32 (format, var_get_width (v), false, &f))
1541 if (which == PRINT_FORMAT)
1542 var_set_print_format (v, &f);
1544 var_set_write_format (v, &f);
1546 else if (format == 0)
1548 /* Actually observed in the wild. No point in warning about it. */
/* The counter is incremented for every invalid format that reaches this
   test, but the warnings below are issued only for the first max_warnings
   of them. */
1550 else if (++*n_warnings <= max_warnings)
1552 if (which == PRINT_FORMAT)
1553 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1555 var_get_name (v), var_get_width (v), format);
1557 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1559 var_get_name (v), var_get_width (v), format);
/* Announce the suppression once, right after the last warning shown. */
1561 if (*n_warnings == max_warnings)
1562 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
/* Adds the document lines held in RECORD to DICT.  RECORD->documents is
   treated as RECORD->n_lines consecutive lines of DOC_LINE_LENGTH bytes
   each.  Every line is recoded from DICT's encoding to UTF-8, stripped of
   trailing spaces, null-terminated, and passed to dict_add_document_line. */
1567 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1571 for (p = record->documents;
1572 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1573 p += DOC_LINE_LENGTH)
1575 struct substring line;
/* A null pool is passed here, so the recoded line is not tied to a reader
   pool; presumably it must be released separately -- confirm. */
1577 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1578 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1579 ss_rtrim (&line, ss_cstr (" "));
1580 line.string[line.length] = '\0';
1582 dict_add_document_line (dict, line.string, false);
1588 /* Parses record type 7, subtype 3 (machine integer info).  The three
   version numbers at offsets 0, 4, and 8 of RECORD's data are saved in
   INFO.  The floating-point representation code at offset 16 and the
   integer representation code at offset 24 are compared with the codes
   expected for R's float and integer formats.  A floating-point mismatch is
   reported with sys_error; an integer mismatch only produces a warning. */
1590 parse_machine_integer_info (struct sfm_reader *r,
1591 const struct sfm_extension_record *record,
1592 struct any_read_info *info)
1594 int float_representation, expected_float_format;
1595 int integer_representation, expected_integer_format;
1597 /* Save version info. */
1598 info->version_major = parse_int (r, record->data, 0);
1599 info->version_minor = parse_int (r, record->data, 4);
1600 info->version_revision = parse_int (r, record->data, 8);
1602 /* Check floating point format. */
1603 float_representation = parse_int (r, record->data, 16);
/* Expected codes: 1 for IEEE double (either byte order), 2 for Z long,
   3 for either VAX format. */
1604 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1605 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1606 expected_float_format = 1;
1607 else if (r->float_format == FLOAT_Z_LONG)
1608 expected_float_format = 2;
1609 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1610 expected_float_format = 3;
1613 if (float_representation != expected_float_format)
1615 sys_error (r, record->pos,
1616 _("Floating-point representation indicated by "
1617 "system file (%d) differs from expected (%d)."),
1618 float_representation, expected_float_format);
1622 /* Check integer format. */
1623 integer_representation = parse_int (r, record->data, 24);
/* Expected codes: 1 for most-significant byte first, 2 for
   least-significant byte first. */
1624 if (r->integer_format == INTEGER_MSB_FIRST)
1625 expected_integer_format = 1;
1626 else if (r->integer_format == INTEGER_LSB_FIRST)
1627 expected_integer_format = 2;
1630 if (integer_representation != expected_integer_format)
1631 sys_warn (r, record->pos,
1632 _("Integer format indicated by system file (%d) "
1633 "differs from expected (%d)."),
1634 integer_representation, expected_integer_format);
1639 /* Parses record type 7, subtype 4 (machine floating-point info).  The
   record holds three floating-point values, at offsets 0, 8, and 16 of its
   data: the file's idea of SYSMIS, HIGHEST, and LOWEST.  Each value that
   differs from what this program expects produces a warning; nothing else
   is changed. */
1641 parse_machine_float_info (struct sfm_reader *r,
1642 const struct sfm_extension_record *record)
1644 double sysmis = parse_float (r, record->data, 0);
1645 double highest = parse_float (r, record->data, 8);
1646 double lowest = parse_float (r, record->data, 16);
/* Each warning prints the value both in %g form and in exact hexadecimal
   (%a) form. */
1648 if (sysmis != SYSMIS)
1649 sys_warn (r, record->pos,
1650 _("File specifies unexpected value %g (%a) as %s, "
1651 "instead of %g (%a)."),
1652 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1654 if (highest != HIGHEST)
1655 sys_warn (r, record->pos,
1656 _("File specifies unexpected value %g (%a) as %s, "
1657 "instead of %g (%a)."),
1658 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1660 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1661 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1662 appears in a context (missing values) where SYSMIS cannot. */
1663 if (lowest != LOWEST && lowest != SYSMIS)
1664 sys_warn (r, record->pos,
1665 _("File specifies unexpected value %g (%a) as %s, "
1666 "instead of %g (%a) or %g (%a)."),
1667 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1670 /* Parses record type 7, subtype 10 (extra product info).  The whole text
   of RECORD is passed through fix_line_ends and the result is stored in
   INFO->product_ext. */
1672 parse_extra_product_info (struct sfm_reader *r,
1673 const struct sfm_extension_record *record,
1674 struct any_read_info *info)
1676 struct text_record *text;
1678 text = open_text_record (r, record, true);
1679 info->product_ext = fix_line_ends (text_get_all (text));
1680 close_text_record (r, text);
1683 /* Parses record type 7, subtype 7 or 19 (multiple response sets).  Each
   set found in RECORD's text is stored in the next slot of R->mrsets,
   which is grown in R's pool as needed; *ALLOCATED_MRSETS tracks its
   capacity.  The strings stored are not recoded here.

   The text for one set, as consumed below, is: a name ending at `=', a type
   letter (`C', `D', or `E'), for `E' a space and a label source value, for
   sets of type MRSET_MD a counted string giving the counted value, a
   counted string giving the label, and then variable names separated by
   spaces and ended by a new-line.  Malformed input produces a warning. */
1685 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1686 size_t *allocated_mrsets)
1688 struct text_record *text;
1690 text = open_text_record (r, record, false);
1693 struct sfm_mrset *mrset;
1694 size_t allocated_vars;
1697 /* Skip extra line feeds if present. */
1698 while (text_match (text, '\n'))
/* Make room for one more set, then start from a zeroed entry. */
1701 if (r->n_mrsets >= *allocated_mrsets)
1702 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1704 mrset = &r->mrsets[r->n_mrsets];
1705 memset(mrset, 0, sizeof *mrset);
1707 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1708 if (mrset->name == NULL)
/* `C' selects type MRSET_MC and must be followed by a space. */
1711 if (text_match (text, 'C'))
1713 mrset->type = MRSET_MC;
1714 if (!text_match (text, ' '))
1716 sys_warn (r, record->pos,
1717 _("Missing space following `%c' at offset %zu "
1718 "in MRSETS record."), 'C', text_pos (text));
/* `D' selects type MRSET_MD with categories taken from variable labels. */
1722 else if (text_match (text, 'D'))
1724 mrset->type = MRSET_MD;
1725 mrset->cat_source = MRSET_VARLABELS;
/* `E' selects type MRSET_MD with categories taken from counted values.  It
   is followed by a space and a label source value: "11" turns on
   label_from_var_label, "1" is accepted as is, and anything else gets a
   warning. */
1727 else if (text_match (text, 'E'))
1731 mrset->type = MRSET_MD;
1732 mrset->cat_source = MRSET_COUNTEDVALUES;
1733 if (!text_match (text, ' '))
1735 sys_warn (r, record->pos,
1736 _("Missing space following `%c' at offset %zu "
1737 "in MRSETS record."), 'E', text_pos (text));
1741 number = text_get_token (text, ss_cstr (" "), NULL);
1743 sys_warn (r, record->pos,
1744 _("Missing label source value "
1745 "following `E' at offset %zu in MRSETS record."),
1747 else if (!strcmp (number, "11"))
1748 mrset->label_from_var_label = true;
1749 else if (strcmp (number, "1"))
1750 sys_warn (r, record->pos,
1751 _("Unexpected label source value following `E' "
1752 "at offset %zu in MRSETS record."),
1757 sys_warn (r, record->pos,
1758 _("Missing `C', `D', or `E' at offset %zu "
1759 "in MRSETS record."),
/* Only sets of type MRSET_MD carry a counted value. */
1764 if (mrset->type == MRSET_MD)
1766 mrset->counted = text_parse_counted_string (r, text);
1767 if (mrset->counted == NULL)
1771 mrset->label = text_parse_counted_string (r, text);
1772 if (mrset->label == NULL)
/* Collect variable names, each ended by a space or a new-line, until the
   new-line that ends the set. */
1780 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1783 if (delimiter != '\n')
1784 sys_warn (r, record->pos,
1785 _("Missing new-line parsing variable names "
1786 "at offset %zu in MRSETS record."),
1791 if (mrset->n_vars >= allocated_vars)
1792 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1794 sizeof *mrset->vars);
1795 mrset->vars[mrset->n_vars++] = var;
1797 while (delimiter != '\n');
1801 close_text_record (r, text);
/* Converts each multiple response set stored in R->mrsets into a struct
   mrset and adds it to DICT.  The set's name, label, and variable names
   are recoded from R->encoding to UTF-8, and the variable names are looked
   up in DICT.  Warnings are issued for an invalid set name, a duplicate
   variable name, and a mix of string and numeric variables.  A set that
   ends up with fewer than two variables is warned about and destroyed
   rather than added. */
1805 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1807 const struct sfm_mrset *s;
1809 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1811 struct stringi_set var_names;
1812 struct mrset *mrset;
1817 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1818 if (!mrset_is_valid_name (name, dict_get_encoding (dict), false))
1820 sys_warn (r, -1, _("Invalid multiple response set name `%s'."),
1826 mrset = xzalloc (sizeof *mrset);
1828 mrset->type = s->type;
1829 mrset->cat_source = s->cat_source;
1830 mrset->label_from_var_label = s->label_from_var_label;
/* An empty label in the file leaves mrset->label null. */
1831 if (s->label[0] != '\0')
1832 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
/* var_names records the names accepted so far, to detect duplicates. */
1834 stringi_set_init (&var_names);
1835 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1837 for (i = 0; i < s->n_vars; i++)
1839 struct variable *var;
1842 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1844 var = dict_lookup_var (dict, var_name);
1850 if (!stringi_set_insert (&var_names, var_name))
1853 _("MRSET %s contains duplicate variable name %s."),
1854 mrset->name, var_name);
/* When the set asks for its label to come from a variable label and has
   none yet, use the label of the first labelled variable reached here. */
1860 if (mrset->label == NULL && mrset->label_from_var_label
1861 && var_has_label (var))
1862 mrset->label = xstrdup (var_get_label (var));
1865 && var_get_type (var) != var_get_type (mrset->vars[0]))
1868 _("MRSET %s contains both string and "
1869 "numeric variables."), mrset->name);
/* Track the narrowest width among the accepted variables. */
1872 width = MIN (width, var_get_width (var));
1874 mrset->vars[mrset->n_vars++] = var;
1877 if (mrset->n_vars < 2)
1879 if (mrset->n_vars == 0)
1880 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1882 sys_warn (r, -1, _("MRSET %s has only one variable."),
1884 mrset_destroy (mrset);
1885 stringi_set_destroy (&var_names);
/* For a set of type MRSET_MD, store the counted value at the tracked width:
   the text is either parsed as a number or copied as a string padded with
   spaces. */
1889 if (mrset->type == MRSET_MD)
1891 mrset->width = width;
1892 value_init (&mrset->counted, width);
1894 mrset->counted.f = c_strtod (s->counted, NULL);
1896 value_copy_str_rpad (&mrset->counted, width,
1897 (const uint8_t *) s->counted, ' ');
1900 dict_add_mrset (dict, mrset);
1901 stringi_set_destroy (&var_names);
1905 /* Read record type 7, subtype 11, which specifies how variables
1906 should be displayed in GUI environments.

   The record holds either two integers (measure, alignment) or three
   (measure, display width, alignment) per variable in DICT; which one is
   decided by comparing RECORD->count with the number of variables.  Any
   other count produces a warning.  Invalid parameters for a variable
   produce a single warning. */
1908 parse_display_parameters (struct sfm_reader *r,
1909 const struct sfm_extension_record *record,
1910 struct dictionary *dict)
1912 bool includes_width;
1913 bool warned = false;
1918 n_vars = dict_get_var_cnt (dict);
1919 if (record->count == 3 * n_vars)
1920 includes_width = true;
1921 else if (record->count == 2 * n_vars)
1922 includes_width = false;
1925 sys_warn (r, record->pos,
1926 _("Extension 11 has bad count %u (for %zu variables)."),
1927 record->count, n_vars);
1932 for (i = 0; i < n_vars; ++i)
1934 struct variable *v = dict_get_var (dict, i);
1935 int measure, width, align;
1937 measure = parse_int (r, record->data, ofs);
1942 width = parse_int (r, record->data, ofs);
1948 align = parse_int (r, record->data, ofs);
1951 /* SPSS sometimes seems to set variables' measure to zero. */
/* Valid codes are 1...3 for measure and 0...2 for alignment. */
1955 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1958 sys_warn (r, record->pos,
1959 _("Invalid variable display parameters for variable "
1960 "%zu (%s). Default parameters substituted."),
1961 i, var_get_name (v));
/* Measure 1 is nominal and 2 is ordinal; alignment 0 is left and 1 is
   right.  The remaining valid code in each case maps to a third value. */
1966 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1967 : measure == 2 ? MEASURE_ORDINAL
1969 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1970 : align == 1 ? ALIGN_RIGHT
1973 /* Older versions (SPSS 9.0) sometimes set the display
1974 width to zero. This causes confusion in the GUI, so
1975 only set the width if it is nonzero. */
1977 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping VAR's short names.  The
   short names are copied first, the rename is attempted with
   dict_try_rename_var, and the copies are then set on VAR again and freed.
   If the rename fails, a warning about a duplicate long variable name is
   issued for file position POS; the short names are restored either way.
   NOTE(review): the comment inside the function that begins "Renaming a
   variable" is not terminated in this copy of the file; restore its closing
   line. */
1982 rename_var_and_save_short_names (struct sfm_reader *r, off_t pos,
1983 struct dictionary *dict,
1984 struct variable *var, const char *new_name)
1986 size_t n_short_names;
1990 /* Renaming a variable may clear its short names, but we
1991 want to retain them, so we save them and re-set them
1993 n_short_names = var_get_short_name_cnt (var);
1994 short_names = xnmalloc (n_short_names, sizeof *short_names);
1995 for (i = 0; i < n_short_names; i++)
1997 const char *s = var_get_short_name (var, i);
1998 short_names[i] = s != NULL ? xstrdup (s) : NULL;
2001 /* Set long name. */
2002 if (!dict_try_rename_var (dict, var, new_name))
2003 sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name);
2005 /* Restore short names. */
2006 for (i = 0; i < n_short_names; i++)
/* A saved entry may be null; it is passed on to var_set_short_name
   unchanged, and freeing a null pointer is harmless. */
2008 var_set_short_name (var, i, short_names[i]);
2009 free (short_names[i]);
2014 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2015 to each short name. Modifies variable names in DICT accordingly.

   When there are no long variable names, every variable is renamed to the
   lowercase form of its current name.  Otherwise RECORD's text is read as
   pairs of variable and long name.  A long name that is not a valid
   identifier, or that begins with `$' or `#', is warned about.  Renaming
   is done with rename_var_and_save_short_names. */
2017 parse_long_var_name_map (struct sfm_reader *r,
2018 const struct sfm_extension_record *record,
2019 struct dictionary *dict)
2021 struct text_record *text;
2022 struct variable *var;
2027 /* There are no long variable names. Use the short variable names,
2028 converted to lowercase, as the long variable names. */
2031 for (i = 0; i < dict_get_var_cnt (dict); i++)
2033 struct variable *var = dict_get_var (dict, i);
2036 new_name = utf8_to_lower (var_get_name (var));
/* No file position is available for this case, hence the -1. */
2037 rename_var_and_save_short_names (r, -1, dict, var, new_name);
2044 /* Rename each of the variables, one by one. (In a correctly constructed
2045 system file, this cannot create any intermediate duplicate variable names,
2046 because all of the new variable names are longer than any of the old
2047 variable names and thus there cannot be any overlaps.) */
2048 text = open_text_record (r, record, true);
2049 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2051 /* Validate long name. */
2052 if (!dict_id_is_valid (dict, long_name, false)
2053 || long_name[0] == '$' || long_name[0] == '#')
2055 sys_warn (r, record->pos,
2056 _("Long variable mapping from %s to invalid "
2057 "variable name `%s'."),
2058 var_get_name (var), long_name);
2062 rename_var_and_save_short_names (r, record->pos, dict, var, long_name);
2064 close_text_record (r, text);
2067 /* Reads record type 7, subtype 14, which gives the real length
2068 of each very long string. Rearranges DICT accordingly.

   RECORD's text is read as pairs of variable and length.  For each pair
   the length must be between 1 and MAX_STRING and must need more than one
   segment, and the segments must fit within DICT starting at the
   variable's index.  The short name of each segment is then copied onto
   the variable and the segment's width is checked against the expected
   allocation width, both rounded up to a multiple of 8.  Finally the
   variables for the second and later segments are deleted from DICT and
   the first variable is widened to the full length.  DICT's values are
   compacted once all pairs have been processed.
   NOTE(review): the comment inside the function that begins "Get the short
   names" is not terminated in this copy of the file; restore its closing
   line. */
2070 parse_long_string_map (struct sfm_reader *r,
2071 const struct sfm_extension_record *record,
2072 struct dictionary *dict)
2074 struct text_record *text;
2075 struct variable *var;
2078 text = open_text_record (r, record, true);
2079 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2081 size_t idx = var_get_dict_index (var);
/* No end pointer is passed to strtol, so trailing garbage after the digits
   is not detected here; only the range of the result is checked. */
2087 length = strtol (length_s, NULL, 10);
2088 if (length < 1 || length > MAX_STRING)
2090 sys_warn (r, record->pos,
2091 _("%s listed as string of invalid length %s "
2092 "in very long string record."),
2093 var_get_name (var), length_s);
2097 /* Check segments. */
2098 segment_cnt = sfm_width_to_segments (length);
2099 if (segment_cnt == 1)
2101 sys_warn (r, record->pos,
2102 _("%s listed in very long string record with width %s, "
2103 "which requires only one segment."),
2104 var_get_name (var), length_s);
/* The segments occupy consecutive dictionary positions starting at idx, so
   they must not run past the end of the dictionary. */
2107 if (idx + segment_cnt > dict_get_var_cnt (dict))
2109 sys_error (r, record->pos,
2110 _("Very long string %s overflows dictionary."),
2111 var_get_name (var));
2115 /* Get the short names from the segments and check their
2117 for (i = 0; i < segment_cnt; i++)
2119 struct variable *seg = dict_get_var (dict, idx + i);
2120 int alloc_width = sfm_segment_alloc_width (length, i);
2121 int width = var_get_width (seg);
2124 var_set_short_name (var, i, var_get_short_name (seg, 0));
2125 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2127 sys_error (r, record->pos,
2128 _("Very long string with width %ld has segment %d "
2129 "of width %d (expected %d)."),
2130 length, i, width, alloc_width);
2134 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2135 var_set_width (var, length);
2137 close_text_record (r, text);
2138 dict_compact_values (dict);
/* Limit on value label warnings; value_label_warning compares its running
   count against it. */
2143 #define MAX_LABEL_WARNINGS 5
2145 /* Displays a warning for offset OFFSET in the file.  FORMAT and the
   arguments after it are a printf-style message, passed on to sys_msg with
   class MW.  Every call increments *N_LABEL_WARNINGS.  Once the count
   exceeds MAX_LABEL_WARNINGS the message is presumably not shown (the
   caller later reports how many were suppressed) -- confirm. */
2147 value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings,
2148 const char *format, ...)
2150 if (++*n_label_warnings > MAX_LABEL_WARNINGS)
2155 va_start (args, format);
2156 sys_msg (r, offset, MW, format, args);
/* NOTE(review): MAX_LABEL_WARNINGS is already defined with the same value
   just before value_label_warning, so this repeated definition is
   redundant. */
2160 #define MAX_LABEL_WARNINGS 5
/* Applies one value label record, RECORD, to the variables it names.

   The labels are first recoded from DICT's encoding to UTF-8.  RECORD's
   variable indexes are 1-based positions in VAR_RECS.  An index that is out
   of range, or that refers to a record with no variable (a long string
   continuation), is warned about and dropped.  The remaining variables are
   checked to be all of one type.  Each label is then added to each
   variable with var_add_value_label; a label that cannot be added is
   treated as a duplicate.  Warnings go through value_label_warning and are
   counted in *N_LABEL_WARNINGS.  The temporary arrays are returned to R's
   pool at the end.
   NOTE(review): any early exit ahead of the pool_free calls at the end
   would leave these temporaries allocated in R's pool; check the exit
   paths. */
2163 parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict,
2164 const struct sfm_var_record *var_recs,
2166 const struct sfm_value_label_record *record,
2167 int *n_label_warnings)
2170 = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2171 for (size_t i = 0; i < record->n_labels; i++)
2172 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2173 record->labels[i].label, -1,
/* Resolve the record's variable indexes into variables, keeping only the
   valid ones.  n_vars counts the entries of vars[] actually filled. */
2176 struct variable **vars = pool_nmalloc (r->pool,
2177 record->n_vars, sizeof *vars);
2178 unsigned int n_vars = 0;
2179 for (size_t i = 0; i < record->n_vars; i++)
2181 int idx = record->vars[i];
2182 if (idx < 1 || idx > n_var_recs)
2184 value_label_warning (
2185 r, record->pos, n_label_warnings,
2186 _("Value label variable index %d not in valid range 1...%zu."),
2191 const struct sfm_var_record *rec = &var_recs[idx - 1];
2192 if (rec->var == NULL)
2194 value_label_warning (
2195 r, record->pos, n_label_warnings,
2196 _("Value label variable index %d "
2197 "refers to long string continuation."), idx);
2201 vars[n_vars++] = rec->var;
/* All variables sharing the labels must have the same type as the first. */
2206 for (size_t i = 1; i < n_vars; i++)
2207 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2209 value_label_warning (
2210 r, record->pos, n_label_warnings,
2211 _("Variables associated with value label are not all of "
2212 "identical type. Variable %s is %s, but variable "
2214 var_get_name (vars[0]),
2215 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2216 var_get_name (vars[i]),
2217 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
/* Add every label to every remaining variable. */
2221 for (size_t i = 0; i < n_vars; i++)
2223 struct variable *var = vars[i];
2224 int width = var_get_width (var);
2227 value_label_warning (
2228 r, record->pos, n_label_warnings,
2229 _("Value labels may not be added to long string "
2230 "variables (e.g. %s) using records types 3 and 4."),
2231 var_get_name (var));
2235 for (size_t j = 0; j < record->n_labels; j++)
2237 struct sfm_value_label *label = &record->labels[j];
/* Build the value from the raw bytes in the record: a number parsed with
   parse_float, or the first WIDTH bytes for a string. */
2240 value_init (&value, width);
2242 value.f = parse_float (r, label->value, 0);
2244 memcpy (value.s, label->value, width);
2246 if (!var_add_value_label (var, &value, utf8_labels[j]))
2248 if (r->written_by_readstat)
2250 /* Ignore the problem. ReadStat is buggy and emits value
2251 labels whose values are longer than string variables'
2252 widths, that are identical in the actual width of the
2253 variable, e.g. both values "ABC123" and "ABC456" for a
2254 string variable with width 3. */
2256 else if (var_is_numeric (var))
2257 value_label_warning (r, record->pos, n_label_warnings,
2258 _("Duplicate value label for %g on %s."),
2259 value.f, var_get_name (var));
2261 value_label_warning (
2262 r, record->pos, n_label_warnings,
2263 _("Duplicate value label for `%.*s' on %s."),
2264 width, value.s, var_get_name (var));
2267 value_destroy (&value, width);
/* Return the temporary arrays and the recoded labels to R's pool. */
2271 pool_free (r->pool, vars);
2272 for (size_t i = 0; i < record->n_labels; i++)
2273 pool_free (r->pool, utf8_labels[i]);
2274 pool_free (r->pool, utf8_labels);
/* Applies every value label record stored in R->labels to the variables
   recorded in R->vars, by calling parse_one_value_label_set for each one.
   All calls share a single warning counter.  If the counter ends above
   MAX_LABEL_WARNINGS, one more message reports how many warnings went
   over the limit. */
2278 parse_value_labels (struct sfm_reader *r, struct dictionary *dict)
2280 int n_label_warnings = 0;
2281 for (size_t i = 0; i < r->n_labels; i++)
2282 parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i],
2284 if (n_label_warnings > MAX_LABEL_WARNINGS)
2286 _("Suppressed %d additional warnings for value labels."),
2287 n_label_warnings - MAX_LABEL_WARNINGS);
/* Resolves the weighting variable named by the header's weight index IDX, a
   1-based position in VAR_RECS.  A warning is issued, and per the messages
   the file is treated as unweighted, when IDX is out of range, when it
   refers to a record with no variable (a long string continuation), or
   when the variable is a string.  Presumably returns the weight variable,
   or a null pointer in those cases -- confirm. */
2290 static struct variable *
2291 parse_weight_var (struct sfm_reader *r,
2292 const struct sfm_var_record *var_recs, size_t n_var_recs,
2295 off_t offset = 76; /* Offset to variable index in header. */
2297 if (idx < 1 || idx > n_var_recs)
2299 sys_warn (r, offset,
2300 _("Weight variable index %d not in valid range 1...%zu. "
2301 "Treating file as unweighted."),
2306 const struct sfm_var_record *rec = &var_recs[idx - 1];
2307 if (rec->var == NULL)
2309 sys_warn (r, offset,
2310 _("Weight variable index %d refers to long string "
2311 "continuation. Treating file as unweighted."), idx);
2315 struct variable *weight_var = rec->var;
2316 if (!var_is_numeric (weight_var))
2318 sys_warn (r, offset, _("Ignoring string variable `%s' set "
2319 "as weighting variable."),
2320 var_get_name (weight_var));
2327 /* Parses a set of custom attributes from TEXT into ATTRS.
2328 ATTRS may be a null pointer, in which case the attributes are
2329 read but discarded.

   As consumed below, each attribute is a key ending at `(', followed by one
   value per line, with `)' after the last value.  Values are expected in
   single quotes, which are removed; an unquoted value is warned about and
   kept as is.  Attributes are read until a `/' is matched.  An attribute
   is added to ATTRS only when it has at least one value, and a duplicate
   is warned about and destroyed. */
2331 parse_attributes (struct sfm_reader *r, struct text_record *text,
2332 struct attrset *attrs)
2336 struct attribute *attr;
2340 /* Parse the key. */
2341 key = text_get_token (text, ss_cstr ("("), NULL);
2345 attr = attribute_create (key);
/* INDEX is the 1-based number of the value, used in messages. */
2346 for (index = 1; ; index++)
2348 /* Parse the value. */
2352 value = text_get_token (text, ss_cstr ("\n"), NULL);
2355 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2360 length = strlen (value);
/* Strip the surrounding quotes by overwriting the closing one and skipping
   the opening one. */
2361 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
2363 value[length - 1] = '\0';
2364 attribute_add_value (attr, value + 1);
2369 _("Attribute value %s[%d] is not quoted: %s."),
2371 attribute_add_value (attr, value);
2374 /* Was this the last value for this attribute? */
2375 if (text_match (text, ')'))
2378 if (attrs != NULL && attribute_get_n_values (attr) > 0)
2380 if (!attrset_try_add (attrs, attr))
2382 text_warn (r, text, _("Duplicate attribute %s."),
2383 attribute_get_name (attr));
2384 attribute_destroy (attr);
2388 attribute_destroy (attr);
2390 while (!text_match (text, '/'));
2393 /* Reads record type 7, subtype 17, which lists custom
2394 attributes on the data file.  RECORD is opened as text and its
   attributes are parsed by parse_attributes into DICT's own attribute
   set. */
2396 parse_data_file_attributes (struct sfm_reader *r,
2397 const struct sfm_extension_record *record,
2398 struct dictionary *dict)
2400 struct text_record *text = open_text_record (r, record, true);
2401 parse_attributes (r, text, dict_get_attributes (dict));
2402 close_text_record (r, text);
2405 /* Parses record type 7, subtype 18, which lists custom
2406 attributes on individual variables.  RECORD's text is a sequence of
   variable names, each ending at `:' and followed by that variable's
   attributes. */
2408 parse_variable_attributes (struct sfm_reader *r,
2409 const struct sfm_extension_record *record,
2410 struct dictionary *dict)
2412 struct text_record *text;
2413 struct variable *var;
2415 text = open_text_record (r, record, true);
/* When the name lookup leaves VAR null, the attributes are still parsed,
   to consume them, but are discarded. */
2416 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2417 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2418 close_text_record (r, text);
/* Sets the role of variables in DICT from their "$@Role" attribute.  For
   each variable that has the attribute with at least one value, the first
   value is converted to an integer and mapped to a role, which is then
   set with var_set_role.  Only the first variable with an invalid role is
   named in a warning; a summary warning reports how many others there
   were. */
2422 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2424 size_t n_warnings = 0;
2427 for (i = 0; i < dict_get_var_cnt (dict); i++)
2429 struct variable *var = dict_get_var (dict, i);
2430 struct attrset *attrs = var_get_attributes (var);
2431 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2432 if (attr != NULL && attribute_get_n_values (attr) > 0)
/* NOTE(review): atoi reports no errors, so a value that is not a number
   converts to 0. */
2434 int value = atoi (attribute_get_value (attr, 0));
2456 role = ROLE_PARTITION;
/* The counter is incremented for every invalid role, but only the first
   one produces this warning. */
2465 if (n_warnings++ == 0)
2466 sys_warn (r, -1, _("Invalid role for variable %s."),
2467 var_get_name (var));
2470 var_set_role (var, role);
2475 sys_warn (r, -1, _("%zu other variables had invalid roles."),
/* Checks that LENGTH bytes starting at offset OFS lie within RECORD's data,
   whose total size is RECORD->size * RECORD->count bytes.  When they do
   not, a warning that the record ends unexpectedly is issued at the
   position of the record's end.  Callers use the result as a boolean that
   is true when the range is acceptable. */
2480 check_overflow (struct sfm_reader *r,
2481 const struct sfm_extension_record *record,
2482 size_t ofs, size_t length)
2484 size_t end = record->size * record->count;
/* LENGTH is tested on its own before the sum; presumably this guards
   against OFS + LENGTH wrapping around for a huge LENGTH -- confirm.  Note
   that it also rejects a LENGTH equal to the whole record. */
2485 if (length >= end || ofs + length > end)
2487 sys_warn (r, record->pos + end,
2488 _("Extension record subtype %d ends unexpectedly."),
/* Parses an extension record holding value labels for long string
   variables and adds the labels to the variables in DICT.

   As read by the code below, the record is a sequence of entries, each
   consisting of: a 4-byte variable name length, the variable name, a 4-byte
   width, a 4-byte label count, and then for each label a 4-byte value
   length, the value, a 4-byte label length, and the label.  Every field is
   bounds-checked with check_overflow() before it is read.

   Entries for unknown variables, numeric variables, or variables whose
   width differs from the record's width are reported with a warning. */
2496 parse_long_string_value_labels (struct sfm_reader *r,
2497 const struct sfm_extension_record *record,
2498 struct dictionary *dict)
2500 const char *dict_encoding = dict_get_encoding (dict);
2501 size_t end = record->size * record->count;
2508 struct variable *var;
2513 /* Parse variable name length. */
2514 if (!check_overflow (r, record, ofs, 4))
2516 var_name_len = parse_int (r, record->data, ofs);
2519 /* Parse variable name, width, and number of labels. */
2520 if (!check_overflow (r, record, ofs, var_name_len)
2521 || !check_overflow (r, record, ofs, var_name_len + 8))
2523 var_name = recode_string_pool ("UTF-8", dict_encoding,
2524 (const char *) record->data + ofs,
2525 var_name_len, r->pool);
2526 width = parse_int (r, record->data, ofs + var_name_len);
2527 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2528 ofs += var_name_len + 8;
2530 /* Look up 'var' and validate. */
2531 var = dict_lookup_var (dict, var_name);
2533 sys_warn (r, record->pos + ofs,
2534 _("Ignoring long string value label record for "
2535 "unknown variable %s."), var_name);
2536 else if (var_is_numeric (var))
2538 sys_warn (r, record->pos + ofs,
2539 _("Ignoring long string value label record for "
2540 "numeric variable %s."), var_name);
2543 else if (width != var_get_width (var))
2545 sys_warn (r, record->pos + ofs,
2546 _("Ignoring long string value label record for variable "
2547 "%s because the record's width (%d) does not match the "
2548 "variable's width (%d)."),
2549 var_name, width, var_get_width (var));
/* NOTE(review): WIDTH comes straight from the record.  It looks as though
   this call is reached even when the validation above rejected the entry,
   in which case WIDTH has not been range-checked -- confirm that a negative
   or very large WIDTH cannot reach value_init_pool(). */
2554 value_init_pool (r->pool, &value, width);
2555 for (i = 0; i < n_labels; i++)
2557 size_t value_length, label_length;
/* A null VAR marks an entry that is only being stepped over so that OFS
   stays in sync with the record. */
2558 bool skip = var == NULL;
2560 /* Parse value length. */
2561 if (!check_overflow (r, record, ofs, 4))
2563 value_length = parse_int (r, record->data, ofs);
2567 if (!check_overflow (r, record, ofs, value_length))
2571 if (value_length == width)
2572 memcpy (value.s, (const uint8_t *) record->data + ofs, width);
2575 sys_warn (r, record->pos + ofs,
2576 _("Ignoring long string value label %zu for "
2577 "variable %s, with width %d, that has bad value "
2579 i, var_get_name (var), width, value_length);
2583 ofs += value_length;
2585 /* Parse label length. */
2586 if (!check_overflow (r, record, ofs, 4))
2588 label_length = parse_int (r, record->data, ofs);
2592 if (!check_overflow (r, record, ofs, label_length))
2598 label = recode_string_pool ("UTF-8", dict_encoding,
2599 (const char *) record->data + ofs,
2600 label_length, r->pool);
2601 if (!var_add_value_label (var, &value, label))
2602 sys_warn (r, record->pos + ofs,
2603 _("Duplicate value label for `%.*s' on %s."),
2604 width, value.s, var_get_name (var));
/* The recoded label is released from the pool again right after the add
   attempt. */
2605 pool_free (r->pool, label);
2607 ofs += label_length;
/* Parses an extension record holding missing values for long string
   variables and applies them to the variables in DICT.

   As read by the code below, the record is a sequence of entries, each
   consisting of: a 4-byte variable name length, the variable name, a 1-byte
   count of missing values, and then for each missing value a 4-byte value
   length followed by the value.  Every field is bounds-checked with
   check_overflow() before it is read.

   A count outside 1 to 3 is warned about, as are entries that name an
   unknown or numeric variable. */
2613 parse_long_string_missing_values (struct sfm_reader *r,
2614 const struct sfm_extension_record *record,
2615 struct dictionary *dict)
2617 const char *dict_encoding = dict_get_encoding (dict);
2618 size_t end = record->size * record->count;
2623 struct missing_values mv;
2625 struct variable *var;
2626 int n_missing_values;
2630 /* Parse variable name length. */
2631 if (!check_overflow (r, record, ofs, 4))
2633 var_name_len = parse_int (r, record->data, ofs);
2636 /* Parse variable name. */
2637 if (!check_overflow (r, record, ofs, var_name_len)
2638 || !check_overflow (r, record, ofs, var_name_len + 1))
2640 var_name = recode_string_pool ("UTF-8", dict_encoding,
2641 (const char *) record->data + ofs,
2642 var_name_len, r->pool);
2643 ofs += var_name_len;
2645 /* Parse number of missing values. */
2646 n_missing_values = ((const uint8_t *) record->data)[ofs];
2647 if (n_missing_values < 1 || n_missing_values > 3)
2648 sys_warn (r, record->pos + ofs,
2649 _("Long string missing values record says variable %s "
2650 "has %d missing values, but only 1 to 3 missing values "
2652 var_name, n_missing_values);
2655 /* Look up 'var' and validate. */
2656 var = dict_lookup_var (dict, var_name);
2658 sys_warn (r, record->pos + ofs,
2659 _("Ignoring long string missing value record for "
2660 "unknown variable %s."), var_name);
2661 else if (var_is_numeric (var))
2663 sys_warn (r, record->pos + ofs,
2664 _("Ignoring long string missing value record for "
2665 "numeric variable %s."), var_name);
/* When there is no usable variable a placeholder width of 8 is used, so
   that the values can still be stepped over below. */
2670 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2671 for (i = 0; i < n_missing_values; i++)
2673 size_t value_length;
2675 /* Parse value length. */
2676 if (!check_overflow (r, record, ofs, 4))
2678 value_length = parse_int (r, record->data, ofs);
2682 if (!check_overflow (r, record, ofs, value_length))
2686 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2688 sys_warn (r, record->pos + ofs,
2689 _("Ignoring long string missing value %zu for variable "
2690 "%s, with width %d, that has bad value width %zu."),
2691 i, var_get_name (var), var_get_width (var),
2693 ofs += value_length;
2696 var_set_missing_values (var, &mv);
/* Forward declarations for the case-reading helpers that are defined after
   sys_file_casereader_read(), which calls several of them.

   Return conventions differ between them: the functions declared bool
   report success or failure, whereas the ones declared int return a
   three-way status (see the comment on each definition). */
2702 static void partial_record (struct sfm_reader *);
2704 static void read_error (struct casereader *, const struct sfm_reader *);
2706 static bool read_case_number (struct sfm_reader *, double *);
2707 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2708 static int read_opcode (struct sfm_reader *);
2709 static bool read_compressed_number (struct sfm_reader *, double *);
2710 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2711 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2712 static bool skip_whole_strings (struct sfm_reader *, size_t);
2714 /* Reads and returns one case from READER's file. Returns a null
2715 pointer if not successful.

   R_ is the struct sfm_reader for the file.  Nothing is read once R is in
   an error state or when it has no variables to read.  Otherwise a case is
   created from R's prototype and filled in one sfm_var at a time. */
2716 static struct ccase *
2717 sys_file_casereader_read (struct casereader *reader, void *r_)
2719 struct sfm_reader *r = r_;
2724 if (r->error || !r->sfm_var_cnt)
2727 c = case_create (r->proto);
2729 for (i = 0; i < r->sfm_var_cnt; i++)
2731 struct sfm_var *sv = &r->sfm_vars[i];
2732 union value *v = case_data_rw_idx (c, sv->case_index);
/* A zero var_width selects the numeric path, which fills in v->f.  Any
   other width reads one string segment into v->s at the segment's offset
   and then skips the segment's padding, rounded down to whole 8-byte
   units. */
2734 if (sv->var_width == 0)
2735 retval = read_case_number (r, &v->f);
2738 retval = read_case_string (r, v->s + sv->offset, sv->segment_width);
2741 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2743 sys_error (r, r->pos, _("File ends in partial string value."));
/* presumably a case_cnt of -1 means the number of cases is unknown, so that
   running out of data is an error only when a count was given -- TODO
   confirm. */
2755 if (r->case_cnt != -1)
2756 read_error (reader, r);
2761 /* Issues an error, positioned at R's current file offset, that R ends
   in a partial record, i.e. that the file stops partway through a case. */
2763 partial_record (struct sfm_reader *r)
2765 sys_error (r, r->pos, _("File ends in partial case."));
2768 /* Issues an error that an unspecified error occurred while reading a case from SFM's file, and forces casereader R into an error state. */
/* Two steps: report the failure against the name of SFM's file handle, then
   flag casereader R itself as failed with casereader_force_error(). */
2771 read_error (struct casereader *r, const struct sfm_reader *sfm)
2773 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2774 casereader_force_error (r);
2777 /* Reads a number from R and stores its value in *D.
2778 If R is compressed, reads a compressed number;
2779 otherwise, reads a number in the regular way.
2780 Returns true if successful, false if end of file is
2781 reached immediately. */
2783 read_case_number (struct sfm_reader *r, double *d)
2785 if (r->compression == ANY_COMP_NONE)
2788 if (!try_read_bytes (r, number, sizeof number))
2790 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2794 return read_compressed_number (r, d);
2797 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2798 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2799 discarded without being written to S. Reads compressed strings if R is
2800 compressed. Returns 1 if successful, 0 if end of file is reached
2801 immediately, or -1 for some kind of error. */
2803 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
/* WHOLE is the part of LENGTH made of complete 8-byte units and is read
   straight into S.  PARTIAL is the remainder, which is read as one more
   full unit into a bounce buffer so that only PARTIAL bytes are copied
   into S. */
2805 size_t whole = ROUND_DOWN (length, 8);
2806 size_t partial = length % 8;
2810 int retval = read_whole_strings (r, s, whole);
2818 int retval = read_whole_strings (r, bounce, sizeof bounce);
2830 memcpy (s + whole, bounce, partial);
2836 /* Reads and returns the next compression opcode from R.

   R must be a compressed file.  Opcodes are buffered in R->opcodes and
   R->opcode_idx is the index of the next unread one; once the buffer has
   been used up it is refilled through try_read_compressed_bytes(). */
2838 read_opcode (struct sfm_reader *r)
2840 assert (r->compression != ANY_COMP_NONE);
2844 if (r->opcode_idx >= sizeof r->opcodes)
2847 int retval = try_read_compressed_bytes (r, r->opcodes,
2853 opcode = r->opcodes[r->opcode_idx++];
2860 /* Reads a compressed number from R and stores its value in D.
2861 Returns true if successful, false if end of file is
2862 reached immediately.

   The next opcode decides how the value is obtained: it may be read as a
   separately stored float, it may be the "spaces" code (unexpected in a
   numeric field, so a corruption warning is issued at most once per
   reader), or the value may be encoded in the opcode itself. */
2864 read_compressed_number (struct sfm_reader *r, double *d)
2866 int opcode = read_opcode (r);
2874 return read_compressed_float (r, d);
/* NOTE(review): the source operand here is a string literal of spaces;
   confirm it is at least as long as the float that float_convert() reads
   from it. */
2877 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2878 if (!r->corruption_warning)
2880 r->corruption_warning = true;
2881 sys_warn (r, r->pos,
2882 _("Possible compressed data corruption: "
2883 "compressed spaces appear in numeric field."));
/* The value is stored in the opcode itself, offset by R's bias. */
2892 *d = opcode - r->bias;
2899 /* Reads a compressed 8-byte string segment from R and stores it in DST.

   Depending on the next opcode, the 8 bytes are read verbatim from the
   compressed data, filled with spaces, or produced by converting the value
   "opcode minus bias" into R's float format.  The last case is unexpected in
   a string field and leads to a corruption warning, issued at most once per
   reader. */
2901 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2906 opcode = read_opcode (r);
2914 retval = read_compressed_bytes (r, dst, 8);
/* Anything other than a complete read is turned into the error code. */
2915 return retval == 1 ? 1 : -1;
2918 memset (dst, ' ', 8);
2923 double value = opcode - r->bias;
2924 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2927 /* This has actually been seen "in the wild". The submitter of the
2928 file that showed that the contents decoded as spaces, but they
2929 were at the end of the field so it's possible that the null
2930 bytes just acted as null terminators. */
2932 else if (!r->corruption_warning)
2934 r->corruption_warning = true;
2935 sys_warn (r, r->pos,
2936 _("Possible compressed data corruption: "
2937 "string contains compressed integer (opcode %d)."),
2945 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2946 Reads compressed strings if R is compressed. Returns 1 if successful, 0 if
2947 end of file is reached immediately, or -1 for some kind of error. */
2949 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2951 assert (length % 8 == 0);
/* Uncompressed data is read in a single call; compressed data is decoded
   one 8-byte segment at a time. */
2952 if (r->compression == ANY_COMP_NONE)
2953 return try_read_bytes (r, s, length);
2958 for (ofs = 0; ofs < length; ofs += 8)
2960 int retval = read_compressed_string (r, s + ofs);
/* Skips LENGTH string bytes from R.
   LENGTH must be a multiple of 8.
   (LENGTH must also be less than 1024, the size of the scratch buffer, but
   that's only because the current caller never needs more than that many
   bytes.)
   Returns true if successful, false if end of file is reached immediately
   or if an error occurs.

   read_whole_strings() returns 1, 0, or -1.  Its result is compared against
   1 explicitly because converting it straight to bool would turn the -1
   error code into true. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];
  assert (length < sizeof buffer);
  return read_whole_strings (r, buffer, length) == 1;
}
2989 /* Helpers for reading records that contain structured text. */
2992 /* Maximum number of warnings to issue for a single text record. */
2994 #define MAX_TEXT_WARNINGS 5
/* Parsing state for one text extension record; set up by
   open_text_record() and advanced by the text_*() helpers. */
2999 struct substring buffer; /* Record contents. */
3000 off_t start; /* Starting offset in file (the record's position). */
3001 size_t pos; /* Current position in buffer. */
3002 int n_warnings; /* Number of warnings issued or suppressed. */
3003 bool recoded; /* Recoded into UTF-8? */
/* Creates and returns a text_record, allocated from R's pool, for parsing
   the data of RECORD as text.  When RECODE_TO_UTF8 is true the record's
   bytes are passed through recode_substring_pool() with "UTF-8" and R's
   encoding.  The warning count starts at zero. */
3006 static struct text_record *
3007 open_text_record (struct sfm_reader *r,
3008 const struct sfm_extension_record *record,
3009 bool recode_to_utf8)
3011 struct text_record *text;
3012 struct substring raw;
3014 text = pool_alloc (r->pool, sizeof *text);
/* The record's payload is size * count bytes long. */
3015 raw = ss_buffer (record->data, record->size * record->count);
3016 text->start = record->pos;
3017 text->buffer = (recode_to_utf8
3018 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
3021 text->n_warnings = 0;
3022 text->recoded = recode_to_utf8;
3027 /* Closes TEXT, frees its storage, and issues a final warning
3028 about suppressed warnings if necessary.

   text_warn() stops printing after MAX_TEXT_WARNINGS warnings but keeps
   counting, so the excess is the number that was suppressed. */
3030 close_text_record (struct sfm_reader *r, struct text_record *text)
3032 if (text->n_warnings > MAX_TEXT_WARNINGS)
3033 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
3034 text->n_warnings - MAX_TEXT_WARNINGS);
3036 pool_free (r->pool, ss_data (text->buffer));
3039 /* Reads a variable=value pair from TEXT.
3040 Looks up the variable in DICT and stores it into *VAR.
3041 Stores a null-terminated value into *VALUE.

   The variable name is read as a short name ended by '='; the value runs up
   to the next tab or null byte. */
3043 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
3044 struct text_record *text,
3045 struct variable **var, char **value)
3049 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
3052 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
/* Step over any run of tab and null bytes that follows the value, so that
   the next pair starts at a name. */
3056 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
3057 ss_buffer ("\t\0", 2));
/* Reads a variable name from TEXT, ended by one of DELIMITERS, looks it up
   in DICT, and stores the result in *VAR.  A name that DICT does not know
   is reported through text_warn(), which rate-limits warnings per text
   record. */
3065 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3066 struct text_record *text, struct substring delimiters,
3067 struct variable **var)
3071 name = text_get_token (text, delimiters, NULL);
3075 *var = dict_lookup_var (dict, name);
3079 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads a short variable name from TEXT, ended by one of DELIMITERS, looks
   it up in DICT, and stores the result in *VAR.  The case where no token is
   left in TEXT is handled separately from the case of a name that DICT does
   not know, which is reported through text_warn(). */
3086 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3087 struct text_record *text, struct substring delimiters,
3088 struct variable **var)
3090 char *short_name = text_get_token (text, delimiters, NULL);
3091 if (short_name == NULL)
3094 *var = dict_lookup_var (dict, short_name);
3096 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3101 /* Displays a warning for the current file position, limiting the
3102 number to MAX_TEXT_WARNINGS for TEXT.

   The position reported is TEXT's starting file offset plus its current
   position.  FORMAT and the following arguments are printf-style. */
3104 text_warn (struct sfm_reader *r, struct text_record *text,
3105 const char *format, ...)
/* The counter is bumped even when the warning is suppressed;
   close_text_record() uses it to report how many were dropped. */
3107 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3111 va_start (args, format);
3112 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts the next token from TEXT, where tokens are separated by any of
   the bytes in DELIMITERS, advancing TEXT's position past it.  Returns a
   pointer to the token inside TEXT's own buffer.  A non-null DELIMITER
   pointer requests extra output about the token's end (END points just past
   the token's last byte). */
3118 text_get_token (struct text_record *text, struct substring delimiters,
3121 struct substring token;
3124 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3127 end = &ss_data (token)[ss_length (token)];
3128 if (delimiter != NULL)
3131 return ss_data (token);
3134 /* Reads a integer value expressed in decimal, then a space, then a string that
3135 consists of exactly as many bytes as specified by the integer, then a space,
3136 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3137 buffer (so the caller should not free the string). */
3139 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3147 while (text->pos < text->buffer.length)
3149 int c = text->buffer.string[text->pos];
3150 if (c < '0' || c > '9')
3152 n = (n * 10) + (c - '0');
3155 if (text->pos >= text->buffer.length || start == text->pos)
3157 sys_warn (r, text->start,
3158 _("Expecting digit at offset %zu in MRSETS record."),
3163 if (!text_match (text, ' '))
3165 sys_warn (r, text->start,
3166 _("Expecting space at offset %zu in MRSETS record."),
3171 if (text->pos + n > text->buffer.length)
3173 sys_warn (r, text->start,
3174 _("%zu-byte string starting at offset %zu "
3175 "exceeds record length %zu."),
3176 n, text->pos, text->buffer.length);
3180 s = &text->buffer.string[text->pos];
3183 sys_warn (r, text->start,
3184 _("Expecting space at offset %zu following %zu-byte string."),
/* Tests whether the byte at TEXT's current position is C.  The end of the
   buffer is checked first, so that nothing past the last byte is ever
   examined. */
3194 text_match (struct text_record *text, char c)
3196 if (text->pos >= text->buffer.length)
3199 if (text->buffer.string[text->pos] == c)
3208 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3209 inside the TEXT's string.  For a recoded record this offset therefore
   refers to the converted text and need not match an offset in the record as
   stored in the file. */
3211 text_pos (const struct text_record *text)
/* Returns a pointer to the start of TEXT's whole buffer, regardless of the
   current parse position.  The storage remains owned by TEXT. */
3217 text_get_all (const struct text_record *text)
3219 return text->buffer.string;
3224 /* Displays a corruption message.

   The message text is built from the name of R's file, optionally followed
   by "near offset" and OFFSET in hexadecimal, and then FORMAT expanded with
   ARGS.  CLASS determines both the category and the severity of the
   message. */
3226 sys_msg (struct sfm_reader *r, off_t offset,
3227 int class, const char *format, va_list args)
3231 ds_init_empty (&text);
/* Two prefixes exist: one that includes the file offset and one that names
   the file only. */
3233 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3234 fh_get_file_name (r->fh), (long long int) offset);
3236 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3237 ds_put_vformat (&text, format, args);
3240 .category = msg_class_to_category (class),
3241 .severity = msg_class_to_severity (class),
3242 .text = ds_cstr (&text),
3247 /* Displays a warning for offset OFFSET in the file.  FORMAT and the
   arguments after it are printf-style; they are forwarded to sys_msg() with
   message class MW. */
3249 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3253 va_start (args, format);
3254 sys_msg (r, offset, MW, format, args);
3258 /* Displays an error for offset OFFSET in the file and marks it as in an error state. */
/* FORMAT and the arguments after it are printf-style; they are forwarded to
   sys_msg() with message class ME. */
3261 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3265 va_start (args, format);
3266 sys_msg (r, offset, ME, format, args);
3272 /* Reads BYTE_CNT bytes into BUF.
3273 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3274 Returns -1 if an I/O error or a partial read occurs.
3275 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports an error. */
/* R's position is advanced by however many bytes fread() actually delivered,
   also on a short read, before the outcome is classified: a full read, a
   stream error (reported with the errno text), or end of file.  End of file
   counts as unexpected unless EOF_IS_OK is true and no bytes at all were
   read. */
3278 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3279 void *buf, size_t byte_cnt)
3281 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3282 r->pos += bytes_read;
3283 if (bytes_read == byte_cnt)
3285 else if (ferror (r->file))
3287 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3290 else if (!eof_is_ok || bytes_read != 0)
3292 sys_error (r, r->pos, _("Unexpected end of file."));
3299 /* Reads BYTE_CNT into BUF.
3300 Returns true if successful.
3301 Returns false upon I/O error or if end-of-file is encountered.

   End of file is never acceptable here (EOF_IS_OK is passed as false), so
   every failure has already been reported by read_bytes_internal(). */
3303 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3305 return read_bytes_internal (r, false, buf, byte_cnt) == 1;
3308 /* Reads BYTE_CNT bytes into BUF.
3309 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3310 Returns 0 if an immediate end-of-file is encountered.
3311 Returns -1 if an I/O error or a partial read occurs.

   Note that -1 is nonzero: callers must compare the result against 1 and
   must not use it as a truth value. */
3313 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3315 return read_bytes_internal (r, true, buf, byte_cnt);
3318 /* Reads a 32-bit signed integer from R and stores its value in host format in
3319 *X. Returns true if successful, otherwise false.

   The raw bytes are decoded according to R's integer format. */
3321 read_int (struct sfm_reader *r, int *x)
3324 if (read_bytes (r, integer, sizeof integer) != 1)
3326 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Reads a 32-bit integer from R for storage through the unsigned pointer X.
   The value is read with read_int() into a signed temporary Y first. */
3331 read_uint (struct sfm_reader *r, unsigned int *x)
3336 ok = read_int (r, &y);
3341 /* Reads a 64-bit signed integer from R and stores its value in host format in *X. Returns true if successful, otherwise false. */
/* The raw bytes are decoded according to R's integer format and the result
   is stored in *X. */
3344 read_int64 (struct sfm_reader *r, long long int *x)
3347 if (read_bytes (r, integer, sizeof integer) != 1)
3349 *x = integer_get (r->integer_format, integer, sizeof integer);
3353 /* Reads a 64-bit unsigned integer from R and stores its value in host format in *X. Returns true if successful, otherwise false. */
/* The value is read with read_int64() into a signed temporary Y first, for
   storage through the unsigned pointer X. */
3356 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3361 ok = read_int64 (r, &y);
/* Returns the 4-byte integer located OFS bytes into DATA, decoded according
   to R's integer format.  No bounds checking is done here; the caller must
   have verified that OFS + 4 bytes are available. */
3367 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3369 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
/* Returns the floating-point number located OFS bytes into DATA, decoded
   according to R's float format.  No bounds checking is done here. */
3373 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3375 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3378 /* Reads exactly SIZE - 1 bytes into BUFFER
3379 and stores a null byte into BUFFER[SIZE - 1].

   SIZE is thus the size of BUFFER including the terminator and must be at
   least 1. */
3381 read_string (struct sfm_reader *r, char *buffer, size_t size)
3386 ok = read_bytes (r, buffer, size - 1);
3388 buffer[size - 1] = '\0';
3392 /* Skips BYTES bytes forward in R.

   The bytes are consumed by reading them into a scratch buffer in chunks of
   at most the buffer's size, rather than by seeking. */
3394 skip_bytes (struct sfm_reader *r, size_t bytes)
3399 size_t chunk = MIN (sizeof buffer, bytes);
3400 if (!read_bytes (r, buffer, chunk))
3408 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3409 been replaced by LFs.
3411 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3412 files that use CR-only line ends in the file label and extra product info.) */
/* The result is never longer than S, since each replacement keeps or shrinks
   the length, so strlen (s) + 1 bytes are always enough for the copy and its
   terminator. */
3415 fix_line_ends (const char *s)
3419 d = dst = xmalloc (strlen (s) + 1);
/* Forward declaration: read_zheader() calls read_ztrailer() before the
   latter's definition. */
3438 read_ztrailer (struct sfm_reader *r,
3439 long long int zheader_ofs,
3440 long long int ztrailer_len);
/* Allocation callback installed in R's z_stream by read_zheader().  POOL_ is
   the stream's opaque pointer, which is set to the reader's pool, so the
   memory is allocated from that pool.  Z_NULL is returned without allocating
   when SIZE is zero or when xalloc_oversized() rejects ITEMS * SIZE. */
3443 zalloc (voidpf pool_, uInt items, uInt size)
3445 struct pool *pool = pool_;
3447 return (!size || xalloc_oversized (items, size)
3449 : pool_malloc (pool, items * size));
/* Deallocation callback installed in R's z_stream by read_zheader(); the
   counterpart of zalloc().  Returns ADDRESS to the pool passed as POOL_. */
3453 zfree (voidpf pool_, voidpf address)
3455 struct pool *pool = pool_;
3457 pool_free (pool, address);
/* Reads and validates the ZLIB data header, which consists of three 64-bit
   integers: the header's own offset, the trailer's offset, and the trailer's
   length.  It then has the trailer checked by read_ztrailer(), allocates the
   inflate input and output buffers on first use, wires the pool-based
   allocator into R's z_stream, and opens the stream.

   The checks are: the recorded header offset must equal the position POS at
   which the header was found, the trailer must not start before the current
   position, and the trailer length must be a multiple of 24 that is at least
   24. */
3461 read_zheader (struct sfm_reader *r)
3464 long long int zheader_ofs;
3465 long long int ztrailer_ofs;
3466 long long int ztrailer_len;
3468 if (!read_int64 (r, &zheader_ofs)
3469 || !read_int64 (r, &ztrailer_ofs)
3470 || !read_int64 (r, &ztrailer_len))
3473 if (zheader_ofs != pos)
3475 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3476 "(expected %#llx)."),
3477 zheader_ofs, (long long int) pos);
3481 if (ztrailer_ofs < r->pos)
3483 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3488 if (ztrailer_len < 24 || ztrailer_len % 24)
3490 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3494 r->ztrailer_ofs = ztrailer_ofs;
3495 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
/* The buffers are allocated only once per reader; a later call reuses
   them. */
3498 if (r->zin_buf == NULL)
3500 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3501 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3502 r->zstream.next_in = NULL;
3503 r->zstream.avail_in = 0;
/* Route zlib's internal allocations through R's pool. */
3506 r->zstream.zalloc = zalloc;
3507 r->zstream.zfree = zfree;
3508 r->zstream.opaque = r->pool;
3510 return open_zstream (r);
/* Repositions R's file at absolute offset OFFSET.  A failure of fseeko() is
   reported through sys_error() together with the errno text. */
3514 seek (struct sfm_reader *r, off_t offset)
3516 if (fseeko (r->file, offset, SEEK_SET))
3517 sys_error (r, 0, _("%s: seek failed (%s)."),
3518 fh_get_file_name (r->fh), strerror (errno));
3522 /* Performs some additional consistency checks on the ZLIB compressed data trailer. */
/* ZHEADER_OFS is the file offset of the ZLIB data header and ZTRAILER_LEN is
   the trailer length taken from that header; the trailer's offset is read
   from R->ztrailer_ofs.

   The trailer is read as a fixed part (64-bit bias, 64-bit "zero" field,
   32-bit block size, 32-bit block count) followed by one descriptor per
   block (64-bit uncompressed offset, 64-bit compressed offset, 32-bit
   uncompressed size, 32-bit compressed size).  Mismatches are reported as
   errors, milder oddities only as warnings.

   The checks need to seek to the trailer and back, so they are skipped when
   the file is not a regular file. */
3525 read_ztrailer (struct sfm_reader *r,
3526 long long int zheader_ofs,
3527 long long int ztrailer_len)
3529 long long int expected_uncmp_ofs;
3530 long long int expected_cmp_ofs;
3533 unsigned int block_size;
3534 unsigned int n_blocks;
3538 if (fstat (fileno (r->file), &s))
3540 sys_error (r, 0, _("%s: stat failed (%s)."),
3541 fh_get_file_name (r->fh), strerror (errno));
3545 if (!S_ISREG (s.st_mode))
3547 /* We can't seek to the trailer and then back to the data in this file,
3548 so skip doing extra checks. */
/* The trailer is expected to be the last thing in the file. */
3552 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3553 sys_warn (r, r->pos,
3554 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3555 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3557 seek (r, r->ztrailer_ofs);
3559 /* Read fixed header from ZLIB data trailer. */
3560 if (!read_int64 (r, &bias))
/* The trailer's bias is compared, negated, against the reader's bias. */
3562 if (-bias != r->bias)
3564 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3565 "file header bias (%.2f)."),
3570 if (!read_int64 (r, &zero))
3573 sys_warn (r, r->pos,
3574 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3576 if (!read_uint (r, &block_size))
3578 if (block_size != ZBLOCK_SIZE)
3579 sys_warn (r, r->pos,
3580 _("ZLIB trailer specifies unexpected %u-byte block size."),
3583 if (!read_uint (r, &n_blocks))
/* The fixed part takes 24 bytes and so does each descriptor, hence the
   expected block count of (length - 24) / 24. */
3585 if (n_blocks != (ztrailer_len - 24) / 24)
3587 sys_error (r, r->pos,
3588 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3590 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
/* Running offsets that each descriptor must match: uncompressed offsets
   start at the header's offset, compressed data starts 24 bytes after
   it. */
3594 expected_uncmp_ofs = zheader_ofs;
3595 expected_cmp_ofs = zheader_ofs + 24;
3596 for (i = 0; i < n_blocks; i++)
3598 off_t desc_ofs = r->pos;
3599 unsigned long long int uncompressed_ofs;
3600 unsigned long long int compressed_ofs;
3601 unsigned int uncompressed_size;
3602 unsigned int compressed_size;
3604 if (!read_uint64 (r, &uncompressed_ofs)
3605 || !read_uint64 (r, &compressed_ofs)
3606 || !read_uint (r, &uncompressed_size)
3607 || !read_uint (r, &compressed_size))
3610 if (uncompressed_ofs != expected_uncmp_ofs)
3612 sys_error (r, desc_ofs,
3613 _("ZLIB block descriptor %u reported uncompressed data "
3614 "offset %#llx, when %#llx was expected."),
3615 i, uncompressed_ofs, expected_uncmp_ofs);
3619 if (compressed_ofs != expected_cmp_ofs)
3621 sys_error (r, desc_ofs,
3622 _("ZLIB block descriptor %u reported compressed data "
3623 "offset %#llx, when %#llx was expected."),
3624 i, compressed_ofs, expected_cmp_ofs);
/* Every block except the last must be exactly BLOCK_SIZE bytes when
   uncompressed; the last one may be shorter but not longer. */
3628 if (i < n_blocks - 1)
3630 if (uncompressed_size != block_size)
3631 sys_warn (r, desc_ofs,
3632 _("ZLIB block descriptor %u reported block size %#x, "
3633 "when %#x was expected."),
3634 i, uncompressed_size, block_size);
3638 if (uncompressed_size > block_size)
3639 sys_warn (r, desc_ofs,
3640 _("ZLIB block descriptor %u reported block size %#x, "
3641 "when at most %#x was expected."),
3642 i, uncompressed_size, block_size);
3645 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3646 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3647 This code checks for an expansion of more than 14.3% plus 11 bytes. */
/* uncompressed_size / 7 is roughly 14.3% of the uncompressed size. */
3649 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3651 sys_error (r, desc_ofs,
3652 _("ZLIB block descriptor %u reports compressed size %u "
3653 "and uncompressed size %u."),
3654 i, compressed_size, uncompressed_size);
3658 expected_uncmp_ofs += uncompressed_size;
3659 expected_cmp_ofs += compressed_size;
/* After the last block the running compressed offset must land exactly on
   the trailer. */
3662 if (expected_cmp_ofs != r->ztrailer_ofs)
3664 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3665 "would be expected from block descriptors."),
3666 r->ztrailer_ofs, expected_cmp_ofs);
/* Go back to the start of the compressed data, 24 bytes past the start of
   the ZLIB header. */
3670 seek (r, zheader_ofs + 24);
/* Marks R's inflate output buffer as empty and initializes R's z_stream with
   inflateInit().  An initialization failure is reported through
   sys_error(). */
3675 open_zstream (struct sfm_reader *r)
3679 r->zout_pos = r->zout_end = 0;
3680 error = inflateInit (&r->zstream);
3683 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
/* Shuts down R's z_stream with inflateEnd().  A failure is reported through
   sys_error() as an inconsistency at the end of the stream. */
3691 close_zstream (struct sfm_reader *r)
3695 error = inflateEnd (&r->zstream);
3698 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
/* Reads BYTE_CNT bytes of inflated data from R into BUF_.

   Data is served from R->zout_buf between zout_pos and zout_end.  When that
   is exhausted, more compressed input is read from the file into R->zin_buf
   (never past the ZLIB trailer) and inflated into R->zout_buf.  When
   inflate() signals the end of a stream without producing output, the
   stream is closed and reopened. */
3706 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3708 uint8_t *buf = buf_;
3717 /* Use already inflated data if there is any. */
3718 if (r->zout_pos < r->zout_end)
3720 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3721 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3730 /* We need to inflate some more data.
3731 Get some more input data if we don't have any. */
3732 if (r->zstream.avail_in == 0)
/* Read at most one input buffer's worth, and stop at the start of the
   trailer. */
3734 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3739 int retval = try_read_bytes (r, r->zin_buf, n);
3742 r->zstream.avail_in = n;
3743 r->zstream.next_in = r->zin_buf;
3747 /* Inflate the (remaining) input data. */
3748 r->zstream.avail_out = ZOUT_BUF_SIZE;
3749 r->zstream.next_out = r->zout_buf;
3750 error = inflate (&r->zstream, Z_SYNC_FLUSH);
/* next_out has advanced by the number of bytes that inflate() produced. */
3752 r->zout_end = r->zstream.next_out - r->zout_buf;
3753 if (r->zout_end == 0)
3755 if (error != Z_STREAM_END)
3757 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3761 else if (!close_zstream (r) || !open_zstream (r))
3766 /* Process the output data and ignore 'error' for now. ZLIB will
3767 present it to us again on the next inflate() call. */
/* Reads BYTE_CNT bytes of compressed-case data into BUF, treating end of
   data as an error.  With simple compression the bytes come straight from
   the file via read_bytes(); otherwise they come from the inflated ZLIB
   stream, and the ZLIB reader's result is passed on unchanged.

   Callers should compare the result against 1 and not use it as a truth
   value, since failure is not necessarily reported as 0. */
3773 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3775 if (r->compression == ANY_COMP_SIMPLE)
3776 return read_bytes (r, buf, byte_cnt);
3779 int retval = read_bytes_zlib (r, buf, byte_cnt);
3781 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
/* Like read_compressed_bytes(), except that running out of data is not
   reported as an error here: with simple compression the read goes through
   try_read_bytes(), otherwise the ZLIB reader's result is returned as is. */
3787 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3789 if (r->compression == ANY_COMP_SIMPLE)
3790 return try_read_bytes (r, buf, byte_cnt);
3792 return read_bytes_zlib (r, buf, byte_cnt);
3795 /* Reads a 64-bit floating-point number from R and returns its
3796 value in host format. */
3798 read_compressed_float (struct sfm_reader *r, double *d)
3802 if (!read_compressed_bytes (r, number, sizeof number))
3805 *d = float_get_double (r->float_format, number);
/* Casereader class for system files; its first two members are the read and
   destroy callbacks. */
3809 static const struct casereader_class sys_file_casereader_class =
3811 sys_file_casereader_read,
3812 sys_file_casereader_destroy,
3817 const struct any_reader_class sys_file_reader_class =
3819 N_("SPSS System File"),