1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016, 2021 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/ll.h"
49 #include "libpspp/message.h"
50 #include "libpspp/misc.h"
51 #include "libpspp/pool.h"
52 #include "libpspp/str.h"
53 #include "libpspp/stringi-set.h"
55 #include "gl/c-strtod.h"
56 #include "gl/c-ctype.h"
57 #include "gl/inttostr.h"
58 #include "gl/localcharset.h"
59 #include "gl/minmax.h"
60 #include "gl/unlocked-io.h"
61 #include "gl/xalloc.h"
62 #include "gl/xalloc-oversized.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) (msgid)
71 /* subtypes 0-2 unknown */
72 EXT_INTEGER = 3, /* Machine integer info. */
73 EXT_FLOAT = 4, /* Machine floating-point info. */
74 EXT_VAR_SETS = 5, /* Variable sets. */
75 EXT_DATE = 6, /* DATE. */
76 EXT_MRSETS = 7, /* Multiple response sets. */
77 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
78 /* subtype 9 unknown */
79 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
80 EXT_DISPLAY = 11, /* Variable display parameters. */
81 /* subtype 12 unknown */
82 EXT_LONG_NAMES = 13, /* Long variable names. */
83 EXT_LONG_STRINGS = 14, /* Long strings. */
84 /* subtype 15 unknown */
85 EXT_NCASES = 16, /* Extended number of cases. */
86 EXT_FILE_ATTRS = 17, /* Data file attributes. */
87 EXT_VAR_ATTRS = 18, /* Variable attributes. */
88 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
89 EXT_ENCODING = 20, /* Character encoding. */
90 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
91 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
92 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
95 /* Fields from the top-level header record. */
96 struct sfm_header_record
98 char magic[5]; /* First 4 bytes of file, then null. */
99 int weight_idx; /* 0 if unweighted, otherwise a var index. */
100 int nominal_case_size; /* Number of var positions. */
102 /* These correspond to the members of struct any_read_info or a dictionary
103 but in the system file's encoding rather than ASCII. */
104 char creation_date[10]; /* "dd mmm yy". */
105 char creation_time[9]; /* "hh:mm:ss". */
106 char eye_catcher[61]; /* Eye-catcher string, then product name. */
107 char file_label[65]; /* File label. */
110 struct sfm_var_record
117 int missing_value_code;
120 struct variable *var;
123 struct sfm_value_label
129 struct sfm_value_label_record
132 struct sfm_value_label *labels;
133 unsigned int n_labels;
139 struct sfm_document_record
148 const char *name; /* Name. */
149 const char *label; /* Human-readable label for group. */
150 enum mrset_type type; /* Group type. */
151 const char **vars; /* Constituent variables' names. */
152 size_t n_vars; /* Number of constituent variables. */
155 enum mrset_md_cat_source cat_source; /* Source of category labels. */
156 bool label_from_var_label; /* 'label' taken from variable label? */
157 const char *counted; /* Counted value, as string. */
160 struct sfm_extension_record
162 struct ll ll; /* In struct sfm_reader 'var_attrs' list. */
163 int subtype; /* Record subtype. */
164 off_t pos; /* Starting offset in file. */
165 unsigned int size; /* Size of data elements. */
166 unsigned int count; /* Number of data elements. */
167 void *data; /* Contents. */
170 /* System file reader. */
173 struct any_reader any_reader;
175 /* Resource tracking. */
176 struct pool *pool; /* All system file state. */
179 struct any_read_info info;
180 struct sfm_header_record header;
181 struct sfm_var_record *vars;
183 struct sfm_value_label_record *labels;
185 struct sfm_document_record *document;
186 struct sfm_mrset *mrsets;
188 struct sfm_extension_record *extensions[32];
189 struct ll_list var_attrs; /* Contains "struct sfm_extension_record"s. */
192 struct file_handle *fh; /* File handle. */
193 struct fh_lock *lock; /* Mutual exclusion for file handle. */
194 FILE *file; /* File stream. */
195 off_t pos; /* Position in file. */
196 bool error; /* I/O or corruption error? */
197 struct caseproto *proto; /* Format of output cases. */
200 enum integer_format integer_format; /* On-disk integer format. */
201 enum float_format float_format; /* On-disk floating point format. */
202 struct sfm_var *sfm_vars; /* Variables. */
203 size_t sfm_n_vars; /* Number of variables. */
204 int n_cases; /* Number of cases */
205 const char *encoding; /* String encoding. */
206 bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */
209 enum any_compression compression;
210 double bias; /* Compression bias, usually 100.0. */
211 uint8_t opcodes[8]; /* Current block of opcodes. */
212 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
213 bool corruption_warning; /* Warned about possible corruption? */
215 /* ZLIB decompression. */
216 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
217 #define ZIN_BUF_SIZE 4096
218 uint8_t *zin_buf; /* Inflation input buffer. */
219 #define ZOUT_BUF_SIZE 16384
220 uint8_t *zout_buf; /* Inflation output buffer. */
221 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
222 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
223 z_stream zstream; /* ZLIB inflater. */
226 static const struct casereader_class sys_file_casereader_class;
228 static struct sfm_reader *
229 sfm_reader_cast (const struct any_reader *r_)
231 assert (r_->klass == &sys_file_reader_class);
232 return UP_CAST (r_, struct sfm_reader, any_reader);
235 static bool sfm_close (struct any_reader *);
237 static void sys_msg (struct sfm_reader *r, off_t, int class,
238 const char *format, va_list args)
239 PRINTF_FORMAT (4, 0);
240 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
241 PRINTF_FORMAT (3, 4);
242 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
243 PRINTF_FORMAT (3, 4);
245 static bool read_bytes (struct sfm_reader *, void *, size_t)
247 static int try_read_bytes (struct sfm_reader *, void *, size_t)
249 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
250 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
251 static bool read_int64 (struct sfm_reader *, long long int *)
253 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
255 static bool read_string (struct sfm_reader *, char *, size_t)
257 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
259 /* ZLIB compressed data handling. */
260 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
261 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
262 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
263 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
265 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
267 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
269 static bool read_compressed_float (struct sfm_reader *, double *)
272 static char *fix_line_ends (const char *);
274 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
275 static double parse_float (const struct sfm_reader *,
276 const void *data, size_t ofs);
278 static bool read_variable_record (struct sfm_reader *,
279 struct sfm_var_record *);
280 static bool read_value_label_record (struct sfm_reader *,
281 struct sfm_value_label_record *);
282 static bool read_document_record (struct sfm_reader *);
283 static bool read_extension_record (struct sfm_reader *, int subtype,
284 struct sfm_extension_record **);
285 static bool skip_extension_record (struct sfm_reader *, int subtype);
287 static struct text_record *open_text_record (
288 struct sfm_reader *, const struct sfm_extension_record *,
289 bool recode_to_utf8);
290 static void close_text_record (struct sfm_reader *,
291 struct text_record *);
292 static bool read_variable_to_value_pair (struct sfm_reader *,
294 struct text_record *,
295 struct variable **var, char **value);
296 static void text_warn (struct sfm_reader *r, struct text_record *text,
297 const char *format, ...) PRINTF_FORMAT (3, 4);
298 static char *text_get_token (struct text_record *,
299 struct substring delimiters, char *delimiter);
300 static bool text_match (struct text_record *, char c);
301 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
302 struct text_record *,
303 struct substring delimiters,
305 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
306 struct text_record *,
307 struct substring delimiters,
309 static const char *text_parse_counted_string (struct sfm_reader *,
310 struct text_record *);
311 static size_t text_pos (const struct text_record *);
312 static const char *text_get_all (const struct text_record *);
314 /* Dictionary reader. */
322 static bool read_dictionary (struct sfm_reader *);
323 static bool read_record (struct sfm_reader *, int type,
324 size_t *allocated_vars, size_t *allocated_labels);
325 static bool read_header (struct sfm_reader *, struct any_read_info *,
326 struct sfm_header_record *);
327 static void parse_header (struct sfm_reader *,
328 const struct sfm_header_record *,
329 struct any_read_info *, struct dictionary *);
330 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
331 struct sfm_var_record *, size_t n);
332 static void parse_format_spec (struct sfm_reader *, off_t pos,
333 unsigned int format, enum which_format,
334 struct variable *, int *format_n_warnings);
335 static void parse_document (struct dictionary *, struct sfm_document_record *);
336 static void parse_display_parameters (struct sfm_reader *,
337 const struct sfm_extension_record *,
338 struct dictionary *);
339 static bool parse_machine_integer_info (struct sfm_reader *,
340 const struct sfm_extension_record *,
341 struct any_read_info *);
342 static void parse_machine_float_info (struct sfm_reader *,
343 const struct sfm_extension_record *);
344 static void parse_extra_product_info (struct sfm_reader *,
345 const struct sfm_extension_record *,
346 struct any_read_info *);
347 static void parse_mrsets (struct sfm_reader *,
348 const struct sfm_extension_record *,
349 size_t *allocated_mrsets);
350 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
351 static void parse_long_var_name_map (struct sfm_reader *,
352 const struct sfm_extension_record *,
353 struct dictionary *);
354 static bool parse_long_string_map (struct sfm_reader *,
355 const struct sfm_extension_record *,
356 struct dictionary *);
357 static void parse_value_labels (struct sfm_reader *, struct dictionary *);
358 static struct variable *parse_weight_var (struct sfm_reader *,
359 const struct sfm_var_record *, size_t n_var_recs,
361 static void parse_data_file_attributes (struct sfm_reader *,
362 const struct sfm_extension_record *,
363 struct dictionary *);
364 static void parse_variable_attributes (struct sfm_reader *,
365 const struct sfm_extension_record *,
366 struct dictionary *);
367 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
368 static void parse_long_string_value_labels (struct sfm_reader *,
369 const struct sfm_extension_record *,
370 struct dictionary *);
371 static void parse_long_string_missing_values (
372 struct sfm_reader *, const struct sfm_extension_record *,
373 struct dictionary *);
375 /* Frees the strings inside INFO. */
377 any_read_info_destroy (struct any_read_info *info)
381 free (info->creation_date);
382 free (info->creation_time);
383 free (info->product);
384 free (info->product_ext);
388 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
389 successful, otherwise NULL. */
390 static struct any_reader *
391 sfm_open (struct file_handle *fh)
393 size_t allocated_mrsets = 0;
395 /* Create and initialize reader. */
396 struct sfm_reader *r = XZALLOC (struct sfm_reader);
397 r->any_reader.klass = &sys_file_reader_class;
398 r->pool = pool_create ();
/* Hand R itself to its own pool, with free() as the function to apply to it.
   NOTE(review): presumably this is what lets pool_destroy() in sfm_close()
   release R along with everything else; confirm against libpspp/pool.h. */
399 pool_register (r->pool, free, r);
/* Start with the opcode block marked as used up: an index equal to the size
   of opcodes[] means there is no opcode left to interpret. */
401 r->opcode_idx = sizeof r->opcodes;
402 ll_init (&r->var_attrs);
404 /* TRANSLATORS: this fragment will be interpolated into
405 messages in fh_lock() that identify types of files. */
406 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
410 r->file = fn_open (fh, "rb");
413 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
414 fh_get_file_name (r->fh), strerror (errno));
418 if (!read_dictionary (r))
/* Both kinds of multiple response set record, when present, are parsed here,
   sharing allocated_mrsets between the two calls. */
421 if (r->extensions[EXT_MRSETS] != NULL)
422 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
424 if (r->extensions[EXT_MRSETS2] != NULL)
425 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
427 return &r->any_reader;
/* NOTE(review): this looks like the failure path, closing the partially set
   up reader; the label and return statement are not visible in this
   excerpt. */
431 sfm_close (&r->any_reader);
436 read_dictionary (struct sfm_reader *r)
438 size_t allocated_vars;
439 size_t allocated_labels;
441 if (!read_header (r, &r->info, &r->header))
445 allocated_labels = 0;
450 if (!read_int (r, &type))
454 if (!read_record (r, type, &allocated_vars, &allocated_labels))
458 if (!skip_bytes (r, 4))
461 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
468 read_record (struct sfm_reader *r, int type,
469 size_t *allocated_vars, size_t *allocated_labels)
/* Variable record: enlarge r->vars when it is full, then read the record
   into the next free element. */
476 if (r->n_vars >= *allocated_vars)
477 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
479 return read_variable_record (r, &r->vars[r->n_vars++]);
/* Value label record: same growth scheme, applied to r->labels. */
482 if (r->n_labels >= *allocated_labels)
483 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
485 return read_value_label_record (r, &r->labels[r->n_labels++]);
488 /* A Type 4 record is always immediately after a type 3 record,
489 so the code for type 3 records reads the type 4 record too. */
490 sys_error (r, r->pos, _("Misplaced type 4 record."));
/* A repeated document record only draws a warning; it is still read. */
494 if (r->document != NULL)
495 sys_warn (r, r->pos, _("Duplicate type 6 (document) record."));
496 return read_document_record (r);
/* Extension record: the subtype doubles as the index into r->extensions[],
   so it must lie within the bounds of that array. */
499 if (!read_int (r, &subtype))
502 || subtype >= sizeof r->extensions / sizeof *r->extensions)
505 _("Unrecognized record type 7, subtype %d. For help, "
506 "please send this file to %s and mention that you were "
508 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
/* An out-of-range subtype is stepped over rather than stored. */
509 return skip_extension_record (r, subtype);
511 else if (subtype == 18)
513 /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin"
514 put each variable attribute into a separate record with subtype
515 18. I'm surprised that SPSS puts up with this. */
516 struct sfm_extension_record *ext;
517 bool ok = read_extension_record (r, subtype, &ext);
/* Subtype 18 records are queued on the var_attrs list instead of occupying
   the single r->extensions[] slot, so any number of them can be kept. */
519 ll_push_tail (&r->var_attrs, &ext->ll);
522 else if (r->extensions[subtype] != NULL)
525 _("Record type 7, subtype %d found here has the same "
526 "type as the record found near offset 0x%llx. For "
527 "help, please send this file to %s and mention that "
528 "you were using %s."),
529 subtype, (long long int) r->extensions[subtype]->pos,
530 PACKAGE_BUGREPORT, PACKAGE_STRING);
/* The slot is already taken: keep the earlier record, skip this one. */
531 return skip_extension_record (r, subtype);
/* First record of this subtype: read it straight into its slot. */
534 return read_extension_record (r, subtype, &r->extensions[subtype]);
537 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
544 /* Returns the character encoding obtained from R, or a null pointer if R
545 doesn't have an indication of its character encoding. */
547 sfm_get_encoding (const struct sfm_reader *r)
549 /* The EXT_ENCODING record is the best way to determine dictionary
551 if (r->extensions[EXT_ENCODING])
552 return r->extensions[EXT_ENCODING]->data;
554 /* But EXT_INTEGER is better than nothing as a fallback. */
555 if (r->extensions[EXT_INTEGER])
557 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
558 const char *encoding;
567 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
568 respectively. However, many files have character code 2 but data
569 which are clearly not ASCII. Therefore, ignore these values. */
576 encoding = sys_get_encoding_from_codepage (codepage);
577 if (encoding != NULL)
583 /* If the file magic number is EBCDIC then its character data is too. */
584 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
590 struct get_strings_aux
601 add_string__ (struct get_strings_aux *aux,
602 const char *string, bool id, char *title)
/* titles, strings and ids are parallel arrays, so all three are enlarged
   together.  2 * (allocated + 1) gives a nonzero capacity even when
   allocated is 0. */
604 if (aux->n >= aux->allocated)
606 aux->allocated = 2 * (aux->allocated + 1);
607 aux->titles = pool_realloc (aux->pool, aux->titles,
608 aux->allocated * sizeof *aux->titles);
609 aux->strings = pool_realloc (aux->pool, aux->strings,
610 aux->allocated * sizeof *aux->strings);
611 aux->ids = pool_realloc (aux->pool, aux->ids,
612 aux->allocated * sizeof *aux->ids);
/* TITLE is stored as passed in (the pointer is kept, not copied), whereas
   STRING is duplicated into the pool. */
615 aux->titles[aux->n] = title;
616 aux->strings[aux->n] = pool_strdup (aux->pool, string);
617 aux->ids[aux->n] = id;
621 static void PRINTF_FORMAT (3, 4)
622 add_string (struct get_strings_aux *aux,
623 const char *string, const char *title, ...)
627 va_start (args, title);
628 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
632 static void PRINTF_FORMAT (3, 4)
633 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
637 va_start (args, title);
638 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
643 skip_prefix (const char *s, const char *prefix)
645 size_t prefix_len = strlen (prefix);
/* When S begins with PREFIX, return the position just past it; otherwise
   return S itself, unchanged. */
646 return !strncmp (s, prefix, prefix_len) ? s + prefix_len : s;
649 /* Retrieves significant string data from R in its raw format, to allow the
650 caller to try to detect the encoding in use.
652 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
653 and *STRINGSP to an array of N elements allocated from POOL. For each I in
654 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
655 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
656 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
659 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
660 char ***titlesp, bool **idsp, char ***stringsp)
662 struct sfm_reader *r = sfm_reader_cast (r_);
663 const struct sfm_mrset *mrset;
664 struct get_strings_aux aux;
676 for (i = 0; i < r->n_vars; i++)
677 if (r->vars[i].width != -1)
678 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
681 for (i = 0; i < r->n_vars; i++)
682 if (r->vars[i].width != -1)
685 if (r->vars[i].label)
686 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
691 for (i = 0; i < r->n_labels; i++)
692 for (j = 0; j < r->labels[i].n_labels; j++)
693 add_string (&aux, r->labels[i].labels[j].label,
694 _("Value Label %zu"), k++);
696 add_string (&aux, r->header.creation_date, _("Creation Date"));
697 add_string (&aux, r->header.creation_time, _("Creation Time"));
698 add_string (&aux, skip_prefix (r->header.eye_catcher, "@(#) "), _("Product"));
699 add_string (&aux, r->header.file_label, _("File Label"));
701 if (r->extensions[EXT_PRODUCT_INFO])
702 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
703 _("Extra Product Info"));
709 for (i = 0; i < r->document->n_lines; i++)
713 memcpy (line, r->document->documents + i * 80, 80);
716 add_string (&aux, line, _("Document Line %zu"), i + 1);
720 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
722 size_t mrset_idx = mrset - r->mrsets + 1;
724 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
726 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
728 /* Skip the variables because they ought to be duplicates. */
731 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
735 /* data file attributes */
736 /* variable attributes */
738 /* long string value labels */
739 /* long string missing values */
741 *titlesp = aux.titles;
743 *stringsp = aux.strings;
747 /* Decodes the dictionary read from R, saving it into *DICT. Character
748 strings in R are decoded using ENCODING, or an encoding obtained from R if
749 ENCODING is null, or the locale encoding if R specifies no encoding.
751 If INFOP is non-null, then it receives additional info about the system
752 file, which the caller must eventually free with any_read_info_destroy()
753 when it is no longer needed.
755 This function consumes R. The caller must not use it again later, even to
756 destroy it with sfm_close(). */
757 static struct casereader *
758 sfm_decode (struct any_reader *r_, const char *encoding,
759 struct dictionary **dictp, struct any_read_info *infop)
761 struct sfm_reader *r = sfm_reader_cast (r_);
762 struct dictionary *dict;
764 if (encoding == NULL)
766 encoding = sfm_get_encoding (r);
767 if (encoding == NULL)
769 sys_warn (r, -1, _("This system file does not indicate its own "
770 "character encoding. Using default encoding "
771 "%s. For best results, specify an encoding "
772 "explicitly. Use SYSFILE INFO with "
773 "ENCODING=\"DETECT\" to analyze the possible "
776 encoding = locale_charset ();
780 dict = dict_create (encoding);
781 r->encoding = dict_get_encoding (dict);
783 /* These records don't use variables at all. */
784 if (r->document != NULL)
785 parse_document (dict, r->document);
787 if (r->extensions[EXT_INTEGER] != NULL
788 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
791 if (r->extensions[EXT_FLOAT] != NULL)
792 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
794 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
795 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
797 if (r->extensions[EXT_FILE_ATTRS] != NULL)
798 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
800 parse_header (r, &r->header, &r->info, dict);
802 /* Parse the variable records, the basis of almost everything else. */
803 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
806 /* Parse value labels and the weight variable immediately after the variable
807 records. These records use indexes into var_recs[], so we must parse them
808 before those indexes become invalidated by very long string variables. */
809 parse_value_labels (r, dict);
810 if (r->header.weight_idx != 0)
811 dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars,
812 r->header.weight_idx));
814 if (r->extensions[EXT_DISPLAY] != NULL)
815 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
817 /* The following records use short names, so they need to be parsed before
818 parse_long_var_name_map() changes short names to long names. */
819 decode_mrsets (r, dict);
821 if (r->extensions[EXT_LONG_STRINGS] != NULL
822 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
825 /* Now rename variables to their long names. */
826 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
828 /* The following records use long names, so they need to follow renaming. */
829 if (!ll_is_empty (&r->var_attrs))
831 struct sfm_extension_record *ext;
832 ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs)
833 parse_variable_attributes (r, ext, dict);
835 /* Roles use the $@Role attribute. */
836 assign_variable_roles (r, dict);
838 if (r->extensions[EXT_LONG_LABELS] != NULL)
839 parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
840 if (r->extensions[EXT_LONG_MISSING] != NULL)
841 parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
844 /* Warn if the actual amount of data per case differs from the
845 amount that the header claims. SPSS version 13 gets this
846 wrong when very long strings are involved, so don't warn in
848 if (r->header.nominal_case_size > 0
849 && r->header.nominal_case_size != r->n_vars
850 && r->info.version_major != 13)
851 sys_warn (r, -1, _("File header claims %d variable positions but "
852 "%zu were read from file."),
853 r->header.nominal_case_size, r->n_vars);
855 /* Create an index of dictionary variable widths for
856 sfm_read_case to use. We cannot use the `struct variable's
857 from the dictionary we created, because the caller owns the
858 dictionary and may destroy or modify its variables. */
859 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_n_vars);
860 pool_register (r->pool, free, r->sfm_vars);
861 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
867 memset (&r->info, 0, sizeof r->info);
870 return casereader_create_sequential
871 (NULL, r->proto, r->n_cases == -1 ? CASENUMBER_MAX : r->n_cases,
872 &sys_file_casereader_class, r);
881 /* Closes R, which should have been returned by sfm_open() but not already
882 closed with sfm_decode() or this function.
883 Returns true if an I/O error has occurred on READER, false
886 sfm_close (struct any_reader *r_)
888 struct sfm_reader *r = sfm_reader_cast (r_);
893 if (fn_close (r->fh, r->file) == EOF)
895 msg (ME, _("Error closing system file `%s': %s."),
896 fh_get_file_name (r->fh), strerror (errno));
902 any_read_info_destroy (&r->info);
907 pool_destroy (r->pool);
912 /* Destroys READER. */
914 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
916 struct sfm_reader *r = r_;
917 sfm_close (&r->any_reader);
920 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
921 a negative errno value if there is an error reading FILE. */
923 sfm_detect (FILE *file)
/* The magic number is looked for in the first 4 bytes, so always start from
   the beginning of FILE whatever its current position. */
927 if (fseek (file, 0, SEEK_SET) != 0)
/* A short read with no stream error means FILE is simply too short to hold a
   magic number: that is the answer "no" (0), not an error. */
929 if (fread (magic, 4, 1, file) != 1)
930 return ferror (file) ? -errno : 0;
/* Any of the three known magic strings identifies a system file.
   NOTE(review): strcmp() requires MAGIC to be null-terminated; the statement
   that terminates it is not visible in this excerpt. */
933 return (!strcmp (ASCII_MAGIC, magic)
934 || !strcmp (ASCII_ZMAGIC, magic)
935 || !strcmp (EBCDIC_MAGIC, magic));
938 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
939 except for the string fields in *INFO, which parse_header() will initialize
940 later once the file's encoding is known. */
942 read_header (struct sfm_reader *r, struct any_read_info *info,
943 struct sfm_header_record *header)
945 uint8_t raw_layout_code[4];
950 if (!read_string (r, header->magic, sizeof header->magic)
951 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
/* Remember whether the eye-catcher names ReadStat as the writing software. */
953 r->written_by_readstat = strstr (header->eye_catcher,
954 "https://github.com/WizardMac/ReadStat");
/* Only the three known magic strings are accepted; anything else is reported
   as not being a system file. */
956 if (!strcmp (ASCII_MAGIC, header->magic)
957 || !strcmp (EBCDIC_MAGIC, header->magic))
959 else if (!strcmp (ASCII_ZMAGIC, header->magic))
963 sys_error (r, 0, _("This is not an SPSS system file."));
967 /* Identify integer format. */
968 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
970 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
972 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
974 || (r->integer_format != INTEGER_MSB_FIRST
975 && r->integer_format != INTEGER_LSB_FIRST))
977 sys_error (r, 64, _("This is not an SPSS system file."));
981 if (!read_int (r, &header->nominal_case_size))
/* An implausible case size is recorded as -1.  sfm_decode() compares this
   field with the number of variable records only when it is positive. */
984 if (header->nominal_case_size < 0
985 || header->nominal_case_size > INT_MAX / 16)
986 header->nominal_case_size = -1;
988 if (!read_int (r, &compressed))
993 r->compression = ANY_COMP_NONE;
994 else if (compressed == 1)
995 r->compression = ANY_COMP_SIMPLE;
998 sys_error (r, 0, "System file header has invalid compression "
999 "value %d.", compressed);
/* NOTE(review): judging by the error message below, this branch handles
   files with the ZLIB magic number, for which 2 is the only accepted
   compression value; the enclosing condition is not visible here. */
1005 if (compressed == 2)
1006 r->compression = ANY_COMP_ZLIB;
1009 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1010 "compression value %d.", compressed);
1015 if (!read_int (r, &header->weight_idx))
1018 if (!read_int (r, &r->n_cases))
1020 if (r->n_cases > INT_MAX / 2)
1023 /* Identify floating-point format and obtain compression bias. */
1024 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1026 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1028 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1030 if (memcmp (raw_bias, zero_bias, 8))
1031 sys_warn (r, r->pos - 8,
1032 _("Compression bias is not the usual "
1033 "value of 100, or system file uses unrecognized "
1034 "floating-point format."));
1037 /* Some software is known to write all-zeros to this
1038 field. Such software also writes floating-point
1039 numbers in the format that we expect by default
1040 (it seems that all software most likely does, in
1041 reality), so don't warn in this case. */
/* The bias did not identify a format, so fall back to IEEE double in the
   same byte order as the integers. */
1044 if (r->integer_format == INTEGER_MSB_FIRST)
1045 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1047 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1049 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1051 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1052 || !read_string (r, header->creation_time, sizeof header->creation_time)
1053 || !read_string (r, header->file_label, sizeof header->file_label)
1054 || !skip_bytes (r, 3))
/* Copy the formats just identified into *INFO for the caller. */
1057 info->integer_format = r->integer_format;
1058 info->float_format = r->float_format;
1059 info->compression = r->compression;
1060 info->n_cases = r->n_cases;
1065 /* Reads a variable (type 2) record from R into RECORD. */
1067 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1069 int has_variable_label;
1071 memset (record, 0, sizeof *record);
/* Remember where the record starts; the diagnostics below cite this offset
   rather than the current read position. */
1073 record->pos = r->pos;
1074 if (!read_int (r, &record->width)
1075 || !read_int (r, &has_variable_label)
1076 || !read_int (r, &record->missing_value_code)
1077 || !read_int (r, &record->print_format)
1078 || !read_int (r, &record->write_format)
1079 || !read_string (r, record->name, sizeof record->name))
1082 if (has_variable_label == 1)
1084 enum { MAX_LABEL_LEN = 65536 };
1085 unsigned int len, read_len;
1087 if (!read_uint (r, &len))
1090 /* Read up to MAX_LABEL_LEN bytes of label. */
/* LEN comes from the file, so the amount actually stored is capped.  The
   buffer gets one extra byte.  NOTE(review): presumably for the null
   terminator that read_string() adds; confirm against read_string(). */
1091 read_len = MIN (MAX_LABEL_LEN, len);
1092 record->label = pool_malloc (r->pool, read_len + 1);
1093 if (!read_string (r, record->label, read_len + 1))
1096 /* Skip unread label bytes. */
1097 if (!skip_bytes (r, len - read_len))
1100 /* Skip label padding up to multiple of 4 bytes. */
1101 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1104 else if (has_variable_label != 0)
1106 sys_error (r, record->pos,
1107 _("Variable label indicator field is not 0 or 1."));
1111 /* Set missing values. */
1112 if (record->missing_value_code != 0)
1114 int code = record->missing_value_code;
/* Width 0 is validated against the numeric set of codes; the second check
   below, per its message, applies to string variables. */
1115 if (record->width == 0)
1117 if (code < -3 || code > 3 || code == -1)
1119 sys_error (r, record->pos,
1120 _("Numeric missing value indicator field is not "
1121 "-3, -2, 0, 1, 2, or 3."));
1127 if (code < 1 || code > 3)
1129 sys_error (r, record->pos,
1130 _("String missing value indicator field is not "
/* 8 bytes are read per missing value; only the magnitude of the code decides
   how many, whatever its sign. */
1136 if (!read_bytes (r, record->missing, 8 * abs (code)))
1143 /* Reads value labels from R into RECORD. */
1145 read_value_label_record (struct sfm_reader *r,
1146 struct sfm_value_label_record *record)
1151 /* Read type 3 record. */
1152 record->pos = r->pos;
1153 if (!read_uint (r, &record->n_labels))
/* Reject a count that is too large before it is used to size the labels
   array.  The error offset r->pos - 4 points back at the count just read. */
1155 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1157 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1161 record->labels = pool_nmalloc (r->pool, record->n_labels,
1162 sizeof *record->labels);
1163 for (i = 0; i < record->n_labels; i++)
1165 struct sfm_value_label *label = &record->labels[i];
1166 unsigned char label_len;
1169 if (!read_bytes (r, label->value, sizeof label->value))
1172 /* Read label length. */
1173 if (!read_bytes (r, &label_len, sizeof label_len))
/* The length byte and the label text together are padded to a multiple of
   8 bytes. */
1175 padded_len = ROUND_UP (label_len + 1, 8);
1177 /* Read label, padding. */
/* The length byte has already been consumed, so padded_len - 1 bytes of text
   and padding remain to be read. */
1178 label->label = pool_malloc (r->pool, padded_len + 1);
1179 if (!read_bytes (r, label->label, padded_len - 1))
/* Terminate the label at its real length, which drops the padding. */
1181 label->label[label_len] = '\0';
1184 /* Read record type of type 4 record. */
1185 if (!read_int (r, &type))
1189 sys_error (r, r->pos - 4,
1190 _("Variable index record (type 4) does not immediately "
1191 "follow value label record (type 3) as it should."));
1195 /* Read number of variables associated with value label from type 4
1197 if (!read_uint (r, &record->n_vars))
1199 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1201 sys_error (r, r->pos - 4,
1202 _("Number of variables associated with a value label (%u) "
1203 "is not between 1 and the number of variables (%zu)."),
1204 record->n_vars, r->n_vars);
1208 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1209 for (i = 0; i < record->n_vars; i++)
1210 if (!read_int (r, &record->vars[i]))
1216 /* Reads a document record from R. Returns true if successful, false on
/* Reads the number of document lines from R and, when it is in range,
   the document text itself.

   A count of zero is handled as its own case before the range check.  A
   negative count, or one of at least INT_MAX / DOC_LINE_LENGTH, is
   reported as an error; that upper bound keeps DOC_LINE_LENGTH * n_lines
   within the range of int.  Otherwise a record holding the position, the
   line count, and DOC_LINE_LENGTH * n_lines bytes of text is allocated
   from R->pool and stored in R->document. */
1219 read_document_record (struct sfm_reader *r)
1222 if (!read_int (r, &n_lines))
1224 else if (n_lines == 0)
1226 else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1228 sys_error (r, r->pos,
1229 _("Number of document lines (%d) "
1230 "must be greater than 0 and less than %d."),
1231 n_lines, INT_MAX / DOC_LINE_LENGTH);
1235 struct sfm_document_record *record;
1236 record = pool_malloc (r->pool, sizeof *record);
1237 record->pos = r->pos;
1238 record->n_lines = n_lines;
1239 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1240 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1243 r->document = record;
/* Reads the header of an extension record of the given SUBTYPE from R into
   RECORD.  Stores SUBTYPE and the current position in RECORD, then reads
   RECORD->size followed by RECORD->count.  Reports an error when the size
   is nonzero and the total byte count plus one reaches UINT_MAX. */
1248 read_extension_record_header (struct sfm_reader *r, int subtype,
1249 struct sfm_extension_record *record)
1251 record->subtype = subtype;
1252 record->pos = r->pos;
1253 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1256 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1257 allows an extra byte for a null terminator, used by some
1258 extension processing routines. */
1259 if (record->size != 0
1260 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1262 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1270 /* Reads an extension record from R into RECORD.

   SUBTYPE is looked up in a table of known subtypes.  Each table entry
   carries an expected element size and an expected count; a value of 0
   means that field is not checked, and an entry with both set to 0 marks
   a subtype that is ignored.  A mismatch against a nonzero expectation
   produces a warning.  Otherwise the record's COUNT * SIZE bytes are read
   into a buffer from R->pool that has one extra byte, set to '\0'.  A
   subtype that is not in the table produces a warning asking the user to
   report the file.  The final statement skips N_BYTES of data. */
1272 read_extension_record (struct sfm_reader *r, int subtype,
1273 struct sfm_extension_record **recordp)
1275 struct extension_record_type
/* The initializers below are used as { subtype, size, count }, judging by
   how type->subtype, type->size and type->count are compared further
   down. */
1282 static const struct extension_record_type types[] =
1284 /* Implemented record types. */
1285 { EXT_INTEGER, 4, 8 },
1286 { EXT_FLOAT, 8, 3 },
1287 { EXT_MRSETS, 1, 0 },
1288 { EXT_PRODUCT_INFO, 1, 0 },
1289 { EXT_DISPLAY, 4, 0 },
1290 { EXT_LONG_NAMES, 1, 0 },
1291 { EXT_LONG_STRINGS, 1, 0 },
1292 { EXT_NCASES, 8, 2 },
1293 { EXT_FILE_ATTRS, 1, 0 },
1294 { EXT_VAR_ATTRS, 1, 0 },
1295 { EXT_MRSETS2, 1, 0 },
1296 { EXT_ENCODING, 1, 0 },
1297 { EXT_LONG_LABELS, 1, 0 },
1298 { EXT_LONG_MISSING, 1, 0 },
1300 /* Ignored record types. */
1301 { EXT_VAR_SETS, 0, 0 },
1303 { EXT_DATA_ENTRY, 0, 0 },
1304 { EXT_DATAVIEW, 0, 0 },
1307 const struct extension_record_type *type;
1308 struct sfm_extension_record *record;
1312 record = pool_malloc (r->pool, sizeof *record);
1313 if (!read_extension_record_header (r, subtype, record))
/* The header reader has already checked that this product, plus one, stays
   below UINT_MAX. */
1315 n_bytes = record->count * record->size;
1317 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1318 if (subtype == type->subtype)
1320 if (type->size > 0 && record->size != type->size)
1321 sys_warn (r, record->pos,
1322 _("Record type 7, subtype %d has bad size %u "
1323 "(expected %d)."), subtype, record->size, type->size);
1324 else if (type->count > 0 && record->count != type->count)
1325 sys_warn (r, record->pos,
1326 _("Record type 7, subtype %d has bad count %u "
1327 "(expected %d)."), subtype, record->count, type->count);
1328 else if (type->count == 0 && type->size == 0)
1330 /* Ignore this record. */
/* One byte more than the payload, so the data can be null-terminated. */
1334 char *data = pool_malloc (r->pool, n_bytes + 1);
1335 data[n_bytes] = '\0';
1337 record->data = data;
1338 if (!read_bytes (r, record->data, n_bytes))
1347 sys_warn (r, record->pos,
1348 _("Unrecognized record type 7, subtype %d. For help, please "
1349 "send this file to %s and mention that you were using %s."),
1350 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
1353 return skip_bytes (r, n_bytes);
/* Reads the header of an extension record of the given SUBTYPE from R into
   a local record and then skips its COUNT * SIZE bytes of data without
   storing them.  The result is true only if both steps succeed. */
1357 skip_extension_record (struct sfm_reader *r, int subtype)
1359 struct sfm_extension_record record;
1361 return (read_extension_record_header (r, subtype, &record)
1362 && skip_bytes (r, record.count * record.size));
/* Transfers the strings in HEADER into DICT and INFO, recoding each one to
   UTF-8 from DICT's encoding:

   - The file label, trimmed of spaces and passed through fix_line_ends,
     becomes DICT's label.
   - The creation date and time are stored in INFO.
   - The eye-catcher, with a leading "@(#) SPSS DATA FILE" removed if it is
     present and then trimmed of spaces, is copied into INFO->product. */
1366 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1367 struct any_read_info *info, struct dictionary *dict)
1369 const char *dict_encoding = dict_get_encoding (dict);
1370 struct substring product;
1371 struct substring label;
1374 /* Convert file label to UTF-8 and put it into DICT. */
1375 label = recode_substring_pool ("UTF-8", dict_encoding,
1376 ss_cstr (header->file_label), r->pool);
1377 ss_trim (&label, ss_cstr (" "));
/* Terminate the trimmed substring in place so it can be used as a C
   string. */
1378 label.string[label.length] = '\0';
1379 fixed_label = fix_line_ends (label.string);
1380 dict_set_label (dict, fixed_label);
1383 /* Put creation date and time in UTF-8 into INFO. */
1384 info->creation_date = recode_string ("UTF-8", dict_encoding,
1385 header->creation_date, -1);
1386 info->creation_time = recode_string ("UTF-8", dict_encoding,
1387 header->creation_time, -1);
1389 /* Put product name into INFO, dropping eye-catcher string if present. */
1390 product = recode_substring_pool ("UTF-8", dict_encoding,
1391 ss_cstr (header->eye_catcher), r->pool);
1392 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1393 ss_trim (&product, ss_cstr (" "));
1394 info->product = ss_xstrdup (product);
/* Creates a variable of the given WIDTH in DICT under a name obtained from
   dict_make_unique_var_name.  Callers assign the result to a variable
   pointer. */
1397 static struct variable *
1398 add_var_with_generated_name (struct dictionary *dict, int width)
1400 char *name = dict_make_unique_var_name (dict, NULL, NULL);
1401 struct variable *var = dict_create_var_assert (dict, name, width);
1406 /* Reads a variable (type 2) record from R and adds the
1407 corresponding variable to DICT.
1408 Also skips past additional variable records for long string
/* Creates a variable in DICT for each of the N_VAR_RECS records in
   VAR_RECS.

   For each record this recodes the name to UTF-8 and cuts it at the first
   space, reports an error for a width outside 0...255, and creates the
   variable.  A name that is not a valid identifier, or that begins with
   `$' or `#', is replaced by a generated name with a warning, and so is a
   duplicate name.  The short name is set to the final name.  The label,
   missing values, and print and write formats are then applied.  Finally,
   for a string wider than 8 bytes, the records that follow are checked to
   be continuation records (width -1). */
1411 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1412 struct sfm_var_record *var_recs, size_t n_var_recs)
1414 const char *dict_encoding = dict_get_encoding (dict);
1415 struct sfm_var_record *rec;
1418 for (rec = var_recs; rec < &var_recs[n_var_recs];)
1424 name = recode_string_pool ("UTF-8", dict_encoding,
1425 rec->name, -1, r->pool);
/* Cut the name at the first space. */
1426 name[strcspn (name, " ")] = '\0';
1428 if (rec->width < 0 || rec->width > 255)
1430 sys_error (r, rec->pos,
1431 _("Bad width %d for variable %s."), rec->width, name);
1435 struct variable *var;
1436 if (!dict_id_is_valid (dict, name) || name[0] == '$' || name[0] == '#')
1438 var = add_var_with_generated_name (dict, rec->width);
1439 sys_warn (r, rec->pos, _("Renaming variable with invalid name "
1440 "`%s' to `%s'."), name, var_get_name (var));
1444 var = dict_create_var (dict, name, rec->width);
1447 var = add_var_with_generated_name (dict, rec->width);
1448 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1450 name, var_get_name (var));
1455 /* Set the short name the same as the long name (even if we renamed
1457 var_set_short_name (var, 0, var_get_name (var));
1459 /* Get variable label, if any. */
1464 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1465 rec->label, -1, r->pool);
1466 var_set_label (var, utf8_label);
1469 /* Set missing values. */
1470 if (rec->missing_value_code != 0)
1472 int width = var_get_width (var);
1473 struct missing_values mv;
1475 mv_init_pool (r->pool, &mv, width);
1476 if (var_is_numeric (var))
/* A negative code means a range is present.  Of the negative codes, only
   -3 also carries one discrete value; a positive code is itself the number
   of discrete values. */
1478 bool has_range = rec->missing_value_code < 0;
1479 int n_discrete = (has_range
1480 ? rec->missing_value_code == -3
1481 : rec->missing_value_code);
1486 double low = parse_float (r, rec->missing, 0);
1487 double high = parse_float (r, rec->missing, 8);
1489 /* Deal with SPSS 21 change in representation. */
1493 mv_add_range (&mv, low, high);
1497 for (i = 0; i < n_discrete; i++)
1499 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
/* String missing values occupy 8-byte slots; at most 8 bytes of each are
   used. */
1504 for (i = 0; i < rec->missing_value_code; i++)
1505 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1506 var_set_missing_values (var, &mv);
/* The positions passed here are the ones parse_format_spec reports in its
   warnings about the print and write formats. */
1510 parse_format_spec (r, rec->pos + 12, rec->print_format,
1511 PRINT_FORMAT, var, &n_warnings);
1512 parse_format_spec (r, rec->pos + 16, rec->write_format,
1513 WRITE_FORMAT, var, &n_warnings);
1515 /* Account for values.
1516 Skip long string continuation records, if any. */
1517 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1518 for (i = 1; i < n_values; i++)
1519 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1521 sys_error (r, rec->pos, _("Missing string continuation record."));
1530 /* Translates the format spec from sysfile format to internal
/* Applies the raw format FORMAT to V as its print or write format,
   according to WHICH, when fmt_from_u32 accepts it for V's width.  A
   FORMAT of 0 that is not accepted is silently ignored.  Any other
   rejected format increments *N_WARNINGS and is warned about at POS, but
   only for the first max_warnings (8) rejections.  The last of those is
   followed by a notice that further warnings are suppressed. */
1533 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1534 enum which_format which, struct variable *v,
1537 const int max_warnings = 8;
1540 if (fmt_from_u32 (format, var_get_width (v), false, &f))
1542 if (which == PRINT_FORMAT)
1543 var_set_print_format (v, &f);
1545 var_set_write_format (v, &f);
1547 else if (format == 0)
1549 /* Actually observed in the wild. No point in warning about it. */
1551 else if (++*n_warnings <= max_warnings)
1553 if (which == PRINT_FORMAT)
1554 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1556 var_get_name (v), var_get_width (v), format);
1558 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1560 var_get_name (v), var_get_width (v), format);
1562 if (*n_warnings == max_warnings)
1563 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
/* Adds each line of RECORD to DICT's documents.  RECORD->documents holds
   RECORD->n_lines lines of DOC_LINE_LENGTH bytes each.  Every line is
   recoded to UTF-8 from DICT's encoding, stripped of trailing spaces, and
   null-terminated before being passed to dict_add_document_line. */
1568 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1572 for (p = record->documents;
1573 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1574 p += DOC_LINE_LENGTH)
1576 struct substring line;
1578 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1579 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1580 ss_rtrim (&line, ss_cstr (" "));
1581 line.string[line.length] = '\0';
1583 dict_add_document_line (dict, line.string, false);
1589 /* Parses record type 7, subtype 3.

   Copies the three version numbers at offsets 0, 4, and 8 of the record
   data into INFO.  Then compares the floating-point representation code at
   offset 16 and the integer representation code at offset 24 with the
   codes expected for R's float_format and integer_format.  A
   floating-point mismatch is reported with sys_error; an integer mismatch
   only with sys_warn. */
1591 parse_machine_integer_info (struct sfm_reader *r,
1592 const struct sfm_extension_record *record,
1593 struct any_read_info *info)
1595 int float_representation, expected_float_format;
1596 int integer_representation, expected_integer_format;
1598 /* Save version info. */
1599 info->version_major = parse_int (r, record->data, 0);
1600 info->version_minor = parse_int (r, record->data, 4);
1601 info->version_revision = parse_int (r, record->data, 8);
1603 /* Check floating point format. */
1604 float_representation = parse_int (r, record->data, 16);
/* Expected codes: 1 for either IEEE double byte order, 2 for Z long, 3 for
   either VAX format. */
1605 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1606 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1607 expected_float_format = 1;
1608 else if (r->float_format == FLOAT_Z_LONG)
1609 expected_float_format = 2;
1610 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1611 expected_float_format = 3;
1614 if (float_representation != expected_float_format)
1616 sys_error (r, record->pos,
1617 _("Floating-point representation indicated by "
1618 "system file (%d) differs from expected (%d)."),
1619 float_representation, expected_float_format);
1623 /* Check integer format. */
1624 integer_representation = parse_int (r, record->data, 24);
/* Expected codes: 1 for most-significant byte first, 2 for
   least-significant byte first. */
1625 if (r->integer_format == INTEGER_MSB_FIRST)
1626 expected_integer_format = 1;
1627 else if (r->integer_format == INTEGER_LSB_FIRST)
1628 expected_integer_format = 2;
1631 if (integer_representation != expected_integer_format)
1632 sys_warn (r, record->pos,
1633 _("Integer format indicated by system file (%d) "
1634 "differs from expected (%d)."),
1635 integer_representation, expected_integer_format);
1640 /* Parses record type 7, subtype 4.

   Reads three floating-point values from the record data, at offsets 0, 8,
   and 16: the file's notion of SYSMIS, HIGHEST, and LOWEST.  Each one that
   differs from the expected value produces a warning that shows both
   values in %g and %a form.  Nothing is stored. */
1642 parse_machine_float_info (struct sfm_reader *r,
1643 const struct sfm_extension_record *record)
1645 double sysmis = parse_float (r, record->data, 0);
1646 double highest = parse_float (r, record->data, 8);
1647 double lowest = parse_float (r, record->data, 16);
1649 if (sysmis != SYSMIS)
1650 sys_warn (r, record->pos,
1651 _("File specifies unexpected value %g (%a) as %s, "
1652 "instead of %g (%a)."),
1653 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1655 if (highest != HIGHEST)
1656 sys_warn (r, record->pos,
1657 _("File specifies unexpected value %g (%a) as %s, "
1658 "instead of %g (%a)."),
1659 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1661 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1662 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1663 appears in a context (missing values) where SYSMIS cannot. */
1664 if (lowest != LOWEST && lowest != SYSMIS)
1665 sys_warn (r, record->pos,
1666 _("File specifies unexpected value %g (%a) as %s, "
1667 "instead of %g (%a) or %g (%a)."),
1668 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1671 /* Parses record type 7, subtype 10.

   Opens RECORD as a text record, stores its entire text, passed through
   fix_line_ends, in INFO->product_ext, and closes the text record. */
1673 parse_extra_product_info (struct sfm_reader *r,
1674 const struct sfm_extension_record *record,
1675 struct any_read_info *info)
1677 struct text_record *text;
1679 text = open_text_record (r, record, true);
1680 info->product_ext = fix_line_ends (text_get_all (text));
1681 close_text_record (r, text);
1684 /* Parses record type 7, subtype 7 or 19.

   Each multiple response set found in the record's text is parsed into the
   next slot of R->mrsets, which is grown through *ALLOCATED_MRSETS as
   needed.  A set consists of:

   - A name, terminated by `='.
   - A type letter.  `C' gives MRSET_MC.  `D' gives MRSET_MD with
     MRSET_VARLABELS.  `E' gives MRSET_MD with MRSET_COUNTEDVALUES and is
     followed by a label source value, where "11" sets
     label_from_var_label and any value other than "1" is warned about.
   - For MRSET_MD sets, a counted string giving the counted value.
   - A counted string giving the label.
   - Variable names separated by spaces, ended by a new-line. */
1686 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1687 size_t *allocated_mrsets)
1689 struct text_record *text;
1691 text = open_text_record (r, record, false);
1694 struct sfm_mrset *mrset = NULL;
1695 size_t allocated_vars = 0;
/* NOTE(review): the initial value '4' looks arbitrary.  The variable is
   passed by address to text_get_token below, which presumably overwrites
   it before it is tested -- confirm. */
1696 char delimiter = '4';
1698 /* Skip extra line feeds if present. */
1699 while (text_match (text, '\n'))
1702 if (r->n_mrsets >= *allocated_mrsets)
1703 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1705 mrset = &r->mrsets[r->n_mrsets];
1706 memset(mrset, 0, sizeof *mrset);
1708 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1709 if (mrset->name == NULL)
1712 if (text_match (text, 'C'))
1714 mrset->type = MRSET_MC;
1715 if (!text_match (text, ' '))
1717 sys_warn (r, record->pos,
1718 _("Missing space following `%c' at offset %zu "
1719 "in MRSETS record."), 'C', text_pos (text));
1723 else if (text_match (text, 'D'))
1725 mrset->type = MRSET_MD;
1726 mrset->cat_source = MRSET_VARLABELS;
1728 else if (text_match (text, 'E'))
1732 mrset->type = MRSET_MD;
1733 mrset->cat_source = MRSET_COUNTEDVALUES;
1734 if (!text_match (text, ' '))
1736 sys_warn (r, record->pos,
1737 _("Missing space following `%c' at offset %zu "
1738 "in MRSETS record."), 'E', text_pos (text));
1742 number = text_get_token (text, ss_cstr (" "), NULL);
1744 sys_warn (r, record->pos,
1745 _("Missing label source value "
1746 "following `E' at offset %zu in MRSETS record."),
1748 else if (!strcmp (number, "11"))
1749 mrset->label_from_var_label = true;
1750 else if (strcmp (number, "1"))
1751 sys_warn (r, record->pos,
1752 _("Unexpected label source value following `E' "
1753 "at offset %zu in MRSETS record."),
1758 sys_warn (r, record->pos,
1759 _("Missing `C', `D', or `E' at offset %zu "
1760 "in MRSETS record."),
1765 if (mrset->type == MRSET_MD)
1767 mrset->counted = text_parse_counted_string (r, text);
1768 if (mrset->counted == NULL)
1772 mrset->label = text_parse_counted_string (r, text);
1773 if (mrset->label == NULL)
/* Variable names end at a space or a new-line; DELIMITER records which one
   ended the token. */
1781 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1784 if (delimiter != '\n')
1785 sys_warn (r, record->pos,
1786 _("Missing new-line parsing variable names "
1787 "at offset %zu in MRSETS record."),
1792 if (mrset->n_vars >= allocated_vars)
1793 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1795 sizeof *mrset->vars);
1796 mrset->vars[mrset->n_vars++] = var;
1798 while (delimiter != '\n');
1802 close_text_record (r, text);
/* Converts each raw multiple response set in R->mrsets into a struct mrset
   and adds it to DICT.

   The set name, label, and variable names are recoded to UTF-8 from
   R->encoding.  A set whose name is not valid is warned about.  Within a
   set, a duplicate variable name and a mixture of string and numeric
   variables each produce a warning.  A set left with fewer than two
   variables is warned about and destroyed instead of being added.  For
   MRSET_MD sets the counted value is initialized with WIDTH, which is
   narrowed with MIN to the width of each accepted variable. */
1806 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1808 const struct sfm_mrset *s;
1810 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1812 struct stringi_set var_names;
1813 struct mrset *mrset;
1818 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1819 if (!mrset_is_valid_name (name, dict_get_encoding (dict)))
1821 sys_warn (r, -1, _("Invalid multiple response set name `%s'."),
1827 mrset = xzalloc (sizeof *mrset);
1829 mrset->type = s->type;
1830 mrset->cat_source = s->cat_source;
1831 mrset->label_from_var_label = s->label_from_var_label;
/* An empty label string leaves mrset->label null. */
1832 if (s->label[0] != '\0')
1833 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
/* VAR_NAMES is used to detect variable names repeated within this set. */
1835 stringi_set_init (&var_names);
1836 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1838 for (i = 0; i < s->n_vars; i++)
1840 struct variable *var;
1843 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1845 var = dict_lookup_var (dict, var_name);
1851 if (!stringi_set_insert (&var_names, var_name))
1854 _("MRSET %s contains duplicate variable name %s."),
1855 mrset->name, var_name);
/* With no label of its own, a set flagged label_from_var_label takes the
   label of the first labeled variable reaching this point. */
1861 if (mrset->label == NULL && mrset->label_from_var_label
1862 && var_has_label (var))
1863 mrset->label = xstrdup (var_get_label (var));
1866 && var_get_type (var) != var_get_type (mrset->vars[0]))
1869 _("MRSET %s contains both string and "
1870 "numeric variables."), mrset->name);
1873 width = MIN (width, var_get_width (var));
1875 mrset->vars[mrset->n_vars++] = var;
1878 if (mrset->n_vars < 2)
1880 if (mrset->n_vars == 0)
1881 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1883 sys_warn (r, -1, _("MRSET %s has only one variable."),
1885 mrset_destroy (mrset);
1886 stringi_set_destroy (&var_names);
1890 if (mrset->type == MRSET_MD)
1892 mrset->width = width;
1893 value_init (&mrset->counted, width);
1895 mrset->counted.f = c_strtod (s->counted, NULL);
1897 value_copy_str_rpad (&mrset->counted, width,
1898 (const uint8_t *) s->counted, ' ');
1901 dict_add_mrset (dict, mrset);
1902 stringi_set_destroy (&var_names);
1906 /* Read record type 7, subtype 11, which specifies how variables
1907 should be displayed in GUI environments. */
/* RECORD->count decides the layout.  Three values per variable in DICT
   means each variable has a display width in addition to its measure and
   alignment; two values per variable means it does not.  Any other count
   is warned about.  After the adjustment for a measure of zero, a measure
   outside 1...3 or an alignment outside 0...2 produces a warning that
   default parameters are substituted. */
1909 parse_display_parameters (struct sfm_reader *r,
1910 const struct sfm_extension_record *record,
1911 struct dictionary *dict)
1913 bool includes_width;
1914 bool warned = false;
1919 n_vars = dict_get_n_vars (dict);
1920 if (record->count == 3 * n_vars)
1921 includes_width = true;
1922 else if (record->count == 2 * n_vars)
1923 includes_width = false;
1926 sys_warn (r, record->pos,
1927 _("Extension 11 has bad count %u (for %zu variables)."),
1928 record->count, n_vars);
1933 for (i = 0; i < n_vars; ++i)
1935 struct variable *v = dict_get_var (dict, i);
1936 int measure, width, align;
1938 measure = parse_int (r, record->data, ofs);
1943 width = parse_int (r, record->data, ofs);
1949 align = parse_int (r, record->data, ofs);
1952 /* SPSS sometimes seems to set variables' measure to zero. */
1956 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1959 sys_warn (r, record->pos,
1960 _("Invalid variable display parameters for variable "
1961 "%zu (%s). Default parameters substituted."),
1962 i, var_get_name (v));
/* Measure codes: 1 is nominal, 2 is ordinal.  Alignment codes: 0 is left,
   1 is right. */
1967 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1968 : measure == 2 ? MEASURE_ORDINAL
1970 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1971 : align == 1 ? ALIGN_RIGHT
1974 /* Older versions (SPSS 9.0) sometimes set the display
1975 width to zero. This causes confusion in the GUI, so
1976 only set the width if it is nonzero. */
1978 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping its short names.  All of
   VAR's short names are copied before the rename and set again afterward,
   and each copy is freed once it has been restored.  If
   dict_try_rename_var fails, a warning about a duplicate long variable
   name is issued at POS. */
1983 rename_var_and_save_short_names (struct sfm_reader *r, off_t pos,
1984 struct dictionary *dict,
1985 struct variable *var, const char *new_name)
1987 size_t n_short_names;
1991 /* Renaming a variable may clear its short names, but we
1992 want to retain them, so we save them and re-set them
1994 n_short_names = var_get_n_short_names (var);
1995 short_names = xnmalloc (n_short_names, sizeof *short_names);
1996 for (i = 0; i < n_short_names; i++)
1998 const char *s = var_get_short_name (var, i);
1999 short_names[i] = xstrdup_if_nonnull (s);
2002 /* Set long name. */
2003 if (!dict_try_rename_var (dict, var, new_name))
2004 sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name);
2006 /* Restore short names. */
2007 for (i = 0; i < n_short_names; i++)
2009 var_set_short_name (var, i, short_names[i]);
2010 free (short_names[i]);
2015 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2016 to each short name. Modifies variable names in DICT accordingly. */
/* Two paths are visible.  When there are no long variable names, every
   variable in DICT is renamed to the lowercase form of its current name.
   Otherwise RECORD is read as a series of variable-to-long-name pairs.  A
   long name that is not a valid identifier, or that begins with `$' or
   `#', is warned about.  Renames go through
   rename_var_and_save_short_names, so short names are kept. */
2018 parse_long_var_name_map (struct sfm_reader *r,
2019 const struct sfm_extension_record *record,
2020 struct dictionary *dict)
2022 struct text_record *text;
2023 struct variable *var;
2028 /* There are no long variable names. Use the short variable names,
2029 converted to lowercase, as the long variable names. */
2032 for (i = 0; i < dict_get_n_vars (dict); i++)
2034 struct variable *var = dict_get_var (dict, i);
2037 new_name = utf8_to_lower (var_get_name (var));
2038 rename_var_and_save_short_names (r, -1, dict, var, new_name);
2045 /* Rename each of the variables, one by one. (In a correctly constructed
2046 system file, this cannot create any intermediate duplicate variable names,
2047 because all of the new variable names are longer than any of the old
2048 variable names and thus there cannot be any overlaps.) */
2049 text = open_text_record (r, record, true);
2050 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2052 /* Validate long name. */
2053 if (!dict_id_is_valid (dict, long_name)
2054 || long_name[0] == '$' || long_name[0] == '#')
2056 sys_warn (r, record->pos,
2057 _("Long variable mapping from %s to invalid "
2058 "variable name `%s'."),
2059 var_get_name (var), long_name);
2063 rename_var_and_save_short_names (r, record->pos, dict, var, long_name);
2065 close_text_record (r, text);
2068 /* Reads record type 7, subtype 14, which gives the real length
2069 of each very long string. Rearranges DICT accordingly. */
/* Each entry in the record pairs a variable with a decimal length.  A
   length outside 1...MAX_STRING, or one that needs only a single segment,
   produces a warning.  Segments that would run past the end of DICT, or a
   segment whose width rounded up to 8 bytes differs from the expected
   allocation width, produce an error.  For an accepted entry the short
   name of each segment is copied onto the first variable, the trailing
   segment variables are deleted from DICT, and the first variable's width
   is set to the full length.  DICT's values are compacted at the end. */
2071 parse_long_string_map (struct sfm_reader *r,
2072 const struct sfm_extension_record *record,
2073 struct dictionary *dict)
2075 struct text_record *text;
2076 struct variable *var;
2079 text = open_text_record (r, record, true);
2080 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2082 size_t idx = var_get_dict_index (var);
/* NOTE(review): strtol is called without an end pointer, so trailing
   non-digit characters in LENGTH_S are not detected here. */
2087 length = strtol (length_s, NULL, 10);
2088 if (length < 1 || length > MAX_STRING)
2090 sys_warn (r, record->pos,
2091 _("%s listed as string of invalid length %s "
2092 "in very long string record."),
2093 var_get_name (var), length_s);
2097 /* Check segments. */
2098 int n_segments = sfm_width_to_segments (length);
2099 if (n_segments == 1)
2101 sys_warn (r, record->pos,
2102 _("%s listed in very long string record with width %s, "
2103 "which requires only one segment."),
2104 var_get_name (var), length_s);
2107 if (idx + n_segments > dict_get_n_vars (dict))
2109 sys_error (r, record->pos,
2110 _("Very long string %s overflows dictionary."),
2111 var_get_name (var));
2115 /* Get the short names from the segments and check their
2117 for (i = 0; i < n_segments; i++)
2119 struct variable *seg = dict_get_var (dict, idx + i);
2120 int alloc_width = sfm_segment_alloc_width (length, i);
2121 int width = var_get_width (seg);
2124 var_set_short_name (var, i, var_get_short_name (seg, 0));
2125 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2127 sys_error (r, record->pos,
2128 _("Very long string with width %ld has segment %d "
2129 "of width %d (expected %d)."),
2130 length, i, width, alloc_width);
2134 dict_delete_consecutive_vars (dict, idx + 1, n_segments - 1);
2135 var_set_width (var, length);
2137 close_text_record (r, text);
2138 dict_compact_values (dict);
2143 #define MAX_LABEL_WARNINGS 5
2145 /* Displays a warning for offset OFFSET in the file.

   *N_LABEL_WARNINGS is incremented on every call and compared with
   MAX_LABEL_WARNINGS before anything is displayed, so the counter keeps
   the total number of calls.  FORMAT and the arguments that follow it are
   collected into a va_list and forwarded to sys_msg along with MW. */
2147 value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings,
2148 const char *format, ...)
2150 if (++*n_label_warnings > MAX_LABEL_WARNINGS)
2155 va_start (args, format);
2156 sys_msg (r, offset, MW, format, args);
2160 #define MAX_LABEL_WARNINGS 5
/* Applies the value labels in RECORD to the variables that RECORD lists.

   The label texts are first recoded to UTF-8 from DICT's encoding.  The
   1-based variable indexes in RECORD->vars are then resolved against
   VAR_RECS.  An index out of range, or one that refers to a long string
   continuation record (one whose var is null), is warned about.  Variables
   that are not all of the same type are warned about, as are value labels
   directed at long string variables.  For each remaining variable every
   label is added with var_add_value_label.  A failure there is reported as
   a duplicate value label, unless R->written_by_readstat is set, in which
   case it is ignored.  The temporary arrays are returned to R->pool at the
   end.  All warnings go through value_label_warning and are counted in
   *N_LABEL_WARNINGS. */
2163 parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict,
2164 const struct sfm_var_record *var_recs,
2166 const struct sfm_value_label_record *record,
2167 int *n_label_warnings)
2170 = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2171 for (size_t i = 0; i < record->n_labels; i++)
2172 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2173 record->labels[i].label, -1,
2176 struct variable **vars = pool_nmalloc (r->pool,
2177 record->n_vars, sizeof *vars);
2178 unsigned int n_vars = 0;
2179 for (size_t i = 0; i < record->n_vars; i++)
2181 int idx = record->vars[i];
/* Indexes are 1-based: valid values are 1...N_VAR_RECS. */
2182 if (idx < 1 || idx > n_var_recs)
2184 value_label_warning (
2185 r, record->pos, n_label_warnings,
2186 _("Value label variable index %d not in valid range 1...%zu."),
2191 const struct sfm_var_record *rec = &var_recs[idx - 1];
2192 if (rec->var == NULL)
2194 value_label_warning (
2195 r, record->pos, n_label_warnings,
2196 _("Value label variable index %d "
2197 "refers to long string continuation."), idx);
2201 vars[n_vars++] = rec->var;
/* Every variable is compared against the first one's type. */
2206 for (size_t i = 1; i < n_vars; i++)
2207 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2209 value_label_warning (
2210 r, record->pos, n_label_warnings,
2211 _("Variables associated with value label are not all of "
2212 "identical type. Variable %s is %s, but variable "
2214 var_get_name (vars[0]),
2215 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2216 var_get_name (vars[i]),
2217 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2221 for (size_t i = 0; i < n_vars; i++)
2223 struct variable *var = vars[i];
2224 int width = var_get_width (var);
2227 value_label_warning (
2228 r, record->pos, n_label_warnings,
2229 _("Value labels may not be added to long string "
2230 "variables (e.g. %s) using records types 3 and 4."),
2231 var_get_name (var));
2235 for (size_t j = 0; j < record->n_labels; j++)
2237 struct sfm_value_label *label = &record->labels[j];
2240 value_init (&value, width);
/* The raw label value is either parsed as a floating-point number or
   copied as WIDTH bytes of string data. */
2242 value.f = parse_float (r, label->value, 0);
2244 memcpy (value.s, label->value, width);
2246 if (!var_add_value_label (var, &value, utf8_labels[j]))
2248 if (r->written_by_readstat)
2250 /* Ignore the problem. ReadStat is buggy and emits value
2251 labels whose values are longer than string variables'
2252 widths, that are identical in the actual width of the
2253 variable, e.g. both values "ABC123" and "ABC456" for a
2254 string variable with width 3. */
2256 else if (var_is_numeric (var))
2257 value_label_warning (r, record->pos, n_label_warnings,
2258 _("Duplicate value label for %g on %s."),
2259 value.f, var_get_name (var));
2261 value_label_warning (
2262 r, record->pos, n_label_warnings,
2263 _("Duplicate value label for `%.*s' on %s."),
2264 width, value.s, var_get_name (var));
2267 value_destroy (&value, width);
2271 pool_free (r->pool, vars);
2272 for (size_t i = 0; i < record->n_labels; i++)
2273 pool_free (r->pool, utf8_labels[i]);
2274 pool_free (r->pool, utf8_labels);
/* Applies every value label record in R->labels to DICT by calling
   parse_one_value_label_set on each, sharing one warning counter.  If the
   counter ends above MAX_LABEL_WARNINGS, reports how many additional
   warnings were suppressed. */
2278 parse_value_labels (struct sfm_reader *r, struct dictionary *dict)
2280 int n_label_warnings = 0;
2281 for (size_t i = 0; i < r->n_labels; i++)
2282 parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i],
2284 if (n_label_warnings > MAX_LABEL_WARNINGS)
2286 _("Suppressed %d additional warnings for value labels."),
2287 n_label_warnings - MAX_LABEL_WARNINGS);
/* Resolves the weight variable index IDX, which is 1-based, against the
   N_VAR_RECS records in VAR_RECS.

   Three problems are warned about, each at the header offset of the weight
   index: an index outside 1...N_VAR_RECS, an index that refers to a long
   string continuation record (one whose var is null), and an index that
   refers to a string variable.  The first two warnings say the file is
   treated as unweighted; the third says the variable is ignored. */
2290 static struct variable *
2291 parse_weight_var (struct sfm_reader *r,
2292 const struct sfm_var_record *var_recs, size_t n_var_recs,
2295 off_t offset = 76; /* Offset to variable index in header. */
2297 if (idx < 1 || idx > n_var_recs)
2299 sys_warn (r, offset,
2300 _("Weight variable index %d not in valid range 1...%zu. "
2301 "Treating file as unweighted."),
2306 const struct sfm_var_record *rec = &var_recs[idx - 1];
2307 if (rec->var == NULL)
2309 sys_warn (r, offset,
2310 _("Weight variable index %d refers to long string "
2311 "continuation. Treating file as unweighted."), idx);
2315 struct variable *weight_var = rec->var;
2316 if (!var_is_numeric (weight_var))
2318 sys_warn (r, offset, _("Ignoring string variable `%s' set "
2319 "as weighting variable."),
2320 var_get_name (weight_var));
2327 /* Parses a set of custom attributes from TEXT into ATTRS.
2328 ATTRS may be a null pointer, in which case the attributes are
2329 read but discarded. */
/* Text layout, as consumed below: each attribute is a key terminated by
   `(', followed by one value per line.  A value enclosed in single quotes
   is added without the quotes.  Any other value is warned about as not
   quoted and added unchanged.  A `)' ends the attribute, and a `/' ends
   the whole set.  An attribute is offered to ATTRS only when ATTRS is
   non-null and the attribute has at least one value.  If attrset_try_add
   rejects it as a duplicate, a warning is issued and the attribute is
   destroyed. */
2331 parse_attributes (struct sfm_reader *r, struct text_record *text,
2332 struct attrset *attrs)
2336 struct attribute *attr;
2340 /* Parse the key. */
2341 key = text_get_token (text, ss_cstr ("("), NULL);
2345 attr = attribute_create (key);
/* INDEX is the 1-based position of the value within the attribute. */
2346 for (index = 1; ; index++)
2348 /* Parse the value. */
2352 value = text_get_token (text, ss_cstr ("\n"), NULL);
2355 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2360 length = strlen (value);
2361 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
/* Drop the closing quote in place and skip the opening one. */
2363 value[length - 1] = '\0';
2364 attribute_add_value (attr, value + 1);
2369 _("Attribute value %s[%d] is not quoted: %s."),
2371 attribute_add_value (attr, value);
2374 /* Was this the last value for this attribute? */
2375 if (text_match (text, ')'))
2378 if (attrs != NULL && attribute_get_n_values (attr) > 0)
2380 if (!attrset_try_add (attrs, attr))
2382 text_warn (r, text, _("Duplicate attribute %s."),
2383 attribute_get_name (attr));
2384 attribute_destroy (attr);
2388 attribute_destroy (attr);
2390 while (!text_match (text, '/'));
2393 /* Reads record type 7, subtype 17, which lists custom
2394 attributes on the data file. */
/* Opens RECORD as a text record, parses its attributes into the attribute
   set obtained from dict_get_attributes (DICT), and closes the text
   record. */
2396 parse_data_file_attributes (struct sfm_reader *r,
2397 const struct sfm_extension_record *record,
2398 struct dictionary *dict)
2400 struct text_record *text = open_text_record (r, record, true);
2401 parse_attributes (r, text, dict_get_attributes (dict));
2402 close_text_record (r, text);
2405 /* Parses record type 7, subtype 18, which lists custom
2406 attributes on individual variables. */
/* The record text is a series of variable names, each terminated by `:'
   and followed by that variable's attribute set.  When the name lookup
   leaves VAR null, the attribute set is still parsed, with a null
   destination, so the text stays in sync. */
2408 parse_variable_attributes (struct sfm_reader *r,
2409 const struct sfm_extension_record *record,
2410 struct dictionary *dict)
2412 struct text_record *text;
2413 struct variable *var;
2415 text = open_text_record (r, record, true);
2416 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2417 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2418 close_text_record (r, text);
/* Sets the role of each variable in DICT that has a "$@Role" attribute
   with at least one value, based on the integer form of the attribute's
   first value.  The first variable with an invalid role is warned about by
   name; a closing warning reports how many other variables had invalid
   roles. */
2422 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2424 size_t n_warnings = 0;
2427 for (i = 0; i < dict_get_n_vars (dict); i++)
2429 struct variable *var = dict_get_var (dict, i);
2430 struct attrset *attrs = var_get_attributes (var);
2431 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2432 if (attr != NULL && attribute_get_n_values (attr) > 0)
/* NOTE(review): atoi reports no errors and yields 0 for text that does not
   begin with a number, so such text is indistinguishable from a literal
   "0" here. */
2434 int value = atoi (attribute_get_value (attr, 0));
2456 role = ROLE_PARTITION;
/* Only the first invalid role gets its own message; later ones are just
   counted. */
2465 if (n_warnings++ == 0)
2466 sys_warn (r, -1, _("Invalid role for variable %s."),
2467 var_get_name (var));
2470 var_set_role (var, role);
2475 sys_warn (r, -1, _("%zu other variables had invalid roles."),
/* Tests whether LENGTH bytes starting at offset OFS fit within RECORD's
   data, whose total size END is RECORD->size * RECORD->count.  The
   LENGTH >= END test comes first, so an oversized LENGTH is caught before
   OFS + LENGTH is evaluated.

   NOTE(review): LENGTH >= END also triggers when LENGTH equals END with
   OFS of 0, a read that would exactly fill the data -- confirm that this
   is intended. */
2480 check_overflow__ (const struct sfm_extension_record *record,
2481 size_t ofs, size_t length)
2483 size_t end = record->size * record->count;
2484 if (length >= end || ofs + length > end)
/* Wrapper around check_overflow__ for LENGTH bytes at offset OFS in
   RECORD.  It issues a warning that the extension record ends
   unexpectedly; the position given for the warning is the end of the
   record's data, RECORD->pos + RECORD->size * RECORD->count. */
2490 check_overflow (struct sfm_reader *r,
2491 const struct sfm_extension_record *record,
2492 size_t ofs, size_t length)
2494 bool ok = check_overflow__ (record, ofs, length);
2496 sys_warn (r, record->pos + record->size * record->count,
2497 _("Extension record subtype %d ends unexpectedly."),
/* Parses long string value labels from RECORD and adds them to the matching
   variables in DICT.

   As read here, each entry holds a 4-byte variable name length, the variable
   name, a 4-byte width, and a 4-byte label count, followed by that many
   labels.  Each label holds a 4-byte value length, the value bytes, a 4-byte
   label length, and the label bytes.  Names and labels are converted with
   recode_string_pool, given "UTF-8" and DICT's encoding.

   A warning is issued for an entry whose variable is unknown or numeric, or
   whose width differs from the variable's width.  */
2503 parse_long_string_value_labels (struct sfm_reader *r,
2504 const struct sfm_extension_record *record,
2505 struct dictionary *dict)
2507 const char *dict_encoding = dict_get_encoding (dict);
2508 size_t end = record->size * record->count;
2515 struct variable *var;
2520 /* Parse variable name length. */
2521 if (!check_overflow (r, record, ofs, 4))
2523 var_name_len = parse_int (r, record->data, ofs);
2526 /* Parse variable name, width, and number of labels. */
/* VAR_NAME_LEN is checked on its own first, presumably so that
   VAR_NAME_LEN + 8 cannot wrap around unnoticed for a huge length.  */
2527 if (!check_overflow (r, record, ofs, var_name_len)
2528 || !check_overflow (r, record, ofs, var_name_len + 8))
2530 var_name = recode_string_pool ("UTF-8", dict_encoding,
2531 (const char *) record->data + ofs,
2532 var_name_len, r->pool);
2533 width = parse_int (r, record->data, ofs + var_name_len);
2534 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2535 ofs += var_name_len + 8;
2537 /* Look up 'var' and validate. */
2538 var = dict_lookup_var (dict, var_name);
2540 sys_warn (r, record->pos + ofs,
2541 _("Ignoring long string value label record for "
2542 "unknown variable %s."), var_name);
2543 else if (var_is_numeric (var))
2545 sys_warn (r, record->pos + ofs,
2546 _("Ignoring long string value label record for "
2547 "numeric variable %s."), var_name);
2550 else if (width != var_get_width (var))
2552 sys_warn (r, record->pos + ofs,
2553 _("Ignoring long string value label record for variable "
2554 "%s because the record's width (%d) does not match the "
2555 "variable's width (%d)."),
2556 var_name, width, var_get_width (var));
2561 value_init_pool (r->pool, &value, width);
2562 for (i = 0; i < n_labels; i++)
2564 size_t value_length, label_length;
/* SKIP starts out true when there is no variable to attach labels to.  */
2565 bool skip = var == NULL;
2567 /* Parse value length. */
2568 if (!check_overflow (r, record, ofs, 4))
2570 value_length = parse_int (r, record->data, ofs);
2574 if (!check_overflow (r, record, ofs, value_length))
/* Only a value exactly WIDTH bytes long is copied into VALUE; the warning
   below describes any other length as a bad value width.  */
2578 if (value_length == width)
2579 memcpy (value.s, (const uint8_t *) record->data + ofs, width);
2582 sys_warn (r, record->pos + ofs,
2583 _("Ignoring long string value label %zu for "
2584 "variable %s, with width %d, that has bad value "
2586 i, var_get_name (var), width, value_length);
2590 ofs += value_length;
2592 /* Parse label length. */
2593 if (!check_overflow (r, record, ofs, 4))
2595 label_length = parse_int (r, record->data, ofs);
2599 if (!check_overflow (r, record, ofs, label_length))
2605 label = recode_string_pool ("UTF-8", dict_encoding,
2606 (const char *) record->data + ofs,
2607 label_length, r->pool);
2608 if (!var_add_value_label (var, &value, label))
2609 sys_warn (r, record->pos + ofs,
2610 _("Duplicate value label for `%.*s' on %s."),
2611 width, value.s, var_get_name (var));
/* LABEL came from R's pool via recode_string_pool above and is returned to
   the pool once it has been passed to var_add_value_label.  */
2612 pool_free (r->pool, label);
2614 ofs += label_length;
/* Parses long string missing values from RECORD and applies them to the
   matching variables in DICT.

   As read here, each entry holds a 4-byte variable name length, the variable
   name, a 1-byte count of missing values, a 4-byte value length, and then
   the missing values themselves.  A count outside 1 to 3 draws a warning, as
   does an entry for an unknown or numeric variable.  Files in which the
   value length is repeated before each missing value are tolerated, with a
   warning about corrupted metadata.  */
2620 parse_long_string_missing_values (struct sfm_reader *r,
2621 const struct sfm_extension_record *record,
2622 struct dictionary *dict)
2624 const char *dict_encoding = dict_get_encoding (dict);
2625 size_t end = record->size * record->count;
2628 bool warned = false;
2631 struct missing_values mv;
2633 struct variable *var;
2634 int n_missing_values;
2638 /* Parse variable name length. */
2639 if (!check_overflow (r, record, ofs, 4))
2641 var_name_len = parse_int (r, record->data, ofs);
2644 /* Parse variable name. */
/* VAR_NAME_LEN is checked on its own first, presumably so that
   VAR_NAME_LEN + 1 cannot wrap around unnoticed; the extra byte is the
   missing-value count that follows the name.  */
2645 if (!check_overflow (r, record, ofs, var_name_len)
2646 || !check_overflow (r, record, ofs, var_name_len + 1))
2648 var_name = recode_string_pool ("UTF-8", dict_encoding,
2649 (const char *) record->data + ofs,
2650 var_name_len, r->pool);
2651 ofs += var_name_len;
2653 /* Parse number of missing values. */
2654 n_missing_values = ((const uint8_t *) record->data)[ofs];
2655 if (n_missing_values < 1 || n_missing_values > 3)
2656 sys_warn (r, record->pos + ofs,
2657 _("Long string missing values record says variable %s "
2658 "has %d missing values, but only 1 to 3 missing values "
2660 var_name, n_missing_values);
2663 /* Look up 'var' and validate. */
2664 var = dict_lookup_var (dict, var_name);
2666 sys_warn (r, record->pos + ofs,
2667 _("Ignoring long string missing value record for "
2668 "unknown variable %s."), var_name);
2669 else if (var_is_numeric (var))
2671 sys_warn (r, record->pos + ofs,
2672 _("Ignoring long string missing value record for "
2673 "numeric variable %s."), var_name);
2677 /* Parse value length. */
2678 if (!check_overflow (r, record, ofs, 4))
2680 size_t value_length = parse_int (r, record->data, ofs);
/* MV is initialized with VAR's width, or with a width of 8 when VAR is
   null.  */
2684 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2685 for (i = 0; i < n_missing_values; i++)
2687 /* Tolerate files written by old, buggy versions of PSPP where we
2688 believed that the value_length was repeated before each missing
2690 if (check_overflow__ (record, ofs, value_length)
2691 && parse_int (r, record->data, ofs) == 8)
2695 sys_warn (r, record->pos + ofs,
2696 _("This file has corrupted metadata written by a "
2697 "buggy version of PSPP. To fix it, save a new "
2698 "copy of the file."));
2705 if (!check_overflow (r, record, ofs, value_length))
2709 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2711 sys_warn (r, record->pos + ofs,
2712 _("Ignoring long string missing value %zu for variable "
2713 "%s, with width %d, that has bad value width %zu."),
2714 i, var_get_name (var), var_get_width (var),
2716 ofs += value_length;
2719 var_set_missing_values (var, &mv);
/* Forward declarations for the case-reading helpers that are defined after
   sys_file_casereader_read.  */
2725 static void partial_record (struct sfm_reader *);
2727 static void read_error (struct casereader *, const struct sfm_reader *);
2729 static bool read_case_number (struct sfm_reader *, double *);
2730 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2731 static int read_opcode (struct sfm_reader *);
2732 static bool read_compressed_number (struct sfm_reader *, double *);
2733 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2734 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2735 static bool skip_whole_strings (struct sfm_reader *, size_t);
2737 /* Reads and returns one case from READER's file. Returns a null
2738 pointer if not successful. */
/* R_ is the struct sfm_reader for the file being read.  No data is read if R
   is already flagged with an error or has no variables.  */
2739 static struct ccase *
2740 sys_file_casereader_read (struct casereader *reader, void *r_)
2742 struct sfm_reader *r = r_;
2747 if (r->error || !r->sfm_n_vars)
2750 c = case_create (r->proto);
2752 for (i = 0; i < r->sfm_n_vars; i++)
2754 struct sfm_var *sv = &r->sfm_vars[i];
2755 union value *v = case_data_rw_idx (c, sv->case_index);
/* A var_width of 0 selects a numeric read.  Otherwise a string segment is
   read at SV's offset within the value, and then SV's padding, rounded down
   to a multiple of 8 bytes, is skipped.  */
2757 if (sv->var_width == 0)
2758 retval = read_case_number (r, &v->f);
2761 retval = read_case_string (r, v->s + sv->offset, sv->segment_width);
2764 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2766 sys_error (r, r->pos, _("File ends in partial string value."));
/* NOTE(review): an n_cases of -1 presumably means that the number of cases
   is not known in advance -- confirm.  */
2778 if (r->n_cases != -1)
2779 read_error (reader, r);
2784 /* Issues an error that R ends in a partial record. */
/* The error is reported through sys_error at R's current position.  */
2786 partial_record (struct sfm_reader *r)
2788 sys_error (r, r->pos, _("File ends in partial case."));
/* Reports, through msg with class ME, a generic error reading a case from
   the file identified by SFM's file handle, then calls
   casereader_force_error on casereader R.  */
2791 /* Issues an error that an unspecified error occurred SFM, and
2794 read_error (struct casereader *r, const struct sfm_reader *sfm)
2796 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2797 casereader_force_error (r);
2800 /* Reads a number from R and stores its value in *D.
2801 If R is compressed, reads a compressed number;
2802 otherwise, reads a number in the regular way.
2803 Returns true if successful, false if end of file is
2804 reached immediately. */
2806 read_case_number (struct sfm_reader *r, double *d)
2808 if (r->compression == ANY_COMP_NONE)
2811 if (!try_read_bytes (r, number, sizeof number))
/* Convert the raw bytes from the file's floating-point format to the
   native double format.  */
2813 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2817 return read_compressed_number (r, d);
2820 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2821 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2822 discarded without being written to S. Reads compressed strings if R is
2823 compressed. Returns 1 if successful, 0 if end of file is reached
2824 immediately, or -1 for some kind of error. */
2826 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2828 size_t whole = ROUND_DOWN (length, 8);
2829 size_t partial = length % 8;
/* The leading WHOLE bytes go directly into S.  */
2833 int retval = read_whole_strings (r, s, whole);
/* The trailing segment is read in full into the BOUNCE buffer, and only its
   first PARTIAL bytes are copied to S.  */
2841 int retval = read_whole_strings (r, bounce, sizeof bounce);
2853 memcpy (s + whole, bounce, partial);
2859 /* Reads and returns the next compression opcode from R. */
/* Opcodes are buffered in R->opcodes and handed out one at a time through
   R->opcode_idx.  Must only be called when R is compressed.  */
2861 read_opcode (struct sfm_reader *r)
2863 assert (r->compression != ANY_COMP_NONE);
/* All buffered opcodes have been consumed, so read more into R->opcodes.  */
2867 if (r->opcode_idx >= sizeof r->opcodes)
2870 int retval = try_read_compressed_bytes (r, r->opcodes,
2876 opcode = r->opcodes[r->opcode_idx++];
2883 /* Reads a compressed number from R and stores its value in *D.
2884 Returns true if successful, false if end of file is
2885 reached immediately. */
2887 read_compressed_number (struct sfm_reader *r, double *d)
2889 int opcode = read_opcode (r);
/* The value is stored as a full floating-point number in the data
   stream.  */
2897 return read_compressed_float (r, d);
/* Compressed spaces in a numeric field: *D is set by converting the
   literal below, and a warning is issued unless R has already issued a
   corruption warning.  */
2900 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2901 if (!r->corruption_warning)
2903 r->corruption_warning = true;
2904 sys_warn (r, r->pos,
2905 _("Possible compressed data corruption: "
2906 "compressed spaces appear in numeric field."));
/* Here the opcode itself encodes the value, offset by R's bias.  */
2915 *d = opcode - r->bias;
2922 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2924 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2929 opcode = read_opcode (r);
2937 retval = read_compressed_bytes (r, dst, 8);
/* Any result other than 1 from read_compressed_bytes is reported to the
   caller as an error (-1).  */
2938 return retval == 1 ? 1 : -1;
/* The segment consists entirely of spaces.  */
2941 memset (dst, ' ', 8);
/* The opcode encodes a number: the corresponding value is written to DST
   in the file's floating-point format.  */
2946 double value = opcode - r->bias;
2947 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2950 /* This has actually been seen "in the wild". The submitter of the
2951 file that showed that the contents decoded as spaces, but they
2952 were at the end of the field so it's possible that the null
2953 bytes just acted as null terminators. */
2955 else if (!r->corruption_warning)
2957 r->corruption_warning = true;
2958 sys_warn (r, r->pos,
2959 _("Possible compressed data corruption: "
2960 "string contains compressed integer (opcode %d)."),
2968 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2969 Reads compressed strings if R is compressed. Returns 1 if successful, 0 if
2970 end of file is reached immediately, or -1 for some kind of error. */
2972 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2974 assert (length % 8 == 0);
2975 if (r->compression == ANY_COMP_NONE)
2976 return try_read_bytes (r, s, length);
/* Compressed data is read one 8-byte segment at a time.  */
2981 for (ofs = 0; ofs < length; ofs += 8)
2983 int retval = read_compressed_string (r, s + ofs);
2998 /* Skips LENGTH string bytes from R.
2999 LENGTH must be a multiple of 8.
3000 (LENGTH is also limited to 1024, but that's only because the
3001 current caller never needs more than that many bytes.)
3002 Returns true if successful, false if end of file is
3003 reached immediately. */
/* The bytes are read into a scratch buffer with read_whole_strings and then
   dropped.

   NOTE(review): the assertion below rejects a LENGTH equal to the buffer
   size, although that many bytes would fit -- confirm whether "<=" was
   intended.  */
3005 skip_whole_strings (struct sfm_reader *r, size_t length)
3007 uint8_t buffer[1024];
3008 assert (length < sizeof buffer);
3009 return read_whole_strings (r, buffer, length);
/* MAX_TEXT_WARNINGS is the limit that text_warn applies to the warnings it
   displays for one text record; close_text_record reports how many warnings
   beyond the limit were suppressed.  */
3012 /* Helpers for reading records that contain structured text
3015 /* Maximum number of warnings to issue for a single text
3017 #define MAX_TEXT_WARNINGS 5
/* Members of the state for reading one text record, as set up by
   open_text_record.  POS is advanced by the text_* helpers as tokens are
   consumed from BUFFER.  */
3022 struct substring buffer; /* Record contents. */
3023 off_t start; /* Starting offset in file. */
3024 size_t pos; /* Current position in buffer. */
3025 int n_warnings; /* Number of warnings issued or suppressed. */
3026 bool recoded; /* Recoded into UTF-8? */
/* Creates a text record, allocated from R's pool, for reading the
   RECORD->size * RECORD->count bytes of RECORD's data as text.  If
   RECODE_TO_UTF8 is true, the buffer is obtained from recode_substring_pool,
   given "UTF-8" and R's encoding.  The record's file position is remembered
   as the starting offset and the warning count starts at zero.  Release the
   result with close_text_record.  */
3029 static struct text_record *
3030 open_text_record (struct sfm_reader *r,
3031 const struct sfm_extension_record *record,
3032 bool recode_to_utf8)
3034 struct text_record *text;
3035 struct substring raw;
3037 text = pool_alloc (r->pool, sizeof *text);
3038 raw = ss_buffer (record->data, record->size * record->count);
3039 text->start = record->pos;
3040 text->buffer = (recode_to_utf8
3041 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
3044 text->n_warnings = 0;
3045 text->recoded = recode_to_utf8;
3050 /* Closes TEXT, frees its storage, and issues a final warning
3051 about suppressed warnings if necessary. */
3053 close_text_record (struct sfm_reader *r, struct text_record *text)
/* text_warn counts every warning but displays only the first
   MAX_TEXT_WARNINGS, so any excess is the number that was suppressed.  */
3055 if (text->n_warnings > MAX_TEXT_WARNINGS)
3056 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
3057 text->n_warnings - MAX_TEXT_WARNINGS);
/* Return TEXT's buffer to R's pool.  */
3059 pool_free (r->pool, ss_data (text->buffer));
3062 /* Reads a variable=value pair from TEXT.
3063 Looks up the variable in DICT and stores it into *VAR.
3064 Stores a null-terminated value into *VALUE. */
/* The variable is identified by its short name, which ends at "="; the
   value ends at a tab or a null byte.  */
3066 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
3067 struct text_record *text,
3068 struct variable **var, char **value)
3072 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
3075 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
/* Skip any run of tab and null bytes that follows the value.  */
3079 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
3080 ss_buffer ("\t\0", 2));
/* Reads from TEXT a token that ends at one of DELIMITERS, looks it up as a
   variable name in DICT, and stores the result of the lookup in *VAR.  A
   warning, subject to TEXT's warning limit, is issued for a name that DICT
   does not contain.  */
3088 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3089 struct text_record *text, struct substring delimiters,
3090 struct variable **var)
3094 name = text_get_token (text, delimiters, NULL);
3098 *var = dict_lookup_var (dict, name);
3102 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads from TEXT a token that ends at one of DELIMITERS, treats it as a
   variable's short name, and stores the result of looking it up in DICT in
   *VAR.  A warning, subject to TEXT's warning limit, is issued for an
   unknown variable.  */
3109 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3110 struct text_record *text, struct substring delimiters,
3111 struct variable **var)
3113 char *short_name = text_get_token (text, delimiters, NULL);
/* A null token means there was nothing left to read from TEXT.  */
3114 if (short_name == NULL)
3117 *var = dict_lookup_var (dict, short_name);
3119 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3124 /* Displays a warning for the current file position, limiting the
3125 number to MAX_TEXT_WARNINGS for TEXT. */
3127 text_warn (struct sfm_reader *r, struct text_record *text,
3128 const char *format, ...)
/* Warnings beyond the limit are still counted, which lets
   close_text_record report how many were suppressed.  */
3130 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3134 va_start (args, format);
/* The reported offset is TEXT's starting file offset plus the current
   position within its buffer.  */
3135 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts from TEXT's buffer the next token that ends at any byte in
   DELIMITERS, advancing TEXT's position past it.  On success, returns a
   pointer to the token's data inside TEXT's buffer.  If DELIMITER is
   nonnull, a byte from TEXT's buffer is stored in *DELIMITER.  */
3141 text_get_token (struct text_record *text, struct substring delimiters,
3144 struct substring token;
3147 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3149 if (delimiter != NULL)
3150 *delimiter = ss_data (text->buffer)[text->pos-1];
/* END points just past the last byte of the token.  */
3154 end = &ss_data (token)[ss_length (token)];
3155 if (delimiter != NULL)
3158 return ss_data (token);
3161 /* Reads an integer value expressed in decimal, then a space, then a string that
3162 consists of exactly as many bytes as specified by the integer, then a space,
3163 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3164 buffer (so the caller should not free the string). */
/* Each syntax problem is reported with sys_warn at TEXT's starting file
   offset; two of the messages name the MRSETS record specifically.

   NOTE(review): the digits are accumulated into N without an overflow check,
   and the later bounds test adds N to the current position -- confirm that
   an oversized count cannot wrap around.  */
3166 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3174 while (text->pos < text->buffer.length)
3176 int c = text->buffer.string[text->pos];
3177 if (c < '0' || c > '9')
3179 n = (n * 10) + (c - '0');
/* Fail if the buffer ended during the count or no digit was found.  */
3182 if (text->pos >= text->buffer.length || start == text->pos)
3184 sys_warn (r, text->start,
3185 _("Expecting digit at offset %zu in MRSETS record."),
3190 if (!text_match (text, ' '))
3192 sys_warn (r, text->start,
3193 _("Expecting space at offset %zu in MRSETS record."),
/* The counted string must fit in what remains of the buffer.  */
3198 if (text->pos + n > text->buffer.length)
3200 sys_warn (r, text->start,
3201 _("%zu-byte string starting at offset %zu "
3202 "exceeds record length %zu."),
3203 n, text->pos, text->buffer.length);
3207 s = &text->buffer.string[text->pos];
3210 sys_warn (r, text->start,
3211 _("Expecting space at offset %zu following %zu-byte string."),
/* Tests whether the byte at TEXT's current position is C.  At or past the
   end of TEXT's buffer there is no byte to compare.  Callers treat a false
   result as meaning that C is not the next byte.  */
3221 text_match (struct text_record *text, char c)
3223 if (text->pos >= text->buffer.length)
3226 if (text->buffer.string[text->pos] == c)
3235 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3236 inside the TEXT's string. */
/* TEXT is taken through a pointer to const.  */
3238 text_pos (const struct text_record *text)
/* Returns a pointer to the start of TEXT's entire buffer, regardless of
   TEXT's current position.  */
3244 text_get_all (const struct text_record *text)
3246 return text->buffer.string;
3251 /* Displays a corruption message. */
/* The message text begins with the name of R's file and, in one of the two
   forms built below, OFFSET in hexadecimal; callers in this file pass -1
   when they have no meaningful offset.  FORMAT and ARGS supply the rest of
   the text, and CLASS determines the message's category and severity.  */
3253 sys_msg (struct sfm_reader *r, off_t offset,
3254 int class, const char *format, va_list args)
3258 ds_init_empty (&text);
3260 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3261 fh_get_file_name (r->fh), (long long int) offset);
3263 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3264 ds_put_vformat (&text, format, args);
/* The message structure is heap-allocated and takes over the string built
   in TEXT.  */
3266 struct msg *m = xmalloc (sizeof *m);
3268 .category = msg_class_to_category (class),
3269 .severity = msg_class_to_severity (class),
3270 .text = ds_steal_cstr (&text),
3275 /* Displays a warning for offset OFFSET in the file. */
/* FORMAT and the arguments that follow it are passed to sys_msg with
   class MW.  */
3277 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3281 va_start (args, format);
3282 sys_msg (r, offset, MW, format, args);
/* Counterpart of sys_warn for errors: FORMAT and the arguments that follow
   it are passed to sys_msg with class ME, for offset OFFSET in the file.  */
3286 /* Displays an error for the current file position and marks it as in an error
3289 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3293 va_start (args, format);
3294 sys_msg (r, offset, ME, format, args);
/* Reads N_BYTES bytes from R's file into BUF (the comment below calls this
   count BYTE_CNT).  Every byte actually read is added to R->pos, even on a
   short read.  A stream error is reported with strerror (errno).  An
   unexpected end of file is reported when EOF_IS_OK is false or when some,
   but not all, of the bytes were read.  */
3300 /* Reads BYTE_CNT bytes into BUF.
3301 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3302 Returns -1 if an I/O error or a partial read occurs.
3303 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3306 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3307 void *buf, size_t n_bytes)
3309 size_t bytes_read = fread (buf, 1, n_bytes, r->file);
3310 r->pos += bytes_read;
3311 if (bytes_read == n_bytes)
3313 else if (ferror (r->file))
3315 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3318 else if (!eof_is_ok || bytes_read != 0)
3320 sys_error (r, r->pos, _("Unexpected end of file."));
3327 /* Reads N_BYTES bytes into BUF.
3328 Returns true if successful.
3329 Returns false upon I/O error or if end-of-file is encountered. */
/* read_bytes_internal is called with EOF_IS_OK false, so an end of file is
   always reported as an error.  */
3331 read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
3333 return read_bytes_internal (r, false, buf, n_bytes) == 1;
3336 /* Reads N_BYTES bytes into BUF.
3337 Returns 1 if exactly N_BYTES bytes are successfully read.
3338 Returns 0 if an immediate end-of-file is encountered.
3339 Returns -1 if an I/O error or a partial read occurs. */
/* read_bytes_internal is called with EOF_IS_OK true, so an immediate end of
   file is not reported as an error.  */
3341 try_read_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
3343 return read_bytes_internal (r, true, buf, n_bytes);
3346 /* Reads a 32-bit signed integer from R and stores its value in host format in
3347 *X. Returns true if successful, otherwise false. */
3349 read_int (struct sfm_reader *r, int *x)
3352 if (read_bytes (r, integer, sizeof integer) != 1)
/* Decode the raw bytes according to R's integer format.  */
3354 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Unsigned counterpart of read_int: reads the value from R with read_int
   into a temporary and, presumably, stores it in *X converted to
   unsigned.  */
3359 read_uint (struct sfm_reader *r, unsigned int *x)
3364 ok = read_int (r, &y);
/* 64-bit counterpart of read_int: the raw bytes are read with read_bytes,
   decoded according to R's integer format, and stored in *X.  */
3369 /* Reads a 64-bit signed integer from R and returns its value in
3372 read_int64 (struct sfm_reader *r, long long int *x)
3375 if (read_bytes (r, integer, sizeof integer) != 1)
3377 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Unsigned counterpart of read_int64.  Despite the wording of the comment
   below, *X is unsigned here; the value is read with read_int64 into a
   temporary and, presumably, stored in *X converted to unsigned.  */
3381 /* Reads a 64-bit signed integer from R and returns its value in
3384 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3389 ok = read_int64 (r, &y);
/* Returns the 4-byte integer found at byte offset OFS within DATA, decoded
   according to R's integer format.  No bounds checking is done here.  */
3395 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3397 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
/* Returns the floating-point number found at byte offset OFS within DATA,
   decoded according to R's floating-point format.  No bounds checking is
   done here.  */
3401 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3403 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3406 /* Reads exactly SIZE - 1 bytes into BUFFER
3407 and stores a null byte into BUFFER[SIZE - 1]. */
/* SIZE is therefore the full capacity that BUFFER must have, including the
   terminating null byte.  */
3409 read_string (struct sfm_reader *r, char *buffer, size_t size)
3414 ok = read_bytes (r, buffer, size - 1);
3416 buffer[size - 1] = '\0';
3420 /* Skips BYTES bytes forward in R. */
3422 skip_bytes (struct sfm_reader *r, size_t bytes)
/* The data is read, not seeked over, at most one buffer-sized chunk at a
   time.  */
3427 size_t chunk = MIN (sizeof buffer, bytes);
3428 if (!read_bytes (r, buffer, chunk))
/* The result of fix_line_ends is allocated with room for strlen (S) + 1
   bytes.  That suffices because, per the description below, each
   replacement is no longer than the text it replaces.  */
3436 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3437 been replaced by LFs.
3439 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3440 files that use CR-only line ends in the file label and extra product
3443 fix_line_ends (const char *s)
3447 d = dst = xmalloc (strlen (s) + 1);
/* Forward declaration; read_ztrailer is called by read_zheader and defined
   later in this file.  */
3466 read_ztrailer (struct sfm_reader *r,
3467 long long int zheader_ofs,
3468 long long int ztrailer_len);
/* Memory allocation callback that read_zheader installs in
   R->zstream.zalloc.  POOL_ is the pool installed as the stream's opaque
   pointer.  A request with a zero SIZE, or whose ITEMS * SIZE is flagged by
   xalloc_oversized, is not passed to pool_malloc.  */
3471 zalloc (voidpf pool_, uInt items, uInt size)
3473 struct pool *pool = pool_;
3475 return (!size || xalloc_oversized (items, size)
3477 : pool_malloc (pool, items * size));
/* Memory release callback that read_zheader installs in R->zstream.zfree.
   Returns ADDRESS to the pool POOL_.  */
3481 zfree (voidpf pool_, voidpf address)
3483 struct pool *pool = pool_;
3485 pool_free (pool, address);
/* Reads the ZLIB data header from R: three 64-bit integers that give the
   header's own offset, the trailer's offset, and the trailer's length.
   After validating them, checks the trailer with read_ztrailer, sets up R's
   input and output buffers and the allocation callbacks of R->zstream, and
   opens the stream with open_zstream.  */
3489 read_zheader (struct sfm_reader *r)
3492 long long int zheader_ofs;
3493 long long int ztrailer_ofs;
3494 long long int ztrailer_len;
3496 if (!read_int64 (r, &zheader_ofs)
3497 || !read_int64 (r, &ztrailer_ofs)
3498 || !read_int64 (r, &ztrailer_len))
3501 if (zheader_ofs != pos)
3503 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3504 "(expected %#llx)."),
3505 zheader_ofs, (long long int) pos);
/* The trailer cannot start before the current position in the file.  */
3509 if (ztrailer_ofs < r->pos)
3511 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
/* The trailer length must be a positive multiple of 24 bytes;
   read_ztrailer expects (ZTRAILER_LEN - 24) / 24 block descriptors.  */
3516 if (ztrailer_len < 24 || ztrailer_len % 24)
3518 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3522 r->ztrailer_ofs = ztrailer_ofs;
3523 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
/* The buffers are allocated from R's pool only if R does not have them
   yet.  */
3526 if (r->zin_buf == NULL)
3528 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3529 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3530 r->zstream.next_in = NULL;
3531 r->zstream.avail_in = 0;
/* Have ZLIB allocate its memory from R's pool.  */
3534 r->zstream.zalloc = zalloc;
3535 r->zstream.zfree = zfree;
3536 r->zstream.opaque = r->pool;
3538 return open_zstream (r);
/* Seeks R's file to absolute position OFFSET with fseeko, reporting an
   error through sys_error, with the system's error string, if that
   fails.  */
3542 seek (struct sfm_reader *r, off_t offset)
3544 if (fseeko (r->file, offset, SEEK_SET))
3545 sys_error (r, 0, _("%s: seek failed (%s)."),
3546 fh_get_file_name (r->fh), strerror (errno));
/* Cross-checks the ZLIB data trailer of R's file against ZHEADER_OFS,
   ZTRAILER_LEN, R's bias, and the file's size, then walks the trailer's
   block descriptors, checking that each descriptor's offsets continue from
   where the previous block ended.  Finally seeks to ZHEADER_OFS + 24, which
   is where the first compressed block is expected to start.  The checks are
   skipped for anything but a regular file.  */
3550 /* Performs some additional consistency checks on the ZLIB compressed data
3553 read_ztrailer (struct sfm_reader *r,
3554 long long int zheader_ofs,
3555 long long int ztrailer_len)
3557 long long int expected_uncmp_ofs;
3558 long long int expected_cmp_ofs;
3561 unsigned int block_size;
3562 unsigned int n_blocks;
3566 if (fstat (fileno (r->file), &s))
3568 sys_error (r, 0, _("%s: stat failed (%s)."),
3569 fh_get_file_name (r->fh), strerror (errno));
3573 if (!S_ISREG (s.st_mode))
3575 /* We can't seek to the trailer and then back to the data in this file,
3576 so skip doing extra checks. */
3580 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3581 sys_warn (r, r->pos,
3582 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3583 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
/* NOTE(review): no result from seek is examined here or at the end of this
   function -- confirm that a failed seek is handled.  */
3585 seek (r, r->ztrailer_ofs);
3587 /* Read fixed header from ZLIB data trailer. */
3588 if (!read_int64 (r, &bias))
/* The trailer's bias is expected to be the negation of R's bias.  */
3590 if (-bias != r->bias)
3592 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3593 "file header bias (%.2f)."),
3598 if (!read_int64 (r, &zero))
3601 sys_warn (r, r->pos,
3602 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3604 if (!read_uint (r, &block_size))
3606 if (block_size != ZBLOCK_SIZE)
3607 sys_warn (r, r->pos,
3608 _("ZLIB trailer specifies unexpected %u-byte block size."),
3611 if (!read_uint (r, &n_blocks))
/* The 24-byte fixed header is followed by one 24-byte descriptor per
   block.  */
3613 if (n_blocks != (ztrailer_len - 24) / 24)
3615 sys_error (r, r->pos,
3616 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3618 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
/* The first block is expected to start right after the 24-byte ZLIB data
   header.  */
3622 expected_uncmp_ofs = zheader_ofs;
3623 expected_cmp_ofs = zheader_ofs + 24;
3624 for (i = 0; i < n_blocks; i++)
3626 off_t desc_ofs = r->pos;
3627 unsigned long long int uncompressed_ofs;
3628 unsigned long long int compressed_ofs;
3629 unsigned int uncompressed_size;
3630 unsigned int compressed_size;
3632 if (!read_uint64 (r, &uncompressed_ofs)
3633 || !read_uint64 (r, &compressed_ofs)
3634 || !read_uint (r, &uncompressed_size)
3635 || !read_uint (r, &compressed_size))
3638 if (uncompressed_ofs != expected_uncmp_ofs)
3640 sys_error (r, desc_ofs,
3641 _("ZLIB block descriptor %u reported uncompressed data "
3642 "offset %#llx, when %#llx was expected."),
3643 i, uncompressed_ofs, expected_uncmp_ofs);
3647 if (compressed_ofs != expected_cmp_ofs)
3649 sys_error (r, desc_ofs,
3650 _("ZLIB block descriptor %u reported compressed data "
3651 "offset %#llx, when %#llx was expected."),
3652 i, compressed_ofs, expected_cmp_ofs);
/* Every block except the last is expected to hold exactly BLOCK_SIZE
   uncompressed bytes; the last block may hold fewer.  A mismatch only
   draws a warning.  */
3656 if (i < n_blocks - 1)
3658 if (uncompressed_size != block_size)
3659 sys_warn (r, desc_ofs,
3660 _("ZLIB block descriptor %u reported block size %#x, "
3661 "when %#x was expected."),
3662 i, uncompressed_size, block_size);
3666 if (uncompressed_size > block_size)
3667 sys_warn (r, desc_ofs,
3668 _("ZLIB block descriptor %u reported block size %#x, "
3669 "when at most %#x was expected."),
3670 i, uncompressed_size, block_size);
3673 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3674 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3675 This code checks for an expansion of more than 14.3% plus 11
3677 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3679 sys_error (r, desc_ofs,
3680 _("ZLIB block descriptor %u reports compressed size %u "
3681 "and uncompressed size %u."),
3682 i, compressed_size, uncompressed_size);
3686 expected_uncmp_ofs += uncompressed_size;
3687 expected_cmp_ofs += compressed_size;
3690 if (expected_cmp_ofs != r->ztrailer_ofs)
3692 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3693 "would be expected from block descriptors."),
3694 r->ztrailer_ofs, expected_cmp_ofs);
3698 seek (r, zheader_ofs + 24);
/* Marks R's buffer of inflated data as empty and initializes R->zstream
   with inflateInit, reporting an error through sys_error if ZLIB
   initialization fails.  */
3703 open_zstream (struct sfm_reader *r)
3707 r->zout_pos = r->zout_end = 0;
3708 error = inflateInit (&r->zstream);
3711 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
/* Ends R->zstream with inflateEnd, reporting an error through sys_error if
   ZLIB indicates an inconsistency at the end of the stream.  */
3719 close_zstream (struct sfm_reader *r)
3723 error = inflateEnd (&r->zstream);
3726 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
/* Reads N_BYTES bytes of inflated data from R into BUF_.  Data already
   inflated into R->zout_buf is used first.  When that runs out, more
   compressed input is read from the file, never beyond R->ztrailer_ofs, and
   inflated into R->zout_buf.  */
3734 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t n_bytes)
3736 uint8_t *buf = buf_;
3745 /* Use already inflated data if there is any. */
3746 if (r->zout_pos < r->zout_end)
3748 unsigned int n = MIN (n_bytes, r->zout_end - r->zout_pos);
3749 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3758 /* We need to inflate some more data.
3759 Get some more input data if we don't have any. */
3760 if (r->zstream.avail_in == 0)
/* Read at most one input buffer's worth, stopping at the start of the ZLIB
   trailer.  */
3762 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3767 int retval = try_read_bytes (r, r->zin_buf, n);
3770 r->zstream.avail_in = n;
3771 r->zstream.next_in = r->zin_buf;
3775 /* Inflate the (remaining) input data. */
3776 r->zstream.avail_out = ZOUT_BUF_SIZE;
3777 r->zstream.next_out = r->zout_buf;
3778 error = inflate (&r->zstream, Z_SYNC_FLUSH);
/* The amount of output produced is the distance that next_out advanced.  */
3780 r->zout_end = r->zstream.next_out - r->zout_buf;
3781 if (r->zout_end == 0)
3783 if (error != Z_STREAM_END)
3785 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
/* At the end of a stream, the stream is closed and a new one opened --
   presumably because another compressed block may follow.  */
3789 else if (!close_zstream (r) || !open_zstream (r))
3794 /* Process the output data and ignore 'error' for now. ZLIB will
3795 present it to us again on the next inflate() call. */
/* Reads N_BYTES bytes of compressed case data from R into BUF: directly
   from the file with read_bytes when R uses ANY_COMP_SIMPLE, otherwise
   through read_bytes_zlib.  An unexpected end of the ZLIB compressed data
   is reported as an error.  */
3801 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
3803 if (r->compression == ANY_COMP_SIMPLE)
3804 return read_bytes (r, buf, n_bytes);
3807 int retval = read_bytes_zlib (r, buf, n_bytes);
3809 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
/* Like read_compressed_bytes, but uses try_read_bytes when R uses
   ANY_COMP_SIMPLE and returns the result of read_bytes_zlib unchanged
   otherwise, without reporting an error of its own.  */
3815 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t n_bytes)
3817 if (r->compression == ANY_COMP_SIMPLE)
3818 return try_read_bytes (r, buf, n_bytes);
3820 return read_bytes_zlib (r, buf, n_bytes);
3823 /* Reads a 64-bit floating-point number from R and stores its
3824 value in host format in *D. */
/* NOTE(review): read_compressed_string treats any result from
   read_compressed_bytes other than 1 as a failure, whereas the test below
   treats only 0 as a failure -- confirm that a negative result cannot
   reach this point.  */
3826 read_compressed_float (struct sfm_reader *r, double *d)
3830 if (!read_compressed_bytes (r, number, sizeof number))
3833 *d = float_get_double (r->float_format, number);
/* Casereader class for system files.  Its first two members are the
   functions that read one case (sys_file_casereader_read) and destroy the
   reader (sys_file_casereader_destroy).  */
3837 static const struct casereader_class sys_file_casereader_class =
3839 sys_file_casereader_read,
3840 sys_file_casereader_destroy,
3845 const struct any_reader_class sys_file_reader_class =
3847 N_("SPSS System File"),