1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/message.h"
49 #include "libpspp/misc.h"
50 #include "libpspp/pool.h"
51 #include "libpspp/str.h"
52 #include "libpspp/stringi-set.h"
54 #include "gl/c-strtod.h"
55 #include "gl/c-ctype.h"
56 #include "gl/inttostr.h"
57 #include "gl/localcharset.h"
58 #include "gl/minmax.h"
59 #include "gl/unlocked-io.h"
60 #include "gl/xalloc.h"
61 #include "gl/xalloc-oversized.h"
65 #define _(msgid) gettext (msgid)
66 #define N_(msgid) (msgid)
70 /* subtypes 0-2 unknown */
71 EXT_INTEGER = 3, /* Machine integer info. */
72 EXT_FLOAT = 4, /* Machine floating-point info. */
73 EXT_VAR_SETS = 5, /* Variable sets. */
74 EXT_DATE = 6, /* DATE. */
75 EXT_MRSETS = 7, /* Multiple response sets. */
76 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
77 /* subtype 9 unknown */
78 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
79 EXT_DISPLAY = 11, /* Variable display parameters. */
80 /* subtype 12 unknown */
81 EXT_LONG_NAMES = 13, /* Long variable names. */
82 EXT_LONG_STRINGS = 14, /* Long strings. */
83 /* subtype 15 unknown */
84 EXT_NCASES = 16, /* Extended number of cases. */
85 EXT_FILE_ATTRS = 17, /* Data file attributes. */
86 EXT_VAR_ATTRS = 18, /* Variable attributes. */
87 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
88 EXT_ENCODING = 20, /* Character encoding. */
89 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
90 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
91 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
94 /* Fields from the top-level header record. */
95 struct sfm_header_record
97 char magic[5]; /* First 4 bytes of file, then null. */
98 int weight_idx; /* 0 if unweighted, otherwise a var index. */
99 int nominal_case_size; /* Number of var positions. */
101 /* These correspond to the members of struct any_read_info or a dictionary
102 but in the system file's encoding rather than ASCII. */
103 char creation_date[10]; /* "dd mmm yy". */
104 char creation_time[9]; /* "hh:mm:ss". */
105 char eye_catcher[61]; /* Eye-catcher string, then product name. */
106 char file_label[65]; /* File label. */
109 struct sfm_var_record
116 int missing_value_code;
119 struct variable *var;
122 struct sfm_value_label
128 struct sfm_value_label_record
131 struct sfm_value_label *labels;
132 unsigned int n_labels;
138 struct sfm_document_record
147 const char *name; /* Name. */
148 const char *label; /* Human-readable label for group. */
149 enum mrset_type type; /* Group type. */
150 const char **vars; /* Constituent variables' names. */
151 size_t n_vars; /* Number of constituent variables. */
154 enum mrset_md_cat_source cat_source; /* Source of category labels. */
155 bool label_from_var_label; /* 'label' taken from variable label? */
156 const char *counted; /* Counted value, as string. */
159 struct sfm_extension_record
161 int subtype; /* Record subtype. */
162 off_t pos; /* Starting offset in file. */
163 unsigned int size; /* Size of data elements. */
164 unsigned int count; /* Number of data elements. */
165 void *data; /* Contents. */
168 /* System file reader. */
171 struct any_reader any_reader;
173 /* Resource tracking. */
174 struct pool *pool; /* All system file state. */
177 struct any_read_info info;
178 struct sfm_header_record header;
179 struct sfm_var_record *vars;
181 struct sfm_value_label_record *labels;
183 struct sfm_document_record *document;
184 struct sfm_mrset *mrsets;
186 struct sfm_extension_record *extensions[32];
189 struct file_handle *fh; /* File handle. */
190 struct fh_lock *lock; /* Mutual exclusion for file handle. */
191 FILE *file; /* File stream. */
192 off_t pos; /* Position in file. */
193 bool error; /* I/O or corruption error? */
194 struct caseproto *proto; /* Format of output cases. */
197 enum integer_format integer_format; /* On-disk integer format. */
198 enum float_format float_format; /* On-disk floating point format. */
199 struct sfm_var *sfm_vars; /* Variables. */
200 size_t sfm_var_cnt; /* Number of variables. */
201 int case_cnt; /* Number of cases */
202 const char *encoding; /* String encoding. */
205 enum any_compression compression;
206 double bias; /* Compression bias, usually 100.0. */
207 uint8_t opcodes[8]; /* Current block of opcodes. */
208 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
209 bool corruption_warning; /* Warned about possible corruption? */
211 /* ZLIB decompression. */
212 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
213 #define ZIN_BUF_SIZE 4096
214 uint8_t *zin_buf; /* Inflation input buffer. */
215 #define ZOUT_BUF_SIZE 16384
216 uint8_t *zout_buf; /* Inflation output buffer. */
217 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
218 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
219 z_stream zstream; /* ZLIB inflater. */
222 static const struct casereader_class sys_file_casereader_class;
/* Converts R_, a generic any_reader, into the struct sfm_reader whose
   any_reader member it is.  Asserts first that R_'s class is
   sys_file_reader_class.
   NOTE(review): assumes UP_CAST is a container-of style macro that maps a
   pointer to the named member back to the enclosing struct -- confirm against
   its definition. */
224 static struct sfm_reader *
225 sfm_reader_cast (const struct any_reader *r_)
227 assert (r_->klass == &sys_file_reader_class);
228 return UP_CAST (r_, struct sfm_reader, any_reader);
231 static bool sfm_close (struct any_reader *);
233 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
234 const struct sfm_var_record *,
237 static void sys_msg (struct sfm_reader *r, off_t, int class,
238 const char *format, va_list args)
239 PRINTF_FORMAT (4, 0);
240 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
241 PRINTF_FORMAT (3, 4);
242 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
243 PRINTF_FORMAT (3, 4);
245 static bool read_bytes (struct sfm_reader *, void *, size_t)
247 static int try_read_bytes (struct sfm_reader *, void *, size_t)
249 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
250 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
251 static bool read_int64 (struct sfm_reader *, long long int *)
253 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
255 static bool read_string (struct sfm_reader *, char *, size_t)
257 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
259 /* ZLIB compressed data handling. */
260 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
261 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
262 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
263 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
265 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
267 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
269 static bool read_compressed_float (struct sfm_reader *, double *)
272 static char *fix_line_ends (const char *);
274 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
275 static double parse_float (const struct sfm_reader *,
276 const void *data, size_t ofs);
278 static bool read_variable_record (struct sfm_reader *,
279 struct sfm_var_record *);
280 static bool read_value_label_record (struct sfm_reader *,
281 struct sfm_value_label_record *);
282 static struct sfm_document_record *read_document_record (struct sfm_reader *);
283 static bool read_extension_record (struct sfm_reader *, int subtype,
284 struct sfm_extension_record **);
285 static bool skip_extension_record (struct sfm_reader *, int subtype);
287 static struct text_record *open_text_record (
288 struct sfm_reader *, const struct sfm_extension_record *,
289 bool recode_to_utf8);
290 static void close_text_record (struct sfm_reader *,
291 struct text_record *);
292 static bool read_variable_to_value_pair (struct sfm_reader *,
294 struct text_record *,
295 struct variable **var, char **value);
296 static void text_warn (struct sfm_reader *r, struct text_record *text,
297 const char *format, ...)
298 PRINTF_FORMAT (3, 4);
299 static char *text_get_token (struct text_record *,
300 struct substring delimiters, char *delimiter);
301 static bool text_match (struct text_record *, char c);
302 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
303 struct text_record *,
304 struct substring delimiters,
306 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
307 struct text_record *,
308 struct substring delimiters,
310 static const char *text_parse_counted_string (struct sfm_reader *,
311 struct text_record *);
312 static size_t text_pos (const struct text_record *);
313 static const char *text_get_all (const struct text_record *);
315 /* Dictionary reader. */
323 static bool read_dictionary (struct sfm_reader *);
324 static bool read_record (struct sfm_reader *, int type,
325 size_t *allocated_vars, size_t *allocated_labels);
326 static bool read_header (struct sfm_reader *, struct any_read_info *,
327 struct sfm_header_record *);
328 static void parse_header (struct sfm_reader *,
329 const struct sfm_header_record *,
330 struct any_read_info *, struct dictionary *);
331 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
332 struct sfm_var_record *, size_t n);
333 static void parse_format_spec (struct sfm_reader *, off_t pos,
334 unsigned int format, enum which_format,
335 struct variable *, int *format_warning_cnt);
336 static void parse_document (struct dictionary *, struct sfm_document_record *);
337 static void parse_display_parameters (struct sfm_reader *,
338 const struct sfm_extension_record *,
339 struct dictionary *);
340 static bool parse_machine_integer_info (struct sfm_reader *,
341 const struct sfm_extension_record *,
342 struct any_read_info *);
343 static void parse_machine_float_info (struct sfm_reader *,
344 const struct sfm_extension_record *);
345 static void parse_extra_product_info (struct sfm_reader *,
346 const struct sfm_extension_record *,
347 struct any_read_info *);
348 static void parse_mrsets (struct sfm_reader *,
349 const struct sfm_extension_record *,
350 size_t *allocated_mrsets);
351 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
352 static void parse_long_var_name_map (struct sfm_reader *,
353 const struct sfm_extension_record *,
354 struct dictionary *);
355 static bool parse_long_string_map (struct sfm_reader *,
356 const struct sfm_extension_record *,
357 struct dictionary *);
358 static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
359 const struct sfm_var_record *,
361 const struct sfm_value_label_record *);
362 static void parse_data_file_attributes (struct sfm_reader *,
363 const struct sfm_extension_record *,
364 struct dictionary *);
365 static void parse_variable_attributes (struct sfm_reader *,
366 const struct sfm_extension_record *,
367 struct dictionary *);
368 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
369 static void parse_long_string_value_labels (struct sfm_reader *,
370 const struct sfm_extension_record *,
371 struct dictionary *);
372 static void parse_long_string_missing_values (
373 struct sfm_reader *, const struct sfm_extension_record *,
374 struct dictionary *);
376 /* Frees the strings inside INFO. */
378 any_read_info_destroy (struct any_read_info *info)
382 free (info->creation_date);
383 free (info->creation_time);
384 free (info->product);
385 free (info->product_ext);
389 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
390 successful, otherwise NULL. */
391 static struct any_reader *
392 sfm_open (struct file_handle *fh)
394 size_t allocated_mrsets = 0;
395 struct sfm_reader *r;
397 /* Create and initialize reader. */
398 r = xzalloc (sizeof *r);
399 r->any_reader.klass = &sys_file_reader_class;
400 r->pool = pool_create ();
401 pool_register (r->pool, free, r);
403 r->opcode_idx = sizeof r->opcodes;
405 /* TRANSLATORS: this fragment will be interpolated into
406 messages in fh_lock() that identify types of files. */
407 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
411 r->file = fn_open (fh, "rb");
414 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
415 fh_get_file_name (r->fh), strerror (errno));
419 if (!read_dictionary (r))
422 if (r->extensions[EXT_MRSETS] != NULL)
423 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
425 if (r->extensions[EXT_MRSETS2] != NULL)
426 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
428 return &r->any_reader;
432 sfm_close (&r->any_reader);
437 read_dictionary (struct sfm_reader *r)
439 size_t allocated_vars;
440 size_t allocated_labels;
442 if (!read_header (r, &r->info, &r->header))
446 allocated_labels = 0;
451 if (!read_int (r, &type))
455 if (!read_record (r, type, &allocated_vars, &allocated_labels))
459 if (!skip_bytes (r, 4))
462 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
469 read_record (struct sfm_reader *r, int type,
470 size_t *allocated_vars, size_t *allocated_labels)
477 if (r->n_vars >= *allocated_vars)
478 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
480 return read_variable_record (r, &r->vars[r->n_vars++]);
483 if (r->n_labels >= *allocated_labels)
484 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
486 return read_value_label_record (r, &r->labels[r->n_labels++]);
489 /* A Type 4 record is always immediately after a type 3 record,
490 so the code for type 3 records reads the type 4 record too. */
491 sys_error (r, r->pos, _("Misplaced type 4 record."));
495 if (r->document != NULL)
497 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
500 r->document = read_document_record (r);
501 return r->document != NULL;
504 if (!read_int (r, &subtype))
507 || subtype >= sizeof r->extensions / sizeof *r->extensions)
510 _("Unrecognized record type 7, subtype %d. For help, "
511 "please send this file to %s and mention that you were "
513 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
514 return skip_extension_record (r, subtype);
516 else if (r->extensions[subtype] != NULL)
519 _("Record type 7, subtype %d found here has the same "
520 "type as the record found near offset 0x%llx. For "
521 "help, please send this file to %s and mention that "
522 "you were using %s."),
523 subtype, (long long int) r->extensions[subtype]->pos,
524 PACKAGE_BUGREPORT, PACKAGE_STRING);
525 return skip_extension_record (r, subtype);
528 return read_extension_record (r, subtype, &r->extensions[subtype]);
531 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
538 /* Returns the character encoding obtained from R, or a null pointer if R
539 doesn't have an indication of its character encoding. */
541 sfm_get_encoding (const struct sfm_reader *r)
543 /* The EXT_ENCODING record is the best way to determine dictionary
545 if (r->extensions[EXT_ENCODING])
546 return r->extensions[EXT_ENCODING]->data;
548 /* But EXT_INTEGER is better than nothing as a fallback. */
549 if (r->extensions[EXT_INTEGER])
551 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
552 const char *encoding;
561 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
562 respectively. However, many files have character code 2 but data
563 which are clearly not ASCII. Therefore, ignore these values. */
570 encoding = sys_get_encoding_from_codepage (codepage);
571 if (encoding != NULL)
577 /* If the file magic number is EBCDIC then its character data is too. */
578 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
584 struct get_strings_aux
/* Stores one entry at index AUX->n of AUX's three parallel arrays: TITLE in
   titles[], a copy of STRING made with pool_strdup() on AUX's pool in
   strings[], and ID in ids[].  TITLE itself is stored by pointer, not
   copied.  When AUX->n has reached AUX->allocated, all three arrays are first
   reallocated from AUX's pool to the new capacity. */
595 add_string__ (struct get_strings_aux *aux,
596 const char *string, bool id, char *title)
598 if (aux->n >= aux->allocated)
/* New capacity is 2 * (old + 1), which also grows correctly from zero. */
600 aux->allocated = 2 * (aux->allocated + 1);
601 aux->titles = pool_realloc (aux->pool, aux->titles,
602 aux->allocated * sizeof *aux->titles);
603 aux->strings = pool_realloc (aux->pool, aux->strings,
604 aux->allocated * sizeof *aux->strings);
605 aux->ids = pool_realloc (aux->pool, aux->ids,
606 aux->allocated * sizeof *aux->ids);
609 aux->titles[aux->n] = title;
610 aux->strings[aux->n] = pool_strdup (aux->pool, string);
611 aux->ids[aux->n] = id;
/* Records STRING in AUX with an ID flag of false.  TITLE is a printf-style
   format string; it is expanded with the trailing arguments by
   pool_vasprintf() using AUX's pool, and the result is handed to
   add_string__() as the entry's title. */
615 static void PRINTF_FORMAT (3, 4)
616 add_string (struct get_strings_aux *aux,
617 const char *string, const char *title, ...)
621 va_start (args, title);
622 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
/* Records ID in AUX with an ID flag of true.  TITLE is a printf-style format
   string; it is expanded with the trailing arguments by pool_vasprintf()
   using AUX's pool, and the result is handed to add_string__() as the entry's
   title. */
626 static void PRINTF_FORMAT (3, 4)
627 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
631 va_start (args, title);
632 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
636 /* Retrieves significant string data from R in its raw format, to allow the
637 caller to try to detect the encoding in use.
639 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
640 and *STRINGSP to an array of N elements allocated from POOL. For each I in
641 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
642 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
643 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
646 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
647 char ***titlesp, bool **idsp, char ***stringsp)
649 struct sfm_reader *r = sfm_reader_cast (r_);
650 const struct sfm_mrset *mrset;
651 struct get_strings_aux aux;
663 for (i = 0; i < r->n_vars; i++)
664 if (r->vars[i].width != -1)
665 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
668 for (i = 0; i < r->n_vars; i++)
669 if (r->vars[i].width != -1)
672 if (r->vars[i].label)
673 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
678 for (i = 0; i < r->n_labels; i++)
679 for (j = 0; j < r->labels[i].n_labels; j++)
680 add_string (&aux, r->labels[i].labels[j].label,
681 _("Value Label %zu"), k++);
683 add_string (&aux, r->header.creation_date, _("Creation Date"));
684 add_string (&aux, r->header.creation_time, _("Creation Time"));
685 add_string (&aux, r->header.eye_catcher, _("Product"));
686 add_string (&aux, r->header.file_label, _("File Label"));
688 if (r->extensions[EXT_PRODUCT_INFO])
689 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
690 _("Extra Product Info"));
696 for (i = 0; i < r->document->n_lines; i++)
700 memcpy (line, r->document->documents + i * 80, 80);
703 add_string (&aux, line, _("Document Line %zu"), i + 1);
707 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
709 size_t mrset_idx = mrset - r->mrsets + 1;
711 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
713 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
715 /* Skip the variables because they ought to be duplicates. */
718 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
723 /* data file attributes */
724 /* variable attributes */
726 /* long string value labels */
727 /* long string missing values */
729 *titlesp = aux.titles;
731 *stringsp = aux.strings;
735 /* Decodes the dictionary read from R, saving it into *DICT. Character
736 strings in R are decoded using ENCODING, or an encoding obtained from R if
737 ENCODING is null, or the locale encoding if R specifies no encoding.
739 If INFOP is non-null, then it receives additional info about the system
740 file, which the caller must eventually free with any_read_info_destroy()
741 when it is no longer needed.
743 This function consumes R. The caller must not use it again later, even to
744 destroy it with sfm_close(). */
745 static struct casereader *
746 sfm_decode (struct any_reader *r_, const char *encoding,
747 struct dictionary **dictp, struct any_read_info *infop)
749 struct sfm_reader *r = sfm_reader_cast (r_);
750 struct dictionary *dict;
753 if (encoding == NULL)
755 encoding = sfm_get_encoding (r);
756 if (encoding == NULL)
758 sys_warn (r, -1, _("This system file does not indicate its own "
759 "character encoding. Using default encoding "
760 "%s. For best results, specify an encoding "
761 "explicitly. Use SYSFILE INFO with "
762 "ENCODING=\"DETECT\" to analyze the possible "
765 encoding = locale_charset ();
769 dict = dict_create (encoding);
770 r->encoding = dict_get_encoding (dict);
772 /* These records don't use variables at all. */
773 if (r->document != NULL)
774 parse_document (dict, r->document);
776 if (r->extensions[EXT_INTEGER] != NULL
777 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
780 if (r->extensions[EXT_FLOAT] != NULL)
781 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
783 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
784 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
786 if (r->extensions[EXT_FILE_ATTRS] != NULL)
787 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
789 parse_header (r, &r->header, &r->info, dict);
791 /* Parse the variable records, the basis of almost everything else. */
792 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
795 /* Parse value labels and the weight variable immediately after the variable
796 records. These records use indexes into var_recs[], so we must parse them
797 before those indexes become invalidated by very long string variables. */
798 for (i = 0; i < r->n_labels; i++)
799 if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
801 if (r->header.weight_idx != 0)
803 struct variable *weight_var;
805 weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
806 r->header.weight_idx);
807 if (weight_var != NULL)
809 if (var_is_numeric (weight_var))
810 dict_set_weight (dict, weight_var);
812 sys_warn (r, -1, _("Ignoring string variable `%s' set "
813 "as weighting variable."),
814 var_get_name (weight_var));
818 if (r->extensions[EXT_DISPLAY] != NULL)
819 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
821 /* The following records use short names, so they need to be parsed before
822 parse_long_var_name_map() changes short names to long names. */
823 decode_mrsets (r, dict);
825 if (r->extensions[EXT_LONG_STRINGS] != NULL
826 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
829 /* Now rename variables to their long names. */
830 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
832 /* The following records use long names, so they need to follow renaming. */
833 if (r->extensions[EXT_VAR_ATTRS] != NULL)
835 parse_variable_attributes (r, r->extensions[EXT_VAR_ATTRS], dict);
837 /* Roles use the $@Role attribute. */
838 assign_variable_roles (r, dict);
841 if (r->extensions[EXT_LONG_LABELS] != NULL)
842 parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
843 if (r->extensions[EXT_LONG_MISSING] != NULL)
844 parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
847 /* Warn if the actual amount of data per case differs from the
848 amount that the header claims. SPSS version 13 gets this
849 wrong when very long strings are involved, so don't warn in
851 if (r->header.nominal_case_size != -1
852 && r->header.nominal_case_size != r->n_vars
853 && r->info.version_major != 13)
854 sys_warn (r, -1, _("File header claims %d variable positions but "
855 "%zu were read from file."),
856 r->header.nominal_case_size, r->n_vars);
858 /* Create an index of dictionary variable widths for
859 sfm_read_case to use. We cannot use the `struct variable's
860 from the dictionary we created, because the caller owns the
861 dictionary and may destroy or modify its variables. */
862 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
863 pool_register (r->pool, free, r->sfm_vars);
864 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
870 memset (&r->info, 0, sizeof r->info);
873 return casereader_create_sequential
875 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
876 &sys_file_casereader_class, r);
885 /* Closes R, which should have been returned by sfm_open() but not already
886 closed with sfm_decode() or this function.
887 Returns true if an I/O error has occurred on READER, false
890 sfm_close (struct any_reader *r_)
892 struct sfm_reader *r = sfm_reader_cast (r_);
897 if (fn_close (r->fh, r->file) == EOF)
899 msg (ME, _("Error closing system file `%s': %s."),
900 fh_get_file_name (r->fh), strerror (errno));
906 any_read_info_destroy (&r->info);
911 pool_destroy (r->pool);
916 /* Destroys READER. */
918 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
920 struct sfm_reader *r = r_;
921 sfm_close (&r->any_reader);
924 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
925 a negative errno value if there is an error reading FILE. */
927 sfm_detect (FILE *file)
/* Rewind so that the magic number is always taken from the very start of
   FILE, whatever the stream's current position. */
931 if (fseek (file, 0, SEEK_SET) != 0)
/* Read the 4-byte magic number as a single item.  A failed read with no
   stream error (for example, a file shorter than 4 bytes) is reported as
   "not a system file" (0) rather than as an error. */
933 if (fread (magic, 4, 1, file) != 1)
934 return ferror (file) ? -errno : 0;
/* FILE is a system file if its magic matches any of the three known values.
   NOTE(review): strcmp needs MAGIC to be null-terminated after the 4 bytes
   read above; the statement that does so is not visible in this excerpt --
   confirm. */
937 return (!strcmp (ASCII_MAGIC, magic)
938 || !strcmp (ASCII_ZMAGIC, magic)
939 || !strcmp (EBCDIC_MAGIC, magic));
942 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
943 except for the string fields in *INFO, which parse_header() will initialize
944 later once the file's encoding is known. */
946 read_header (struct sfm_reader *r, struct any_read_info *info,
947 struct sfm_header_record *header)
949 uint8_t raw_layout_code[4];
954 if (!read_string (r, header->magic, sizeof header->magic)
955 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
958 if (!strcmp (ASCII_MAGIC, header->magic)
959 || !strcmp (EBCDIC_MAGIC, header->magic))
961 else if (!strcmp (ASCII_ZMAGIC, header->magic))
965 sys_error (r, 0, _("This is not an SPSS system file."));
969 /* Identify integer format. */
970 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
972 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
974 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
976 || (r->integer_format != INTEGER_MSB_FIRST
977 && r->integer_format != INTEGER_LSB_FIRST))
979 sys_error (r, 64, _("This is not an SPSS system file."));
983 if (!read_int (r, &header->nominal_case_size))
986 if (header->nominal_case_size < 0
987 || header->nominal_case_size > INT_MAX / 16)
988 header->nominal_case_size = -1;
990 if (!read_int (r, &compressed))
995 r->compression = ANY_COMP_NONE;
996 else if (compressed == 1)
997 r->compression = ANY_COMP_SIMPLE;
998 else if (compressed != 0)
1000 sys_error (r, 0, "System file header has invalid compression "
1001 "value %d.", compressed);
1007 if (compressed == 2)
1008 r->compression = ANY_COMP_ZLIB;
1011 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1012 "compression value %d.", compressed);
1017 if (!read_int (r, &header->weight_idx))
1020 if (!read_int (r, &r->case_cnt))
1022 if ( r->case_cnt > INT_MAX / 2)
1025 /* Identify floating-point format and obtain compression bias. */
1026 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1028 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1030 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1032 if (memcmp (raw_bias, zero_bias, 8))
1033 sys_warn (r, r->pos - 8,
1034 _("Compression bias is not the usual "
1035 "value of 100, or system file uses unrecognized "
1036 "floating-point format."));
1039 /* Some software is known to write all-zeros to this
1040 field. Such software also writes floating-point
1041 numbers in the format that we expect by default
1042 (it seems that all software most likely does, in
1043 reality), so don't warn in this case. */
1046 if (r->integer_format == INTEGER_MSB_FIRST)
1047 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1049 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1051 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1053 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1054 || !read_string (r, header->creation_time, sizeof header->creation_time)
1055 || !read_string (r, header->file_label, sizeof header->file_label)
1056 || !skip_bytes (r, 3))
1059 info->integer_format = r->integer_format;
1060 info->float_format = r->float_format;
1061 info->compression = r->compression;
1062 info->case_cnt = r->case_cnt;
1067 /* Reads a variable (type 2) record from R into RECORD. */
1069 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1071 int has_variable_label;
1073 memset (record, 0, sizeof *record);
1075 record->pos = r->pos;
1076 if (!read_int (r, &record->width)
1077 || !read_int (r, &has_variable_label)
1078 || !read_int (r, &record->missing_value_code)
1079 || !read_int (r, &record->print_format)
1080 || !read_int (r, &record->write_format)
1081 || !read_string (r, record->name, sizeof record->name))
1084 if (has_variable_label == 1)
1086 enum { MAX_LABEL_LEN = 65536 };
1087 unsigned int len, read_len;
1089 if (!read_uint (r, &len))
1092 /* Read up to MAX_LABEL_LEN bytes of label. */
1093 read_len = MIN (MAX_LABEL_LEN, len);
1094 record->label = pool_malloc (r->pool, read_len + 1);
1095 if (!read_string (r, record->label, read_len + 1))
1098 /* Skip unread label bytes. */
1099 if (!skip_bytes (r, len - read_len))
1102 /* Skip label padding up to multiple of 4 bytes. */
1103 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1106 else if (has_variable_label != 0)
1108 sys_error (r, record->pos,
1109 _("Variable label indicator field is not 0 or 1."));
1113 /* Set missing values. */
1114 if (record->missing_value_code != 0)
1116 int code = record->missing_value_code;
1117 if (record->width == 0)
1119 if (code < -3 || code > 3 || code == -1)
1121 sys_error (r, record->pos,
1122 _("Numeric missing value indicator field is not "
1123 "-3, -2, 0, 1, 2, or 3."));
1129 if (code < 1 || code > 3)
1131 sys_error (r, record->pos,
1132 _("String missing value indicator field is not "
1138 if (!read_bytes (r, record->missing, 8 * abs (code)))
1145 /* Reads value labels from R into RECORD. */
/* Reads the label count and the (value, label) pairs of the type 3 record,
   then the record type, variable count and variable indexes of the type 4
   record that must follow it.  The label array, each label string and the
   variable index array are allocated from R's pool. */
1147 read_value_label_record (struct sfm_reader *r,
1148 struct sfm_value_label_record *record)
1153 /* Read type 3 record. */
1154 record->pos = r->pos;
1155 if (!read_uint (r, &record->n_labels))
/* Reject label counts above UINT_MAX / sizeof *record->labels, presumably so
   that the size of the array allocated below stays representable. */
1157 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1159 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1163 record->labels = pool_nmalloc (r->pool, record->n_labels,
1164 sizeof *record->labels);
1165 for (i = 0; i < record->n_labels; i++)
1167 struct sfm_value_label *label = &record->labels[i];
1168 unsigned char label_len;
1171 if (!read_bytes (r, label->value, sizeof label->value))
1174 /* Read label length. */
1175 if (!read_bytes (r, &label_len, sizeof label_len))
/* The one-byte length plus the label text are rounded up together to a
   multiple of 8 bytes, so padded_len counts the length byte already read;
   padded_len - 1 bytes (label text plus padding) remain to be read. */
1177 padded_len = ROUND_UP (label_len + 1, 8);
1179 /* Read label, padding. */
1180 label->label = pool_malloc (r->pool, padded_len + 1);
1181 if (!read_bytes (r, label->label, padded_len - 1))
/* Terminate the string at the true label length, cutting off the padding. */
1183 label->label[label_len] = '\0';
1186 /* Read record type of type 4 record. */
1187 if (!read_int (r, &type))
1191 sys_error (r, r->pos - 4,
1192 _("Variable index record (type 4) does not immediately "
1193 "follow value label record (type 3) as it should."));
1197 /* Read number of variables associated with value label from type 4
1199 if (!read_uint (r, &record->n_vars))
1201 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1203 sys_error (r, r->pos - 4,
1204 _("Number of variables associated with a value label (%u) "
1205 "is not between 1 and the number of variables (%zu)."),
1206 record->n_vars, r->n_vars);
1210 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1211 for (i = 0; i < record->n_vars; i++)
1212 if (!read_int (r, &record->vars[i]))
1218 /* Reads a document record from R and returns it. */
/* The record structure and its text are allocated from R's pool.  The text
   is read as one contiguous block of N_LINES lines of DOC_LINE_LENGTH bytes
   each. */
1219 static struct sfm_document_record *
1220 read_document_record (struct sfm_reader *r)
1222 struct sfm_document_record *record;
1225 record = pool_malloc (r->pool, sizeof *record);
1226 record->pos = r->pos;
1228 if (!read_int (r, &n_lines))
/* The upper bound keeps DOC_LINE_LENGTH * n_lines, computed below, within
   the range of int. */
1230 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1232 sys_error (r, record->pos,
1233 _("Number of document lines (%d) "
1234 "must be greater than 0 and less than %d."),
1235 n_lines, INT_MAX / DOC_LINE_LENGTH);
1239 record->n_lines = n_lines;
1240 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1241 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1248 read_extension_record_header (struct sfm_reader *r, int subtype,
1249 struct sfm_extension_record *record)
1251 record->subtype = subtype;
1252 record->pos = r->pos;
1253 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1256 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1257 allows an extra byte for a null terminator, used by some
1258 extension processing routines. */
1259 if (record->size != 0
1260 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1262 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1270 /* Reads an extension record from R into RECORD. */
/* The record's header is read first.  The subtype is then looked up in the
   table below: data for a recognized subtype whose size and count are
   acceptable is read into pool storage, and everything else is skipped
   with skip_bytes(). */
1272 read_extension_record (struct sfm_reader *r, int subtype,
1273 struct sfm_extension_record **recordp)
1275 struct extension_record_type
/* Each entry is { subtype, size, count }.  A size or count of 0 is not
   compared against the record (see the "> 0" tests below); an entry in
   which both are 0 marks a subtype whose data is ignored. */
1282 static const struct extension_record_type types[] =
1284 /* Implemented record types. */
1285 { EXT_INTEGER, 4, 8 },
1286 { EXT_FLOAT, 8, 3 },
1287 { EXT_MRSETS, 1, 0 },
1288 { EXT_PRODUCT_INFO, 1, 0 },
1289 { EXT_DISPLAY, 4, 0 },
1290 { EXT_LONG_NAMES, 1, 0 },
1291 { EXT_LONG_STRINGS, 1, 0 },
1292 { EXT_NCASES, 8, 2 },
1293 { EXT_FILE_ATTRS, 1, 0 },
1294 { EXT_VAR_ATTRS, 1, 0 },
1295 { EXT_MRSETS2, 1, 0 },
1296 { EXT_ENCODING, 1, 0 },
1297 { EXT_LONG_LABELS, 1, 0 },
1298 { EXT_LONG_MISSING, 1, 0 },
1300 /* Ignored record types. */
1301 { EXT_VAR_SETS, 0, 0 },
1303 { EXT_DATA_ENTRY, 0, 0 },
1304 { EXT_DATAVIEW, 0, 0 },
1307 const struct extension_record_type *type;
1308 struct sfm_extension_record *record;
1312 record = pool_malloc (r->pool, sizeof *record);
1313 if (!read_extension_record_header (r, subtype, record))
/* Total size of the record's data in bytes. */
1315 n_bytes = record->count * record->size;
1317 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1318 if (subtype == type->subtype)
1320 if (type->size > 0 && record->size != type->size)
1321 sys_warn (r, record->pos,
1322 _("Record type 7, subtype %d has bad size %u "
1323 "(expected %d)."), subtype, record->size, type->size);
1324 else if (type->count > 0 && record->count != type->count)
1325 sys_warn (r, record->pos,
1326 _("Record type 7, subtype %d has bad count %u "
1327 "(expected %d)."), subtype, record->count, type->count);
1328 else if (type->count == 0 && type->size == 0)
1330 /* Ignore this record. */
/* One extra byte is allocated so that the data can be null-terminated. */
1334 char *data = pool_malloc (r->pool, n_bytes + 1);
1335 data[n_bytes] = '\0';
1337 record->data = data;
1338 if (!read_bytes (r, record->data, n_bytes))
1347 sys_warn (r, record->pos,
1348 _("Unrecognized record type 7, subtype %d. For help, please "
1349 "send this file to %s and mention that you were using %s."),
1350 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
/* Skip the data of a record that was not read above. */
1353 return skip_bytes (r, n_bytes);
1357 skip_extension_record (struct sfm_reader *r, int subtype)
1359 struct sfm_extension_record record;
1361 return (read_extension_record_header (r, subtype, &record)
1362 && skip_bytes (r, record.count * record.size));
/* Transfers the textual fields of HEADER into DICT and INFO, converting
   each of them from DICT's encoding to UTF-8: the file label becomes
   DICT's label, and the creation date, creation time, and product name are
   stored in INFO. */
1366 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1367 struct any_read_info *info, struct dictionary *dict)
1369 const char *dict_encoding = dict_get_encoding (dict);
1370 struct substring product;
1371 struct substring label;
1374 /* Convert file label to UTF-8 and put it into DICT. */
1375 label = recode_substring_pool ("UTF-8", dict_encoding,
1376 ss_cstr (header->file_label), r->pool);
1377 ss_trim (&label, ss_cstr (" "));
/* Trimming only adjusts the substring's bounds, so terminate the string
   before handing it on as a C string. */
1378 label.string[label.length] = '\0';
1379 fixed_label = fix_line_ends (label.string);
1380 dict_set_label (dict, fixed_label);
1383 /* Put creation date and time in UTF-8 into INFO. */
1384 info->creation_date = recode_string ("UTF-8", dict_encoding,
1385 header->creation_date, -1);
1386 info->creation_time = recode_string ("UTF-8", dict_encoding,
1387 header->creation_time, -1);
1389 /* Put product name into INFO, dropping eye-catcher string if present. */
1390 product = recode_substring_pool ("UTF-8", dict_encoding,
1391 ss_cstr (header->eye_catcher), r->pool);
1392 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1393 ss_trim (&product, ss_cstr (" "));
1394 info->product = ss_xstrdup (product);
1397 /* Reads a variable (type 2) record from R and adds the
1398 corresponding variable to DICT.
1399 Also skips past additional variable records for long string variables. */
/* More precisely, this walks the N_VAR_RECS records in VAR_RECS and creates
   one variable in DICT for each record that is not a long string
   continuation.  For each variable it sets the short name, label, missing
   values, and print and write formats, and saves the new variable in the
   record's 'var' member. */
1402 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1403 struct sfm_var_record *var_recs, size_t n_var_recs)
1405 const char *dict_encoding = dict_get_encoding (dict);
1406 struct sfm_var_record *rec;
1409 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1411 struct variable *var;
1416 name = recode_string_pool ("UTF-8", dict_encoding,
1417 rec->name, -1, r->pool);
/* Cut the name off at its first space. */
1418 name[strcspn (name, " ")] = '\0';
1420 if (!dict_id_is_valid (dict, name, false)
1421 || name[0] == '$' || name[0] == '#')
1423 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1427 if (rec->width < 0 || rec->width > 255)
1429 sys_error (r, rec->pos,
1430 _("Bad width %d for variable %s."), rec->width, name);
/* Create the variable.  A duplicate name is replaced by a generated unique
   name, with a warning. */
1434 var = rec->var = dict_create_var (dict, name, rec->width);
1437 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1438 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1441 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1445 /* Set the short name the same as the long name. */
1446 var_set_short_name (var, 0, name);
1448 /* Get variable label, if any. */
1453 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1454 rec->label, -1, r->pool);
1455 var_set_label (var, utf8_label);
1458 /* Set missing values. */
1459 if (rec->missing_value_code != 0)
1461 int width = var_get_width (var);
1462 struct missing_values mv;
1464 mv_init_pool (r->pool, &mv, width);
1465 if (var_is_numeric (var))
/* A negative code means that a range of missing values is present; a code
   of -3 adds one discrete value to the range.  A positive code is the
   number of discrete values. */
1467 bool has_range = rec->missing_value_code < 0;
1468 int n_discrete = (has_range
1469 ? rec->missing_value_code == -3
1470 : rec->missing_value_code);
/* The range's endpoints are the first two 8-byte values. */
1475 double low = parse_float (r, rec->missing, 0);
1476 double high = parse_float (r, rec->missing, 8);
1478 /* Deal with SPSS 21 change in representation. */
1482 mv_add_range (&mv, low, high);
1486 for (i = 0; i < n_discrete; i++)
1488 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
/* String missing values occupy 8 bytes each, of which at most the
   variable's width is used. */
1493 for (i = 0; i < rec->missing_value_code; i++)
1494 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1495 var_set_missing_values (var, &mv);
1499 parse_format_spec (r, rec->pos + 12, rec->print_format,
1500 PRINT_FORMAT, var, &n_warnings);
1501 parse_format_spec (r, rec->pos + 16, rec->write_format,
1502 WRITE_FORMAT, var, &n_warnings);
1504 /* Account for values.
1505 Skip long string continuation records, if any. */
/* A numeric variable accounts for one record and a string variable for one
   record per 8 bytes of width, rounded up.  The additional records must
   exist and must have width -1. */
1506 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1507 for (i = 1; i < n_values; i++)
1508 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1510 sys_error (r, rec->pos, _("Missing string continuation record."));
1519 /* Translates the format spec from sysfile format to internal format. */
/* FORMAT is the raw format word of variable V, read from file offset POS.
   WHICH selects whether it is applied as V's print format or write format.
   An invalid nonzero FORMAT is reported as a warning; *N_WARNINGS counts
   such formats so that no more than MAX_WARNINGS of them are reported. */
1522 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1523 enum which_format which, struct variable *v,
1526 const int max_warnings = 8;
/* The format type is in bits 16-23 of FORMAT and the width is in bits
   8-15. */
1527 uint8_t raw_type = format >> 16;
1528 uint8_t w = format >> 8;
/* The format must have a known type, be valid for output, and be compatible
   with V's width. */
1537 ok = (fmt_from_io (raw_type, &f.type)
1538 && fmt_check_output (&f)
1539 && fmt_check_width_compat (&f, var_get_width (v)));
1544 if (which == PRINT_FORMAT)
1545 var_set_print_format (v, &f);
1547 var_set_write_format (v, &f);
1549 else if (format == 0)
1551 /* Actually observed in the wild. No point in warning about it. */
1553 else if (++*n_warnings <= max_warnings)
1555 if (which == PRINT_FORMAT)
1556 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1558 var_get_name (v), var_get_width (v), format);
1560 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1562 var_get_name (v), var_get_width (v), format);
/* Announce once, on the last warning that is printed, that later ones will
   be suppressed. */
1564 if (*n_warnings == max_warnings)
1565 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
/* Adds the document lines in RECORD to DICT.  Each DOC_LINE_LENGTH-byte
   line is converted from DICT's encoding to UTF-8 and stripped of trailing
   spaces before it is added. */
1570 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1574 for (p = record->documents;
1575 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1576 p += DOC_LINE_LENGTH)
1578 struct substring line;
/* No pool is passed here (the last argument is NULL), unlike most other
   recoding calls in this file. */
1580 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1581 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1582 ss_rtrim (&line, ss_cstr (" "));
/* Terminate the trimmed line so that it can be passed as a C string. */
1583 line.string[line.length] = '\0';
1585 dict_add_document_line (dict, line.string, false);
1591 /* Parses record type 7, subtype 3. */
/* Stores the version numbers from RECORD into INFO, then compares the
   floating-point and integer representation codes in RECORD with the
   formats that the reader R is actually using.  A floating-point mismatch
   is reported as an error, an integer mismatch only as a warning. */
1593 parse_machine_integer_info (struct sfm_reader *r,
1594 const struct sfm_extension_record *record,
1595 struct any_read_info *info)
1597 int float_representation, expected_float_format;
1598 int integer_representation, expected_integer_format;
1600 /* Save version info. */
1601 info->version_major = parse_int (r, record->data, 0);
1602 info->version_minor = parse_int (r, record->data, 4);
1603 info->version_revision = parse_int (r, record->data, 8);
1605 /* Check floating point format. */
/* Expected codes: 1 for IEEE double (either byte order), 2 for FLOAT_Z_LONG,
   3 for the VAX formats. */
1606 float_representation = parse_int (r, record->data, 16);
1607 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1608 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1609 expected_float_format = 1;
1610 else if (r->float_format == FLOAT_Z_LONG)
1611 expected_float_format = 2;
1612 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1613 expected_float_format = 3;
1616 if (float_representation != expected_float_format)
1618 sys_error (r, record->pos,
1619 _("Floating-point representation indicated by "
1620 "system file (%d) differs from expected (%d)."),
1621 float_representation, expected_float_format);
1625 /* Check integer format. */
/* Expected codes: 1 for most-significant-byte-first integers, 2 for
   least-significant-byte-first. */
1626 integer_representation = parse_int (r, record->data, 24);
1627 if (r->integer_format == INTEGER_MSB_FIRST)
1628 expected_integer_format = 1;
1629 else if (r->integer_format == INTEGER_LSB_FIRST)
1630 expected_integer_format = 2;
1633 if (integer_representation != expected_integer_format)
1634 sys_warn (r, record->pos,
1635 _("Integer format indicated by system file (%d) "
1636 "differs from expected (%d)."),
1637 integer_representation, expected_integer_format);
1642 /* Parses record type 7, subtype 4. */
1644 parse_machine_float_info (struct sfm_reader *r,
1645 const struct sfm_extension_record *record)
1647 double sysmis = parse_float (r, record->data, 0);
1648 double highest = parse_float (r, record->data, 8);
1649 double lowest = parse_float (r, record->data, 16);
1651 if (sysmis != SYSMIS)
1652 sys_warn (r, record->pos,
1653 _("File specifies unexpected value %g (%a) as %s, "
1654 "instead of %g (%a)."),
1655 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1657 if (highest != HIGHEST)
1658 sys_warn (r, record->pos,
1659 _("File specifies unexpected value %g (%a) as %s, "
1660 "instead of %g (%a)."),
1661 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1663 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1664 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1665 appears in a context (missing values) where SYSMIS cannot. */
1666 if (lowest != LOWEST && lowest != SYSMIS)
1667 sys_warn (r, record->pos,
1668 _("File specifies unexpected value %g (%a) as %s, "
1669 "instead of %g (%a) or %g (%a)."),
1670 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1673 /* Parses record type 7, subtype 10. */
1675 parse_extra_product_info (struct sfm_reader *r,
1676 const struct sfm_extension_record *record,
1677 struct any_read_info *info)
1679 struct text_record *text;
1681 text = open_text_record (r, record, true);
1682 info->product_ext = fix_line_ends (text_get_all (text));
1683 close_text_record (r, text);
1686 /* Parses record type 7, subtype 7 or 19. */
1688 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1689 size_t *allocated_mrsets)
1691 struct text_record *text;
1693 text = open_text_record (r, record, false);
1696 struct sfm_mrset *mrset;
1697 size_t allocated_vars;
1700 /* Skip extra line feeds if present. */
1701 while (text_match (text, '\n'))
1704 if (r->n_mrsets >= *allocated_mrsets)
1705 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1707 mrset = &r->mrsets[r->n_mrsets];
1708 memset(mrset, 0, sizeof *mrset);
1710 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1711 if (mrset->name == NULL)
1714 if (text_match (text, 'C'))
1716 mrset->type = MRSET_MC;
1717 if (!text_match (text, ' '))
1719 sys_warn (r, record->pos,
1720 _("Missing space following `%c' at offset %zu "
1721 "in MRSETS record."), 'C', text_pos (text));
1725 else if (text_match (text, 'D'))
1727 mrset->type = MRSET_MD;
1728 mrset->cat_source = MRSET_VARLABELS;
1730 else if (text_match (text, 'E'))
1734 mrset->type = MRSET_MD;
1735 mrset->cat_source = MRSET_COUNTEDVALUES;
1736 if (!text_match (text, ' '))
1738 sys_warn (r, record->pos,
1739 _("Missing space following `%c' at offset %zu "
1740 "in MRSETS record."), 'E', text_pos (text));
1744 number = text_get_token (text, ss_cstr (" "), NULL);
1745 if (!strcmp (number, "11"))
1746 mrset->label_from_var_label = true;
1747 else if (strcmp (number, "1"))
1748 sys_warn (r, record->pos,
1749 _("Unexpected label source value following `E' "
1750 "at offset %zu in MRSETS record."),
1755 sys_warn (r, record->pos,
1756 _("Missing `C', `D', or `E' at offset %zu "
1757 "in MRSETS record."),
1762 if (mrset->type == MRSET_MD)
1764 mrset->counted = text_parse_counted_string (r, text);
1765 if (mrset->counted == NULL)
1769 mrset->label = text_parse_counted_string (r, text);
1770 if (mrset->label == NULL)
1778 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1781 if (delimiter != '\n')
1782 sys_warn (r, record->pos,
1783 _("Missing new-line parsing variable names "
1784 "at offset %zu in MRSETS record."),
1789 if (mrset->n_vars >= allocated_vars)
1790 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1792 sizeof *mrset->vars);
1793 mrset->vars[mrset->n_vars++] = var;
1795 while (delimiter != '\n');
1799 close_text_record (r, text);
/* Converts the multiple response sets in R->mrsets into mrsets in DICT.
   Names and labels are converted from R->encoding to UTF-8 and variable
   names are looked up in DICT.  A set whose name is unacceptable, or that
   ends up with fewer than two usable variables, is dropped with a
   warning. */
1803 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1805 const struct sfm_mrset *s;
1807 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1809 struct stringi_set var_names;
1810 struct mrset *mrset;
1815 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1818 sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
1825 mrset = xzalloc (sizeof *mrset);
1827 mrset->type = s->type;
1828 mrset->cat_source = s->cat_source;
1829 mrset->label_from_var_label = s->label_from_var_label;
1830 if (s->label[0] != '\0')
1831 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
/* VAR_NAMES records the variable names seen so far, to detect duplicates.
   The vars array is sized for the case in which every name is usable. */
1833 stringi_set_init (&var_names);
1834 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1836 for (i = 0; i < s->n_vars; i++)
1838 struct variable *var;
1841 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1843 var = dict_lookup_var (dict, var_name);
1849 if (!stringi_set_insert (&var_names, var_name))
1852 _("MRSET %s contains duplicate variable name %s."),
1853 mrset->name, var_name);
/* If the set has no label of its own and asks for one from a variable
   label, take it from the first variable that has a label. */
1859 if (mrset->label == NULL && mrset->label_from_var_label
1860 && var_has_label (var))
1861 mrset->label = xstrdup (var_get_label (var));
/* All the variables in a set must have the same type as the first one. */
1864 && var_get_type (var) != var_get_type (mrset->vars[0]))
1867 _("MRSET %s contains both string and "
1868 "numeric variables."), mrset->name);
/* Track the narrowest width among the variables accepted so far. */
1871 width = MIN (width, var_get_width (var));
1873 mrset->vars[mrset->n_vars++] = var;
1876 if (mrset->n_vars < 2)
1878 if (mrset->n_vars == 0)
1879 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1881 sys_warn (r, -1, _("MRSET %s has only one variable."),
1883 mrset_destroy (mrset);
1884 stringi_set_destroy (&var_names);
/* For a multiple dichotomy set, decode the counted value using the narrowest
   variable width: as a number, or as a string padded with spaces. */
1888 if (mrset->type == MRSET_MD)
1890 mrset->width = width;
1891 value_init (&mrset->counted, width);
1893 mrset->counted.f = c_strtod (s->counted, NULL);
1895 value_copy_str_rpad (&mrset->counted, width,
1896 (const uint8_t *) s->counted, ' ');
1899 dict_add_mrset (dict, mrset);
1900 stringi_set_destroy (&var_names);
1904 /* Read record type 7, subtype 11, which specifies how variables
1905 should be displayed in GUI environments. */
/* RECORD holds either two or three integers per variable in DICT; the
   record's count is used to tell which.  Variables whose parameters are
   out of range are reported with a warning. */
1907 parse_display_parameters (struct sfm_reader *r,
1908 const struct sfm_extension_record *record,
1909 struct dictionary *dict)
1911 bool includes_width;
1912 bool warned = false;
1917 n_vars = dict_get_var_cnt (dict);
/* Three values per variable means that a display width is included; two
   means that it is not.  Any other count is rejected. */
1918 if (record->count == 3 * n_vars)
1919 includes_width = true;
1920 else if (record->count == 2 * n_vars)
1921 includes_width = false;
1924 sys_warn (r, record->pos,
1925 _("Extension 11 has bad count %u (for %zu variables)."),
1926 record->count, n_vars);
1931 for (i = 0; i < n_vars; ++i)
1933 struct variable *v = dict_get_var (dict, i);
1934 int measure, width, align;
1936 measure = parse_int (r, record->data, ofs);
1941 width = parse_int (r, record->data, ofs);
1947 align = parse_int (r, record->data, ofs);
1950 /* SPSS sometimes seems to set variables' measure to zero. */
/* Valid measures are 1 to 3 and valid alignments are 0 to 2. */
1954 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1957 sys_warn (r, record->pos,
1958 _("Invalid variable display parameters for variable "
1959 "%zu (%s). Default parameters substituted."),
1960 i, var_get_name (v));
1965 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1966 : measure == 2 ? MEASURE_ORDINAL
1968 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1969 : align == 1 ? ALIGN_RIGHT
1972 /* Older versions (SPSS 9.0) sometimes set the display
1973 width to zero. This causes confusion in the GUI, so
1974 only set the width if it is nonzero. */
1976 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping all of VAR's short names
   as they were before the rename. */
1981 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1982 const char *new_name)
1984 size_t n_short_names;
1988 /* Renaming a variable may clear its short names, but we
1989 want to retain them, so we save them and re-set them afterward. */
1991 n_short_names = var_get_short_name_cnt (var);
1992 short_names = xnmalloc (n_short_names, sizeof *short_names);
1993 for (i = 0; i < n_short_names; i++)
/* Copy each short name, because the rename may clear the originals.  A
   missing short name is remembered as a null pointer. */
1995 const char *s = var_get_short_name (var, i);
1996 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1999 /* Set long name. */
2000 dict_rename_var (dict, var, new_name);
2002 /* Restore short names. */
2003 for (i = 0; i < n_short_names; i++)
2005 var_set_short_name (var, i, short_names[i]);
2006 free (short_names[i]);
2011 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2012 to each short name. Modifies variable names in DICT accordingly. */
/* Two cases are handled.  Without long variable names, every variable is
   renamed to the lowercase form of its current name.  Otherwise each pair
   in RECORD is validated and applied; a pair with an invalid or duplicate
   long name is reported with a warning.  Short names are preserved by every
   rename. */
2014 parse_long_var_name_map (struct sfm_reader *r,
2015 const struct sfm_extension_record *record,
2016 struct dictionary *dict)
2018 struct text_record *text;
2019 struct variable *var;
2024 /* There are no long variable names. Use the short variable names,
2025 converted to lowercase, as the long variable names. */
2028 for (i = 0; i < dict_get_var_cnt (dict); i++)
2030 struct variable *var = dict_get_var (dict, i);
2033 new_name = utf8_to_lower (var_get_name (var));
2034 rename_var_and_save_short_names (dict, var, new_name);
2041 /* Rename each of the variables, one by one. (In a correctly constructed
2042 system file, this cannot create any intermediate duplicate variable names,
2043 because all of the new variable names are longer than any of the old
2044 variable names and thus there cannot be any overlaps.) */
2045 text = open_text_record (r, record, true);
2046 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2048 /* Validate long name. */
2049 if (!dict_id_is_valid (dict, long_name, false)
2050 || long_name[0] == '$' || long_name[0] == '#')
2052 sys_warn (r, record->pos,
2053 _("Long variable mapping from %s to invalid "
2054 "variable name `%s'."),
2055 var_get_name (var), long_name);
2059 /* Identify any duplicates. */
/* A long name that matches the variable's own short name, ignoring case, is
   not a duplicate. */
2060 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
2061 && dict_lookup_var (dict, long_name) != NULL)
2063 sys_warn (r, record->pos,
2064 _("Duplicate long variable name `%s'."), long_name);
2068 rename_var_and_save_short_names (dict, var, long_name);
2070 close_text_record (r, text);
2073 /* Reads record type 7, subtype 14, which gives the real length
2074 of each very long string. Rearranges DICT accordingly. */
/* Each pair in RECORD names a variable and gives its true width as a decimal
   number.  The variable and the segment variables that follow it in DICT
   are merged: the segments' short names are copied to the first variable,
   the other segments are deleted, and the first variable is given the full
   width. */
2076 parse_long_string_map (struct sfm_reader *r,
2077 const struct sfm_extension_record *record,
2078 struct dictionary *dict)
2080 struct text_record *text;
2081 struct variable *var;
2084 text = open_text_record (r, record, true);
2085 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2087 size_t idx = var_get_dict_index (var);
2093 length = strtol (length_s, NULL, 10);
2094 if (length < 1 || length > MAX_STRING)
2096 sys_warn (r, record->pos,
2097 _("%s listed as string of invalid length %s "
2098 "in very long string record."),
2099 var_get_name (var), length_s);
2103 /* Check segments. */
2104 segment_cnt = sfm_width_to_segments (length);
2105 if (segment_cnt == 1)
2107 sys_warn (r, record->pos,
2108 _("%s listed in very long string record with width %s, "
2109 "which requires only one segment."),
2110 var_get_name (var), length_s);
/* All of the segments must exist in DICT, starting at the variable's own
   index. */
2113 if (idx + segment_cnt > dict_get_var_cnt (dict))
2115 sys_error (r, record->pos,
2116 _("Very long string %s overflows dictionary."),
2117 var_get_name (var));
2121 /* Get the short names from the segments and check their lengths. */
2123 for (i = 0; i < segment_cnt; i++)
2125 struct variable *seg = dict_get_var (dict, idx + i);
2126 int alloc_width = sfm_segment_alloc_width (length, i);
2127 int width = var_get_width (seg);
2130 var_set_short_name (var, i, var_get_short_name (seg, 0));
/* Widths are compared after rounding up to a multiple of 8. */
2131 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2133 sys_error (r, record->pos,
2134 _("Very long string with width %ld has segment %d "
2135 "of width %d (expected %d)."),
2136 length, i, width, alloc_width);
/* Drop the segments after the first and widen the first to the full
   length. */
2140 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2141 var_set_width (var, length);
2143 close_text_record (r, text);
2144 dict_compact_values (dict);
/* Adds the value labels in RECORD to each of the variables that RECORD
   lists.  The variables are given as indexes into VAR_RECS, which has
   N_VAR_RECS elements.  Labels are converted from DICT's encoding to UTF-8.
   All the variables must have the same type, and long string variables are
   rejected. */
2150 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
2151 const struct sfm_var_record *var_recs, size_t n_var_recs,
2152 const struct sfm_value_label_record *record)
2154 struct variable **vars;
/* Convert every label to UTF-8 once, up front, because the same labels are
   applied to every variable below. */
2158 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2159 for (i = 0; i < record->n_labels; i++)
2160 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2161 record->labels[i].label, -1,
/* Translate the variable indexes in RECORD into variables. */
2164 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
2165 for (i = 0; i < record->n_vars; i++)
2167 vars[i] = lookup_var_by_index (r, record->pos,
2168 var_recs, n_var_recs, record->vars[i]);
2169 if (vars[i] == NULL)
/* Every variable must have the same type as the first one. */
2173 for (i = 1; i < record->n_vars; i++)
2174 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2176 sys_error (r, record->pos,
2177 _("Variables associated with value label are not all of "
2178 "identical type. Variable %s is %s, but variable "
2180 var_get_name (vars[0]),
2181 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2182 var_get_name (vars[i]),
2183 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
/* Add every label to every variable. */
2187 for (i = 0; i < record->n_vars; i++)
2189 struct variable *var = vars[i];
2193 width = var_get_width (var);
2196 sys_error (r, record->pos,
2197 _("Value labels may not be added to long string "
2198 "variables (e.g. %s) using records types 3 and 4."),
2199 var_get_name (var));
2203 for (j = 0; j < record->n_labels; j++)
2205 struct sfm_value_label *label = &record->labels[j];
/* Build the value in the variable's own width: a number, or the first WIDTH
   bytes of the raw value. */
2208 value_init (&value, width);
2210 value.f = parse_float (r, label->value, 0);
2212 memcpy (value_str_rw (&value, width), label->value, width);
/* A label that cannot be added is reported as a duplicate. */
2214 if (!var_add_value_label (var, &value, utf8_labels[j]))
2216 if (var_is_numeric (var))
2217 sys_warn (r, record->pos,
2218 _("Duplicate value label for %g on %s."),
2219 value.f, var_get_name (var));
2221 sys_warn (r, record->pos,
2222 _("Duplicate value label for `%.*s' on %s."),
2223 width, value_str (&value, width),
2224 var_get_name (var));
2227 value_destroy (&value, width);
/* Release the temporary arrays back to the pool. */
2231 pool_free (r->pool, vars);
2232 for (i = 0; i < record->n_labels; i++)
2233 pool_free (r->pool, utf8_labels[i]);
2234 pool_free (r->pool, utf8_labels);
2239 static struct variable *
2240 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2241 const struct sfm_var_record *var_recs, size_t n_var_recs,
2244 const struct sfm_var_record *rec;
2246 if (idx < 1 || idx > n_var_recs)
2248 sys_error (r, offset,
2249 _("Variable index %d not in valid range 1...%zu."),
2254 rec = &var_recs[idx - 1];
2255 if (rec->var == NULL)
2257 sys_error (r, offset,
2258 _("Variable index %d refers to long string continuation."),
2266 /* Parses a set of custom attributes from TEXT into ATTRS.
2267 ATTRS may be a null pointer, in which case the attributes are
2268 read but discarded. */
/* Each attribute is a key that ends at `(', followed by values that each
   end at a new-line, with `)' after the last value.  Attributes are read
   until a `/' follows one. */
2270 parse_attributes (struct sfm_reader *r, struct text_record *text,
2271 struct attrset *attrs)
2275 struct attribute *attr;
2279 /* Parse the key. */
2280 key = text_get_token (text, ss_cstr ("("), NULL);
2284 attr = attribute_create (key);
/* INDEX counts values from 1 and is used in the messages below. */
2285 for (index = 1; ; index++)
2287 /* Parse the value. */
2291 value = text_get_token (text, ss_cstr ("\n"), NULL);
2294 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
/* A value is expected to be enclosed in single quotes, which are stripped.
   An unquoted value draws a warning but is still added as it is. */
2299 length = strlen (value);
2300 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
2302 value[length - 1] = '\0';
2303 attribute_add_value (attr, value + 1);
2308 _("Attribute value %s[%d] is not quoted: %s."),
2310 attribute_add_value (attr, value);
2313 /* Was this the last value for this attribute? */
2314 if (text_match (text, ')'))
/* Either add the finished attribute to ATTRS or destroy it. */
2318 attrset_add (attrs, attr);
2320 attribute_destroy (attr);
2322 while (!text_match (text, '/'));
/* Reads record type 7, subtype 17, which lists custom attributes on the
   data file as a whole, and adds them to DICT's own attribute set. */
static void
parse_data_file_attributes (struct sfm_reader *r,
                            const struct sfm_extension_record *record,
                            struct dictionary *dict)
{
  struct text_record *text;
  struct attrset *file_attrs;

  text = open_text_record (r, record, true);
  file_attrs = dict_get_attributes (dict);
  parse_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
/* Parses record type 7, subtype 18, which lists custom attributes on
   individual variables.  The record is a series of variable names, each
   terminated by `:' and followed by that variable's attributes.  When no
   variable is obtained for a name, its attributes are still parsed, but
   they are discarded. */
static void
parse_variable_attributes (struct sfm_reader *r,
                           const struct sfm_extension_record *record,
                           struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, record, true);
  struct variable *var;

  for (;;)
    {
      struct attrset *attrs;

      if (!text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
        break;

      attrs = NULL;
      if (var != NULL)
        attrs = var_get_attributes (var);
      parse_attributes (r, text, attrs);
    }

  close_text_record (r, text);
}
2354 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2356 size_t n_warnings = 0;
2359 for (i = 0; i < dict_get_var_cnt (dict); i++)
2361 struct variable *var = dict_get_var (dict, i);
2362 struct attrset *attrs = var_get_attributes (var);
2363 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2366 int value = atoi (attribute_get_value (attr, 0));
2388 role = ROLE_PARTITION;
2397 if (n_warnings++ == 0)
2398 sys_warn (r, -1, _("Invalid role for variable %s."),
2399 var_get_name (var));
2402 var_set_role (var, role);
2407 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2412 check_overflow (struct sfm_reader *r,
2413 const struct sfm_extension_record *record,
2414 size_t ofs, size_t length)
2416 size_t end = record->size * record->count;
2417 if (length >= end || ofs + length > end)
2419 sys_warn (r, record->pos + end,
2420 _("Extension record subtype %d ends unexpectedly."),
/* Parses the extension record RECORD that holds value labels for long string
   variables and adds the labels to the variables in DICT.

   The record is a series of entries.  Each entry has a variable name
   preceded by its length, the variable's width, a count of labels, and then
   that many labels; each label is a value and a label string, each preceded
   by its length.  Every length is checked against the end of the record
   before it is used.  An entry for a variable that is unknown, numeric, or
   of a different width is parsed, but its labels are not added. */
2428 parse_long_string_value_labels (struct sfm_reader *r,
2429 const struct sfm_extension_record *record,
2430 struct dictionary *dict)
2432 const char *dict_encoding = dict_get_encoding (dict);
2433 size_t end = record->size * record->count;
2440 struct variable *var;
2445 /* Parse variable name length. */
2446 if (!check_overflow (r, record, ofs, 4))
2448 var_name_len = parse_int (r, record->data, ofs);
2451 /* Parse variable name, width, and number of labels. */
/* The 8 extra bytes are the width and the label count, which directly follow
   the name. */
2452 if (!check_overflow (r, record, ofs, var_name_len + 8))
2454 var_name = recode_string_pool ("UTF-8", dict_encoding,
2455 (const char *) record->data + ofs,
2456 var_name_len, r->pool);
2457 width = parse_int (r, record->data, ofs + var_name_len);
2458 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2459 ofs += var_name_len + 8;
2461 /* Look up 'var' and validate. */
2462 var = dict_lookup_var (dict, var_name);
2464 sys_warn (r, record->pos + ofs,
2465 _("Ignoring long string value label record for "
2466 "unknown variable %s."), var_name);
2467 else if (var_is_numeric (var))
2469 sys_warn (r, record->pos + ofs,
2470 _("Ignoring long string value label record for "
2471 "numeric variable %s."), var_name);
2474 else if (width != var_get_width (var))
2476 sys_warn (r, record->pos + ofs,
2477 _("Ignoring long string value label record for variable "
2478 "%s because the record's width (%d) does not match the "
2479 "variable's width (%d)."),
2480 var_name, width, var_get_width (var));
/* VALUE is reused for every label of this variable. */
2485 value_init_pool (r->pool, &value, width);
2486 for (i = 0; i < n_labels; i++)
2488 size_t value_length, label_length;
/* With no usable variable, the labels are still walked, to advance OFS, but
   they are not added. */
2489 bool skip = var == NULL;
2491 /* Parse value length. */
2492 if (!check_overflow (r, record, ofs, 4))
2494 value_length = parse_int (r, record->data, ofs);
2498 if (!check_overflow (r, record, ofs, value_length))
/* Only a value that is exactly as long as the variable is wide is used. */
2502 if (value_length == width)
2503 memcpy (value_str_rw (&value, width),
2504 (const uint8_t *) record->data + ofs, width);
2507 sys_warn (r, record->pos + ofs,
2508 _("Ignoring long string value label %zu for "
2509 "variable %s, with width %d, that has bad value "
2511 i, var_get_name (var), width, value_length);
2515 ofs += value_length;
2517 /* Parse label length. */
2518 if (!check_overflow (r, record, ofs, 4))
2520 label_length = parse_int (r, record->data, ofs);
2524 if (!check_overflow (r, record, ofs, label_length))
/* Convert the label to UTF-8 and add it.  The converted copy is released
   afterward. */
2530 label = recode_string_pool ("UTF-8", dict_encoding,
2531 (const char *) record->data + ofs,
2532 label_length, r->pool);
2533 if (!var_add_value_label (var, &value, label))
2534 sys_warn (r, record->pos + ofs,
2535 _("Duplicate value label for `%.*s' on %s."),
2536 width, value_str (&value, width),
2537 var_get_name (var));
2538 pool_free (r->pool, label);
2540 ofs += label_length;
/* Parses the extension record RECORD that holds missing values for long
   string variables and sets the missing values of the variables in DICT.

   The record is a series of entries.  Each entry has a variable name
   preceded by its length, a one-byte count of missing values, and then that
   many values, each preceded by its length.  Every length is checked against
   the end of the record before it is used. */
2546 parse_long_string_missing_values (struct sfm_reader *r,
2547 const struct sfm_extension_record *record,
2548 struct dictionary *dict)
2550 const char *dict_encoding = dict_get_encoding (dict);
2551 size_t end = record->size * record->count;
2556 struct missing_values mv;
2558 struct variable *var;
2559 int n_missing_values;
2563 /* Parse variable name length. */
2564 if (!check_overflow (r, record, ofs, 4))
2566 var_name_len = parse_int (r, record->data, ofs);
2569 /* Parse variable name. */
/* The extra byte is the count of missing values, which directly follows the
   name. */
2570 if (!check_overflow (r, record, ofs, var_name_len + 1))
2572 var_name = recode_string_pool ("UTF-8", dict_encoding,
2573 (const char *) record->data + ofs,
2574 var_name_len, r->pool);
2575 ofs += var_name_len;
2577 /* Parse number of missing values. */
2578 n_missing_values = ((const uint8_t *) record->data)[ofs];
2579 if (n_missing_values < 1 || n_missing_values > 3)
2580 sys_warn (r, record->pos + ofs,
2581 _("Long string missing values record says variable %s "
2582 "has %d missing values, but only 1 to 3 missing values "
2584 var_name, n_missing_values);
2587 /* Look up 'var' and validate. */
2588 var = dict_lookup_var (dict, var_name);
2590 sys_warn (r, record->pos + ofs,
2591 _("Ignoring long string missing value record for "
2592 "unknown variable %s."), var_name);
2593 else if (var_is_numeric (var))
2595 sys_warn (r, record->pos + ofs,
2596 _("Ignoring long string missing value record for "
2597 "numeric variable %s."), var_name);
/* Without a variable, a width of 8 is used as a placeholder so that MV can
   still be initialized. */
2602 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2603 for (i = 0; i < n_missing_values; i++)
2605 size_t value_length;
2607 /* Parse value length. */
2608 if (!check_overflow (r, record, ofs, 4))
2610 value_length = parse_int (r, record->data, ofs);
2614 if (!check_overflow (r, record, ofs, value_length))
/* A value that cannot be added to MV is reported and otherwise ignored. */
2618 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2620 sys_warn (r, record->pos + ofs,
2621 _("Ignoring long string missing value %zu for variable "
2622 "%s, with width %d, that has bad value width %zu."),
2623 i, var_get_name (var), var_get_width (var),
2625 ofs += value_length;
2628 var_set_missing_values (var, &mv);
2634 static void partial_record (struct sfm_reader *);
2636 static void read_error (struct casereader *, const struct sfm_reader *);
2638 static bool read_case_number (struct sfm_reader *, double *);
2639 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2640 static int read_opcode (struct sfm_reader *);
2641 static bool read_compressed_number (struct sfm_reader *, double *);
2642 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2643 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2644 static bool skip_whole_strings (struct sfm_reader *, size_t);
/* Casereader "read" callback for system files.  R_ is the struct sfm_reader
   for the file.  Nothing is read once R is in an error state or when it has
   no variables to read. */
2646 /* Reads and returns one case from READER's file. Returns a null
2647 pointer if not successful. */
2648 static struct ccase *
2649 sys_file_casereader_read (struct casereader *reader, void *r_)
2651 struct sfm_reader *r = r_;
2656 if (r->error || !r->sfm_var_cnt)
2659 c = case_create (r->proto);
2661 for (i = 0; i < r->sfm_var_cnt; i++)
2663 struct sfm_var *sv = &r->sfm_vars[i];
2664 union value *v = case_data_rw_idx (c, sv->case_index);
/* A width of 0 selects the numeric path; anything else is a string
   segment that is stored at SV->offset within the value. */
2666 if (sv->var_width == 0)
2667 retval = read_case_number (r, &v->f);
2670 uint8_t *s = value_str_rw (v, sv->var_width);
2671 retval = read_case_string (r, s + sv->offset, sv->segment_width);
/* Padding is skipped in whole 8-byte units. */
2674 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2676 sys_error (r, r->pos, _("File ends in partial string value."));
/* A case count of -1 suppresses the read error.
   NOTE(review): presumably -1 means "case count unknown" -- confirm. */
2688 if (r->case_cnt != -1)
2689 read_error (reader, r);
2694 /* Issues an error that R ends in a partial record. */
2696 partial_record (struct sfm_reader *r)
2698 sys_error (r, r->pos, _("File ends in partial case."));
2701 /* Issues an error that an unspecified error occurred SFM, and
2704 read_error (struct casereader *r, const struct sfm_reader *sfm)
2706 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2707 casereader_force_error (r);
2710 /* Reads a number from R and stores its value in *D.
2711 If R is compressed, reads a compressed number;
2712 otherwise, reads a number in the regular way.
2713 Returns true if successful, false if end of file is
2714 reached immediately. */
2716 read_case_number (struct sfm_reader *r, double *d)
2718 if (r->compression == ANY_COMP_NONE)
2721 if (!try_read_bytes (r, number, sizeof number))
2723 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2727 return read_compressed_number (r, d);
/* Splits LENGTH into WHOLE (the largest multiple of 8 not exceeding it) and
   PARTIAL (the remainder).  The whole part is read straight into S; the
   partial part is read as one full 8-byte unit into a bounce buffer, of
   which only the first PARTIAL bytes are copied into S. */
2730 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2731 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2732 discarded without being written to S. Reads compressed strings if S is
2733 compressed. Returns 1 if successful, 0 if end of file is reached
2734 immediately, or -1 for some kind of error. */
2736 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
2738 size_t whole = ROUND_DOWN (length, 8);
2739 size_t partial = length % 8;
2743 int retval = read_whole_strings (r, s, whole);
/* Always read a complete 8-byte unit, even though fewer bytes are kept. */
2751 int retval = read_whole_strings (r, bounce, sizeof bounce);
2763 memcpy (s + whole, bounce, partial);
2769 /* Reads and returns the next compression opcode from R. */
2771 read_opcode (struct sfm_reader *r)
2773 assert (r->compression != ANY_COMP_NONE);
2777 if (r->opcode_idx >= sizeof r->opcodes)
2780 int retval = try_read_compressed_bytes (r, r->opcodes,
2786 opcode = r->opcodes[r->opcode_idx++];
2793 /* Reads a compressed number from R and stores its value in D.
2794 Returns true if successful, false if end of file is
2795 reached immediately. */
2797 read_compressed_number (struct sfm_reader *r, double *d)
2799 int opcode = read_opcode (r);
2807 return read_compressed_float (r, d);
2810 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2811 if (!r->corruption_warning)
2813 r->corruption_warning = true;
2814 sys_warn (r, r->pos,
2815 _("Possible compressed data corruption: "
2816 "compressed spaces appear in numeric field."));
2825 *d = opcode - r->bias;
/* Decodes one opcode into the 8 bytes at DST.  In the visible cases DST is
   filled either by reading 8 literal bytes from the compressed data, by
   writing eight spaces, or by converting the number OPCODE - BIAS into R's
   floating-point format.  The last of these is unexpected in a string field
   and draws a one-time corruption warning unless the value is the one
   excused by the comment below. */
2832 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2834 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2839 opcode = read_opcode (r);
2847 retval = read_compressed_bytes (r, dst, 8);
/* Anything other than a complete read is reported as an error (-1). */
2848 return retval == 1 ? 1 : -1;
2851 memset (dst, ' ', 8);
2856 double value = opcode - r->bias;
2857 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2860 /* This has actually been seen "in the wild". The submitter of the
2861 file that showed that the contents decoded as spaces, but they
2862 were at the end of the field so it's possible that the null
2863 bytes just acted as null terminators. */
2865 else if (!r->corruption_warning)
2867 r->corruption_warning = true;
2868 sys_warn (r, r->pos,
2869 _("Possible compressed data corruption: "
2870 "string contains compressed integer (opcode %d)."),
/* Uncompressed files are read with a single try_read_bytes() call.
   Compressed files are decoded one 8-byte segment at a time with
   read_compressed_string(). */
2878 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2879 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2880 end of file is reached immediately, or -1 for some kind of error. */
2882 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2884 assert (length % 8 == 0);
2885 if (r->compression == ANY_COMP_NONE)
2886 return try_read_bytes (r, s, length);
2891 for (ofs = 0; ofs < length; ofs += 8)
2893 int retval = read_compressed_string (r, s + ofs);
/* Skips LENGTH string bytes from R.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, but that's only because the
   current caller never needs more than that many bytes.)
   Returns true if successful, false if end of file is
   reached immediately or if the read fails. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];

  /* BUFFER can hold exactly the documented maximum of 1024 bytes. */
  assert (length <= sizeof buffer);

  /* read_whole_strings() returns 1, 0, or -1.  Returning that value directly
     would convert -1 (an error) to true, so compare against 1 explicitly. */
  return read_whole_strings (r, buffer, length) == 1;
}
2922 /* Helpers for reading records that contain structured text
2925 /* Maximum number of warnings to issue for a single text
2927 #define MAX_TEXT_WARNINGS 5
2932 struct substring buffer; /* Record contents. */
2933 off_t start; /* Starting offset in file. */
2934 size_t pos; /* Current position in buffer. */
2935 int n_warnings; /* Number of warnings issued or suppressed. */
2936 bool recoded; /* Recoded into UTF-8? */
/* Allocates, from R's pool, a text_record that wraps the contents of
   RECORD.  When RECODE_TO_UTF8 is set, the record's bytes are passed through
   recode_substring_pool() with "UTF-8" and R's encoding.  The new record
   starts with no warnings issued and remembers whether it was recoded. */
2939 static struct text_record *
2940 open_text_record (struct sfm_reader *r,
2941 const struct sfm_extension_record *record,
2942 bool recode_to_utf8)
2944 struct text_record *text;
2945 struct substring raw;
2947 text = pool_alloc (r->pool, sizeof *text);
/* RAW covers the whole payload: size * count bytes. */
2948 raw = ss_buffer (record->data, record->size * record->count);
2949 text->start = record->pos;
2950 text->buffer = (recode_to_utf8
2951 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2954 text->n_warnings = 0;
2955 text->recoded = recode_to_utf8;
2960 /* Closes TEXT, frees its storage, and issues a final warning
2961 about suppressed warnings if necesary. */
2963 close_text_record (struct sfm_reader *r, struct text_record *text)
2965 if (text->n_warnings > MAX_TEXT_WARNINGS)
2966 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2967 text->n_warnings - MAX_TEXT_WARNINGS);
2969 pool_free (r->pool, ss_data (text->buffer));
/* The variable name is read as a short name terminated by "=".  The value
   is the token that follows, terminated by a tab or a null byte.  Any run of
   tabs and null bytes after the value is then skipped. */
2972 /* Reads a variable=value pair from TEXT.
2973 Looks up the variable in DICT and stores it into *VAR.
2974 Stores a null-terminated value into *VALUE. */
2976 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2977 struct text_record *text,
2978 struct variable **var, char **value)
2982 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2985 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
/* Step over all consecutive delimiter bytes. */
2989 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2990 ss_buffer ("\t\0", 2));
/* Reads the next token from TEXT, delimited by DELIMITERS, looks it up as a
   variable name in DICT, and stores the result in *VAR.  A warning is issued
   through text_warn() for a name that DICT does not contain. */
2998 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2999 struct text_record *text, struct substring delimiters,
3000 struct variable **var)
3004 name = text_get_token (text, delimiters, NULL);
3008 *var = dict_lookup_var (dict, name);
3012 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads the next token from TEXT, delimited by DELIMITERS, as a short
   variable name, looks it up in DICT with dict_lookup_var(), and stores the
   result in *VAR.  A warning is issued through text_warn() for a name that
   DICT does not contain. */
3019 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3020 struct text_record *text, struct substring delimiters,
3021 struct variable **var)
3023 char *short_name = text_get_token (text, delimiters, NULL);
/* A null token means there was nothing left to read in TEXT. */
3024 if (short_name == NULL)
3027 *var = dict_lookup_var (dict, short_name);
3029 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* FORMAT is a printf-style format string for the variable arguments.  Every
   call counts toward TEXT's warning total, but only the first
   MAX_TEXT_WARNINGS calls actually produce a message.  The message offset is
   TEXT's starting file offset plus the current position in its buffer. */
3034 /* Displays a warning for the current file position, limiting the
3035 number to MAX_TEXT_WARNINGS for TEXT. */
3037 text_warn (struct sfm_reader *r, struct text_record *text,
3038 const char *format, ...)
/* The counter is incremented even when the warning is suppressed. */
3040 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3044 va_start (args, format);
3045 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts the next token from TEXT's buffer with ss_tokenize(), using
   DELIMITERS as the delimiter set and advancing TEXT's position.  Returns a
   pointer to the token's data inside TEXT's buffer.

   NOTE(review): the third parameter and what is done with END are outside
   the visible lines; it looks like an optional out-parameter named DELIMITER
   -- confirm. */
3051 text_get_token (struct text_record *text, struct substring delimiters,
3054 struct substring token;
3057 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
/* END addresses the byte just past the token. */
3060 end = &ss_data (token)[ss_length (token)];
3061 if (delimiter != NULL)
3064 return ss_data (token);
3067 /* Reads a integer value expressed in decimal, then a space, then a string that
3068 consists of exactly as many bytes as specified by the integer, then a space,
3069 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3070 buffer (so the caller should not free the string). */
3072 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3080 while (text->pos < text->buffer.length)
3082 int c = text->buffer.string[text->pos];
3083 if (c < '0' || c > '9')
3085 n = (n * 10) + (c - '0');
3088 if (text->pos >= text->buffer.length || start == text->pos)
3090 sys_warn (r, text->start,
3091 _("Expecting digit at offset %zu in MRSETS record."),
3096 if (!text_match (text, ' '))
3098 sys_warn (r, text->start,
3099 _("Expecting space at offset %zu in MRSETS record."),
3104 if (text->pos + n > text->buffer.length)
3106 sys_warn (r, text->start,
3107 _("%zu-byte string starting at offset %zu "
3108 "exceeds record length %zu."),
3109 n, text->pos, text->buffer.length);
3113 s = &text->buffer.string[text->pos];
3116 sys_warn (r, text->start,
3117 _("Expecting space at offset %zu following %zu-byte string."),
3127 text_match (struct text_record *text, char c)
3129 if (text->buffer.string[text->pos] == c)
3138 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3139 inside the TEXT's string. */
3141 text_pos (const struct text_record *text)
3147 text_get_all (const struct text_record *text)
3149 return text->buffer.string;
/* Builds a message whose text starts with R's file name, plus OFFSET in
   hexadecimal in the "near offset" form of the prefix, followed by FORMAT
   expanded with ARGS.  CLASS supplies the message's category and
   severity. */
3154 /* Displays a corruption message. */
3156 sys_msg (struct sfm_reader *r, off_t offset,
3157 int class, const char *format, va_list args)
3162 ds_init_empty (&text);
3164 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3165 fh_get_file_name (r->fh), (long long int) offset);
/* Alternative prefix without an offset. */
3167 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3168 ds_put_vformat (&text, format, args);
3170 m.category = msg_class_to_category (class);
3171 m.severity = msg_class_to_severity (class);
3177 m.text = ds_cstr (&text);
3182 /* Displays a warning for offset OFFSET in the file. */
3184 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3188 va_start (args, format);
3189 sys_msg (r, offset, MW, format, args);
3193 /* Displays an error for the current file position and marks it as in an error
3196 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3200 va_start (args, format);
3201 sys_msg (r, offset, ME, format, args);
3207 /* Reads BYTE_CNT bytes into BUF.
3208 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3209 Returns -1 if an I/O error or a partial read occurs.
3210 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3213 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3214 void *buf, size_t byte_cnt)
3216 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3217 r->pos += bytes_read;
3218 if (bytes_read == byte_cnt)
3220 else if (ferror (r->file))
3222 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3225 else if (!eof_is_ok || bytes_read != 0)
3227 sys_error (r, r->pos, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   Returns true only if all of them were read.  End of file and I/O errors
   both yield false. */
static bool
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  int status = read_bytes_internal (r, false, buf, byte_cnt);

  return status == 1;
}
/* Like read_bytes(), but an end of file before the first byte is a normal
   outcome rather than an error.
   Returns 1 if exactly BYTE_CNT bytes were read into BUF, 0 for an immediate
   end of file, and -1 for an I/O error or a partial read. */
static int
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  int status = read_bytes_internal (r, true, buf, byte_cnt);

  return status;
}
3253 /* Reads a 32-bit signed integer from R and stores its value in host format in
3254 *X. Returns true if successful, otherwise false. */
3256 read_int (struct sfm_reader *r, int *x)
3259 if (read_bytes (r, integer, sizeof integer) != 1)
3261 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Reads a 32-bit integer from R with read_int() and stores it, converted to
   unsigned, in *X.  Returns true if successful, otherwise false.  *X is set
   to 0 on failure. */
static bool
read_uint (struct sfm_reader *r, unsigned int *x)
{
  /* read_int() does not store anything on failure, so start from a defined
     value instead of copying an indeterminate one into *X. */
  int y = 0;
  bool ok = read_int (r, &y);

  *x = y;
  return ok;
}
3276 /* Reads a 64-bit signed integer from R and returns its value in
3279 read_int64 (struct sfm_reader *r, long long int *x)
3282 if (read_bytes (r, integer, sizeof integer) != 1)
3284 *x = integer_get (r->integer_format, integer, sizeof integer);
/* Reads a 64-bit integer from R with read_int64() and stores it, converted
   to unsigned, in *X.  Returns true if successful, otherwise false.  *X is
   set to 0 on failure. */
static bool
read_uint64 (struct sfm_reader *r, unsigned long long int *x)
{
  /* read_int64() does not store anything on failure, so start from a defined
     value instead of copying an indeterminate one into *X. */
  long long int y = 0;
  bool ok = read_int64 (r, &y);

  *x = y;
  return ok;
}
3302 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3304 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3308 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3310 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
/* Reads exactly SIZE - 1 bytes from R into BUFFER and, if that succeeds,
   stores a null byte into BUFFER[SIZE - 1].  SIZE must be positive.
   Returns true if successful, false otherwise. */
static bool
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  assert (size > 0);

  if (!read_bytes (r, buffer, size - 1))
    return false;

  buffer[size - 1] = '\0';
  return true;
}
/* Reads and discards BYTES bytes from R, at most 1024 at a time.
   Returns true if all of them could be read, false otherwise. */
static bool
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  size_t left;

  for (left = bytes; left > 0; )
    {
      char scratch[1024];
      size_t chunk = MIN (sizeof scratch, left);

      if (!read_bytes (r, scratch, chunk))
        return false;
      left -= chunk;
    }

  return true;
}
/* Returns a malloc()'d copy of S in which every CR LF pair and every lone CR
   has been replaced by a single LF.

   (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
   files that use CR-only line ends in the file label and extra product
   info.) */
static char *
fix_line_ends (const char *s)
{
  /* The result is never longer than the input. */
  char *dst = xmalloc (strlen (s) + 1);
  char *out = dst;
  const char *in = s;

  while (*in != '\0')
    {
      char c = *in++;

      if (c == '\r')
        {
          /* Swallow the LF of a CR LF pair. */
          if (*in == '\n')
            in++;
          c = '\n';
        }
      *out++ = c;
    }
  *out = '\0';

  return dst;
}
3373 read_ztrailer (struct sfm_reader *r,
3374 long long int zheader_ofs,
3375 long long int ztrailer_len);
3378 zalloc (voidpf pool_, uInt items, uInt size)
3380 struct pool *pool = pool_;
3382 return (!size || xalloc_oversized (items, size)
3384 : pool_malloc (pool, items * size));
3388 zfree (voidpf pool_, voidpf address)
3390 struct pool *pool = pool_;
3392 pool_free (pool, address);
3396 read_zheader (struct sfm_reader *r)
3399 long long int zheader_ofs;
3400 long long int ztrailer_ofs;
3401 long long int ztrailer_len;
3403 if (!read_int64 (r, &zheader_ofs)
3404 || !read_int64 (r, &ztrailer_ofs)
3405 || !read_int64 (r, &ztrailer_len))
3408 if (zheader_ofs != pos)
3410 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3411 "(expected %#llx)."),
3412 zheader_ofs, (long long int) pos);
3416 if (ztrailer_ofs < r->pos)
3418 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3423 if (ztrailer_len < 24 || ztrailer_len % 24)
3425 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3429 r->ztrailer_ofs = ztrailer_ofs;
3430 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3433 if (r->zin_buf == NULL)
3435 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3436 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3437 r->zstream.next_in = NULL;
3438 r->zstream.avail_in = 0;
3441 r->zstream.zalloc = zalloc;
3442 r->zstream.zfree = zfree;
3443 r->zstream.opaque = r->pool;
3445 return open_zstream (r);
3449 seek (struct sfm_reader *r, off_t offset)
3451 if (fseeko (r->file, offset, SEEK_SET))
3452 sys_error (r, 0, _("%s: seek failed (%s)."),
3453 fh_get_file_name (r->fh), strerror (errno));
3457 /* Performs some additional consistency checks on the ZLIB compressed data
3460 read_ztrailer (struct sfm_reader *r,
3461 long long int zheader_ofs,
3462 long long int ztrailer_len)
3464 long long int expected_uncmp_ofs;
3465 long long int expected_cmp_ofs;
3468 unsigned int block_size;
3469 unsigned int n_blocks;
3473 if (fstat (fileno (r->file), &s))
3475 sys_error (ME, 0, _("%s: stat failed (%s)."),
3476 fh_get_file_name (r->fh), strerror (errno));
3480 if (!S_ISREG (s.st_mode))
3482 /* We can't seek to the trailer and then back to the data in this file,
3483 so skip doing extra checks. */
3487 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3488 sys_warn (r, r->pos,
3489 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3490 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3492 seek (r, r->ztrailer_ofs);
3494 /* Read fixed header from ZLIB data trailer. */
3495 if (!read_int64 (r, &bias))
3497 if (-bias != r->bias)
3499 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3500 "file header bias (%.2f)."),
3505 if (!read_int64 (r, &zero))
3508 sys_warn (r, r->pos,
3509 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3511 if (!read_uint (r, &block_size))
3513 if (block_size != ZBLOCK_SIZE)
3514 sys_warn (r, r->pos,
3515 _("ZLIB trailer specifies unexpected %u-byte block size."),
3518 if (!read_uint (r, &n_blocks))
3520 if (n_blocks != (ztrailer_len - 24) / 24)
3522 sys_error (r, r->pos,
3523 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3525 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3529 expected_uncmp_ofs = zheader_ofs;
3530 expected_cmp_ofs = zheader_ofs + 24;
3531 for (i = 0; i < n_blocks; i++)
3533 off_t desc_ofs = r->pos;
3534 unsigned long long int uncompressed_ofs;
3535 unsigned long long int compressed_ofs;
3536 unsigned int uncompressed_size;
3537 unsigned int compressed_size;
3539 if (!read_uint64 (r, &uncompressed_ofs)
3540 || !read_uint64 (r, &compressed_ofs)
3541 || !read_uint (r, &uncompressed_size)
3542 || !read_uint (r, &compressed_size))
3545 if (uncompressed_ofs != expected_uncmp_ofs)
3547 sys_error (r, desc_ofs,
3548 _("ZLIB block descriptor %u reported uncompressed data "
3549 "offset %#llx, when %#llx was expected."),
3550 i, uncompressed_ofs, expected_uncmp_ofs);
3554 if (compressed_ofs != expected_cmp_ofs)
3556 sys_error (r, desc_ofs,
3557 _("ZLIB block descriptor %u reported compressed data "
3558 "offset %#llx, when %#llx was expected."),
3559 i, compressed_ofs, expected_cmp_ofs);
3563 if (i < n_blocks - 1)
3565 if (uncompressed_size != block_size)
3566 sys_warn (r, desc_ofs,
3567 _("ZLIB block descriptor %u reported block size %#x, "
3568 "when %#x was expected."),
3569 i, uncompressed_size, block_size);
3573 if (uncompressed_size > block_size)
3574 sys_warn (r, desc_ofs,
3575 _("ZLIB block descriptor %u reported block size %#x, "
3576 "when at most %#x was expected."),
3577 i, uncompressed_size, block_size);
3580 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3581 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3582 This code checks for an expansion of more than 14.3% plus 11
3584 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3586 sys_error (r, desc_ofs,
3587 _("ZLIB block descriptor %u reports compressed size %u "
3588 "and uncompressed size %u."),
3589 i, compressed_size, uncompressed_size);
3593 expected_uncmp_ofs += uncompressed_size;
3594 expected_cmp_ofs += compressed_size;
3597 if (expected_cmp_ofs != r->ztrailer_ofs)
3599 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3600 "would be expected from block descriptors."),
3601 r->ztrailer_ofs, expected_cmp_ofs);
3605 seek (r, zheader_ofs + 24);
3610 open_zstream (struct sfm_reader *r)
3614 r->zout_pos = r->zout_end = 0;
3615 error = inflateInit (&r->zstream);
3618 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3626 close_zstream (struct sfm_reader *r)
3630 error = inflateEnd (&r->zstream);
3633 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
/* Supplies BYTE_CNT bytes of inflated data to BUF_.  Bytes already inflated
   into R's output buffer are handed out first.  When that buffer is empty,
   more compressed input is read from the file (never past the ZLIB trailer)
   and inflated.  When inflation yields no output and reports the end of a
   ZLIB stream, the stream is closed and a new one is opened. */
3641 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3643 uint8_t *buf = buf_;
3652 /* Use already inflated data if there is any. */
3653 if (r->zout_pos < r->zout_end)
3655 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3656 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3665 /* We need to inflate some more data.
3666 Get some more input data if we don't have any. */
3667 if (r->zstream.avail_in == 0)
/* Read at most one input buffer's worth, stopping at the trailer. */
3669 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3674 int retval = try_read_bytes (r, r->zin_buf, n);
3677 r->zstream.avail_in = n;
3678 r->zstream.next_in = r->zin_buf;
3682 /* Inflate the (remaining) input data. */
3683 r->zstream.avail_out = ZOUT_BUF_SIZE;
3684 r->zstream.next_out = r->zout_buf;
3685 error = inflate (&r->zstream, Z_SYNC_FLUSH);
/* Number of bytes that inflate() just produced. */
3687 r->zout_end = r->zstream.next_out - r->zout_buf;
3688 if (r->zout_end == 0)
3690 if (error != Z_STREAM_END)
3692 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3696 else if (!close_zstream (r) || !open_zstream (r))
3701 /* Process the output data and ignore 'error' for now. ZLIB will
3702 present it to us again on the next inflate() call. */
3708 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3710 if (r->compression == ANY_COMP_SIMPLE)
3711 return read_bytes (r, buf, byte_cnt);
3714 int retval = read_bytes_zlib (r, buf, byte_cnt);
3716 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3722 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3724 if (r->compression == ANY_COMP_SIMPLE)
3725 return try_read_bytes (r, buf, byte_cnt);
3727 return read_bytes_zlib (r, buf, byte_cnt);
3730 /* Reads a 64-bit floating-point number from R and returns its
3731 value in host format. */
3733 read_compressed_float (struct sfm_reader *r, double *d)
3737 if (!read_compressed_bytes (r, number, sizeof number))
3740 *d = float_get_double (r->float_format, number);
/* Casereader class for system files.  Its initializer lists
   sys_file_casereader_read and sys_file_casereader_destroy.
   NOTE(review): presumably these fill the class's read and destroy
   callbacks, in that order -- confirm against struct casereader_class. */
3744 static const struct casereader_class sys_file_casereader_class =
3746 sys_file_casereader_read,
3747 sys_file_casereader_destroy,
3752 const struct any_reader_class sys_file_reader_class =
3754 N_("SPSS System File"),