1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-private.h"
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/message.h"
49 #include "libpspp/misc.h"
50 #include "libpspp/pool.h"
51 #include "libpspp/str.h"
52 #include "libpspp/stringi-set.h"
54 #include "gl/c-strtod.h"
55 #include "gl/c-ctype.h"
56 #include "gl/inttostr.h"
57 #include "gl/localcharset.h"
58 #include "gl/minmax.h"
59 #include "gl/unlocked-io.h"
60 #include "gl/xalloc.h"
61 #include "gl/xalloc-oversized.h"
65 #define _(msgid) gettext (msgid)
66 #define N_(msgid) (msgid)
70 /* subtypes 0-2 unknown */
71 EXT_INTEGER = 3, /* Machine integer info. */
72 EXT_FLOAT = 4, /* Machine floating-point info. */
73 EXT_VAR_SETS = 5, /* Variable sets. */
74 EXT_DATE = 6, /* DATE. */
75 EXT_MRSETS = 7, /* Multiple response sets. */
76 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
77 /* subtype 9 unknown */
78 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
79 EXT_DISPLAY = 11, /* Variable display parameters. */
80 /* subtype 12 unknown */
81 EXT_LONG_NAMES = 13, /* Long variable names. */
82 EXT_LONG_STRINGS = 14, /* Long strings. */
83 /* subtype 15 unknown */
84 EXT_NCASES = 16, /* Extended number of cases. */
85 EXT_FILE_ATTRS = 17, /* Data file attributes. */
86 EXT_VAR_ATTRS = 18, /* Variable attributes. */
87 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
88 EXT_ENCODING = 20, /* Character encoding. */
89 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
90 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
91 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
94 /* Fields from the top-level header record. */
95 struct sfm_header_record
97 char magic[5]; /* First 4 bytes of file, then null. */
98 int weight_idx; /* 0 if unweighted, otherwise a var index. */
99 int nominal_case_size; /* Number of var positions. */
101 /* These correspond to the members of struct any_read_info or a dictionary
102 but in the system file's encoding rather than ASCII. */
103 char creation_date[10]; /* "dd mmm yy". */
104 char creation_time[9]; /* "hh:mm:ss". */
105 char eye_catcher[61]; /* Eye-catcher string, then product name. */
106 char file_label[65]; /* File label. */
109 struct sfm_var_record
116 int missing_value_code;
119 struct variable *var;
122 struct sfm_value_label
128 struct sfm_value_label_record
131 struct sfm_value_label *labels;
132 unsigned int n_labels;
138 struct sfm_document_record
147 const char *name; /* Name. */
148 const char *label; /* Human-readable label for group. */
149 enum mrset_type type; /* Group type. */
150 const char **vars; /* Constituent variables' names. */
151 size_t n_vars; /* Number of constituent variables. */
154 enum mrset_md_cat_source cat_source; /* Source of category labels. */
155 bool label_from_var_label; /* 'label' taken from variable label? */
156 const char *counted; /* Counted value, as string. */
159 struct sfm_extension_record
161 int subtype; /* Record subtype. */
162 off_t pos; /* Starting offset in file. */
163 unsigned int size; /* Size of data elements. */
164 unsigned int count; /* Number of data elements. */
165 void *data; /* Contents. */
168 /* System file reader. */
171 struct any_reader any_reader;
173 /* Resource tracking. */
174 struct pool *pool; /* All system file state. */
177 struct any_read_info info;
178 struct sfm_header_record header;
179 struct sfm_var_record *vars;
181 struct sfm_value_label_record *labels;
183 struct sfm_document_record *document;
184 struct sfm_mrset *mrsets;
186 struct sfm_extension_record *extensions[32];
189 struct file_handle *fh; /* File handle. */
190 struct fh_lock *lock; /* Mutual exclusion for file handle. */
191 FILE *file; /* File stream. */
192 off_t pos; /* Position in file. */
193 bool error; /* I/O or corruption error? */
194 struct caseproto *proto; /* Format of output cases. */
197 enum integer_format integer_format; /* On-disk integer format. */
198 enum float_format float_format; /* On-disk floating point format. */
199 struct sfm_var *sfm_vars; /* Variables. */
200 size_t sfm_var_cnt; /* Number of variables. */
201 int case_cnt; /* Number of cases */
202 const char *encoding; /* String encoding. */
205 enum any_compression compression;
206 double bias; /* Compression bias, usually 100.0. */
207 uint8_t opcodes[8]; /* Current block of opcodes. */
208 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
209 bool corruption_warning; /* Warned about possible corruption? */
211 /* ZLIB decompression. */
212 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
213 #define ZIN_BUF_SIZE 4096
214 uint8_t *zin_buf; /* Inflation input buffer. */
215 #define ZOUT_BUF_SIZE 16384
216 uint8_t *zout_buf; /* Inflation output buffer. */
217 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
218 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
219 z_stream zstream; /* ZLIB inflater. */
222 static const struct casereader_class sys_file_casereader_class;
/* Downcasts generic reader R_ to the struct sfm_reader that holds it in its
   'any_reader' member.  The assertion checks that R_'s class really is
   sys_file_reader_class before the cast is made. */
224 static struct sfm_reader *
225 sfm_reader_cast (const struct any_reader *r_)
227 assert (r_->klass == &sys_file_reader_class);
228 return UP_CAST (r_, struct sfm_reader, any_reader);
231 static bool sfm_close (struct any_reader *);
233 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
234 const struct sfm_var_record *,
237 static void sys_msg (struct sfm_reader *r, off_t, int class,
238 const char *format, va_list args)
239 PRINTF_FORMAT (4, 0);
240 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
241 PRINTF_FORMAT (3, 4);
242 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
243 PRINTF_FORMAT (3, 4);
245 static bool read_bytes (struct sfm_reader *, void *, size_t)
247 static int try_read_bytes (struct sfm_reader *, void *, size_t)
249 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
250 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
251 static bool read_int64 (struct sfm_reader *, long long int *)
253 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
255 static bool read_string (struct sfm_reader *, char *, size_t)
257 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
259 /* ZLIB compressed data handling. */
260 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
261 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
262 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
263 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
265 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
267 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
269 static bool read_compressed_float (struct sfm_reader *, double *)
272 static char *fix_line_ends (const char *);
274 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
275 static double parse_float (const struct sfm_reader *,
276 const void *data, size_t ofs);
278 static bool read_variable_record (struct sfm_reader *,
279 struct sfm_var_record *);
280 static bool read_value_label_record (struct sfm_reader *,
281 struct sfm_value_label_record *);
282 static struct sfm_document_record *read_document_record (struct sfm_reader *);
283 static bool read_extension_record (struct sfm_reader *, int subtype,
284 struct sfm_extension_record **);
285 static bool skip_extension_record (struct sfm_reader *, int subtype);
287 static struct text_record *open_text_record (
288 struct sfm_reader *, const struct sfm_extension_record *,
289 bool recode_to_utf8);
290 static void close_text_record (struct sfm_reader *,
291 struct text_record *);
292 static bool read_variable_to_value_pair (struct sfm_reader *,
294 struct text_record *,
295 struct variable **var, char **value);
296 static void text_warn (struct sfm_reader *r, struct text_record *text,
297 const char *format, ...)
298 PRINTF_FORMAT (3, 4);
299 static char *text_get_token (struct text_record *,
300 struct substring delimiters, char *delimiter);
301 static bool text_match (struct text_record *, char c);
302 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
303 struct text_record *,
304 struct substring delimiters,
306 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
307 struct text_record *,
308 struct substring delimiters,
310 static const char *text_parse_counted_string (struct sfm_reader *,
311 struct text_record *);
312 static size_t text_pos (const struct text_record *);
313 static const char *text_get_all (const struct text_record *);
315 /* Dictionary reader. */
323 static bool read_dictionary (struct sfm_reader *);
324 static bool read_record (struct sfm_reader *, int type,
325 size_t *allocated_vars, size_t *allocated_labels);
326 static bool read_header (struct sfm_reader *, struct any_read_info *,
327 struct sfm_header_record *);
328 static void parse_header (struct sfm_reader *,
329 const struct sfm_header_record *,
330 struct any_read_info *, struct dictionary *);
331 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
332 struct sfm_var_record *, size_t n);
333 static void parse_format_spec (struct sfm_reader *, off_t pos,
334 unsigned int format, enum which_format,
335 struct variable *, int *format_warning_cnt);
336 static void parse_document (struct dictionary *, struct sfm_document_record *);
337 static void parse_display_parameters (struct sfm_reader *,
338 const struct sfm_extension_record *,
339 struct dictionary *);
340 static bool parse_machine_integer_info (struct sfm_reader *,
341 const struct sfm_extension_record *,
342 struct any_read_info *);
343 static void parse_machine_float_info (struct sfm_reader *,
344 const struct sfm_extension_record *);
345 static void parse_extra_product_info (struct sfm_reader *,
346 const struct sfm_extension_record *,
347 struct any_read_info *);
348 static void parse_mrsets (struct sfm_reader *,
349 const struct sfm_extension_record *,
350 size_t *allocated_mrsets);
351 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
352 static void parse_long_var_name_map (struct sfm_reader *,
353 const struct sfm_extension_record *,
354 struct dictionary *);
355 static bool parse_long_string_map (struct sfm_reader *,
356 const struct sfm_extension_record *,
357 struct dictionary *);
358 static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
359 const struct sfm_var_record *,
361 const struct sfm_value_label_record *);
362 static void parse_data_file_attributes (struct sfm_reader *,
363 const struct sfm_extension_record *,
364 struct dictionary *);
365 static void parse_variable_attributes (struct sfm_reader *,
366 const struct sfm_extension_record *,
367 struct dictionary *);
368 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
369 static bool parse_long_string_value_labels (struct sfm_reader *,
370 const struct sfm_extension_record *,
371 struct dictionary *);
372 static bool parse_long_string_missing_values (
373 struct sfm_reader *, const struct sfm_extension_record *,
374 struct dictionary *);
376 /* Frees the strings inside INFO. */
378 any_read_info_destroy (struct any_read_info *info)
382 free (info->creation_date);
383 free (info->creation_time);
384 free (info->product);
385 free (info->product_ext);
389 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
   successful, otherwise NULL.

   On success the value returned is the address of the new reader's embedded
   'any_reader' member.  Only the dictionary records are read here, through
   read_dictionary(); any multiple response set extension records found are
   then handed to parse_mrsets(). */
391 static struct any_reader *
392 sfm_open (struct file_handle *fh)
394 size_t allocated_mrsets = 0;
395 struct sfm_reader *r;
397 /* Create and initialize reader. */
398 r = xzalloc (sizeof *r);
399 r->any_reader.klass = &sys_file_reader_class;
400 r->pool = pool_create ();
/* R itself is registered with its own pool, with free() as the function
   passed to pool_register(). */
401 pool_register (r->pool, free, r);
/* An index equal to the size of the opcode block means that no opcodes are
   buffered yet. */
403 r->opcode_idx = sizeof r->opcodes;
405 /* TRANSLATORS: this fragment will be interpolated into
406 messages in fh_lock() that identify types of files. */
407 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
411 r->file = fn_open (fh_get_file_name (fh), "rb");
414 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
415 fh_get_file_name (r->fh), strerror (errno));
419 if (!read_dictionary (r))
/* Both kinds of multiple response set record go through the same parser and
   share ALLOCATED_MRSETS. */
422 if (r->extensions[EXT_MRSETS] != NULL)
423 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
425 if (r->extensions[EXT_MRSETS2] != NULL)
426 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
428 return &r->any_reader;
432 sfm_close (&r->any_reader);
/* Reads the dictionary portion of system file R: the file header (into
   R->info and R->header), then record data dispatched by integer record type
   through read_record(), then 4 bytes that are skipped, and finally, for
   ZLIB-compressed files only, the ZLIB header. */
437 read_dictionary (struct sfm_reader *r)
/* Allocated sizes of R->vars and R->labels, maintained by read_record(). */
439 size_t allocated_vars;
440 size_t allocated_labels;
442 if (!read_header (r, &r->info, &r->header))
446 allocated_labels = 0;
/* Each record starts with its integer type, which selects how the rest of
   the record is read. */
451 if (!read_int (r, &type))
455 if (!read_record (r, type, &allocated_vars, &allocated_labels))
459 if (!skip_bytes (r, 4))
/* The ZLIB header is read only when the file header declared ZLIB
   compression. */
462 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
/* Reads one record of the given TYPE from R.  The caller has already read
   the record type itself.  *ALLOCATED_VARS and *ALLOCATED_LABELS track the
   allocated sizes of R->vars and R->labels so that those arrays can be grown
   on demand. */
469 read_record (struct sfm_reader *r, int type,
470 size_t *allocated_vars, size_t *allocated_labels)
/* Variable record: grow R->vars if it is full, then read into the next free
   element. */
477 if (r->n_vars >= *allocated_vars)
478 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
480 return read_variable_record (r, &r->vars[r->n_vars++]);
/* Value label record: same growth scheme, applied to R->labels. */
483 if (r->n_labels >= *allocated_labels)
484 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
486 return read_value_label_record (r, &r->labels[r->n_labels++]);
489 /* A Type 4 record is always immediately after a type 3 record,
490 so the code for type 3 records reads the type 4 record too. */
491 sys_error (r, r->pos, _("Misplaced type 4 record."));
/* At most one document record is accepted per file. */
495 if (r->document != NULL)
497 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
500 r->document = read_document_record (r);
501 return r->document != NULL;
/* Extension record: its subtype selects an element of R->extensions. */
504 if (!read_int (r, &subtype))
/* A subtype with no element in R->extensions cannot be stored, so after a
   warning the record is skipped rather than read. */
507 || subtype >= sizeof r->extensions / sizeof *r->extensions)
510 _("Unrecognized record type 7, subtype %d. Please "
511 "send a copy of this file, and the syntax which "
512 "created it to %s."),
513 subtype, PACKAGE_BUGREPORT);
514 return skip_extension_record (r, subtype);
/* Only the first record of each subtype is kept; a repeat is skipped. */
516 else if (r->extensions[subtype] != NULL)
519 _("Record type 7, subtype %d found here has the same "
520 "type as the record found near offset 0x%llx. "
521 "Please send a copy of this file, and the syntax "
522 "which created it to %s."),
523 subtype, (long long int) r->extensions[subtype]->pos,
525 return skip_extension_record (r, subtype);
/* First record seen with this subtype: read it and remember it. */
528 return read_extension_record (r, subtype, &r->extensions[subtype]);
531 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
538 /* Returns the character encoding obtained from R, or a null pointer if R
539 doesn't have an indication of its character encoding. */
541 sfm_get_encoding (const struct sfm_reader *r)
543 /* The EXT_ENCODING record is the best way to determine dictionary encoding. */
545 if (r->extensions[EXT_ENCODING])
546 return r->extensions[EXT_ENCODING]->data;
548 /* But EXT_INTEGER is better than nothing as a fallback. */
549 if (r->extensions[EXT_INTEGER])
551 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
552 const char *encoding;
561 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
562 respectively. However, many files have character code 2 but data
563 which are clearly not ASCII. Therefore, ignore these values. */
570 encoding = sys_get_encoding_from_codepage (codepage);
571 if (encoding != NULL)
577 /* If the file magic number is EBCDIC then its character data is too. */
578 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
584 struct get_strings_aux
/* Appends one entry to the three parallel arrays in AUX.  TITLE is stored as
   passed in, STRING is duplicated into AUX's pool with pool_strdup(), and ID
   is recorded alongside them.  (The callers add_string() and add_id() pass
   true for ID only for identifiers, and build TITLE with pool_vasprintf()
   from the same pool.) */
595 add_string__ (struct get_strings_aux *aux,
596 const char *string, bool id, char *title)
/* Out of room: grow all three arrays to the same new size.  The formula
   2 * (allocated + 1) also yields a nonzero size when nothing has been
   allocated yet. */
598 if (aux->n >= aux->allocated)
600 aux->allocated = 2 * (aux->allocated + 1);
601 aux->titles = pool_realloc (aux->pool, aux->titles,
602 aux->allocated * sizeof *aux->titles);
603 aux->strings = pool_realloc (aux->pool, aux->strings,
604 aux->allocated * sizeof *aux->strings);
605 aux->ids = pool_realloc (aux->pool, aux->ids,
606 aux->allocated * sizeof *aux->ids);
/* Fill in element N of each array. */
609 aux->titles[aux->n] = title;
610 aux->strings[aux->n] = pool_strdup (aux->pool, string);
611 aux->ids[aux->n] = id;
615 static void PRINTF_FORMAT (3, 4)
616 add_string (struct get_strings_aux *aux,
617 const char *string, const char *title, ...)
621 va_start (args, title);
622 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
626 static void PRINTF_FORMAT (3, 4)
627 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
631 va_start (args, title);
632 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
636 /* Retrieves significant string data from R in its raw format, to allow the
637 caller to try to detect the encoding in use.
639 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
640 and *STRINGSP to an array of N elements allocated from POOL. For each I in
641 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
642 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
643 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form text. */
646 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
647 char ***titlesp, bool **idsp, char ***stringsp)
649 struct sfm_reader *r = sfm_reader_cast (r_);
650 const struct sfm_mrset *mrset;
651 struct get_strings_aux aux;
663 for (i = 0; i < r->n_vars; i++)
664 if (r->vars[i].width != -1)
665 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
668 for (i = 0; i < r->n_vars; i++)
669 if (r->vars[i].width != -1)
672 if (r->vars[i].label)
673 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
678 for (i = 0; i < r->n_labels; i++)
679 for (j = 0; j < r->labels[i].n_labels; j++)
680 add_string (&aux, r->labels[i].labels[j].label,
681 _("Value Label %zu"), k++);
683 add_string (&aux, r->header.creation_date, _("Creation Date"));
684 add_string (&aux, r->header.creation_time, _("Creation Time"));
685 add_string (&aux, r->header.eye_catcher, _("Product"));
686 add_string (&aux, r->header.file_label, _("File Label"));
688 if (r->extensions[EXT_PRODUCT_INFO])
689 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
690 _("Extra Product Info"));
696 for (i = 0; i < r->document->n_lines; i++)
700 memcpy (line, r->document->documents + i * 80, 80);
703 add_string (&aux, line, _("Document Line %zu"), i + 1);
707 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
709 size_t mrset_idx = mrset - r->mrsets + 1;
711 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
713 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
715 /* Skip the variables because they ought to be duplicates. */
718 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
723 /* data file attributes */
724 /* variable attributes */
726 /* long string value labels */
727 /* long string missing values */
729 *titlesp = aux.titles;
731 *stringsp = aux.strings;
735 /* Decodes the dictionary read from R, saving it into *DICT. Character
736 strings in R are decoded using ENCODING, or an encoding obtained from R if
737 ENCODING is null, or the locale encoding if R specifies no encoding.
739 If INFOP is non-null, then it receives additional info about the system
740 file, which the caller must eventually free with any_read_info_destroy()
741 when it is no longer needed.
743 This function consumes R. The caller must not use it again later, even to
744 destroy it with sfm_close(). */
745 static struct casereader *
746 sfm_decode (struct any_reader *r_, const char *encoding,
747 struct dictionary **dictp, struct any_read_info *infop)
749 struct sfm_reader *r = sfm_reader_cast (r_);
750 struct dictionary *dict;
753 if (encoding == NULL)
755 encoding = sfm_get_encoding (r);
756 if (encoding == NULL)
758 sys_warn (r, -1, _("This system file does not indicate its own "
759 "character encoding. Using default encoding "
760 "%s. For best results, specify an encoding "
761 "explicitly. Use SYSFILE INFO with "
762 "ENCODING=\"DETECT\" to analyze the possible "
765 encoding = locale_charset ();
769 dict = dict_create (encoding);
770 r->encoding = dict_get_encoding (dict);
772 /* These records don't use variables at all. */
773 if (r->document != NULL)
774 parse_document (dict, r->document);
776 if (r->extensions[EXT_INTEGER] != NULL
777 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
780 if (r->extensions[EXT_FLOAT] != NULL)
781 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
783 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
784 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
786 if (r->extensions[EXT_FILE_ATTRS] != NULL)
787 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
789 parse_header (r, &r->header, &r->info, dict);
791 /* Parse the variable records, the basis of almost everything else. */
792 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
795 /* Parse value labels and the weight variable immediately after the variable
796 records. These records use indexes into var_recs[], so we must parse them
797 before those indexes become invalidated by very long string variables. */
798 for (i = 0; i < r->n_labels; i++)
799 if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
801 if (r->header.weight_idx != 0)
803 struct variable *weight_var;
805 weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
806 r->header.weight_idx);
807 if (weight_var != NULL)
809 if (var_is_numeric (weight_var))
810 dict_set_weight (dict, weight_var);
812 sys_warn (r, -1, _("Ignoring string variable `%s' set "
813 "as weighting variable."),
814 var_get_name (weight_var));
818 if (r->extensions[EXT_DISPLAY] != NULL)
819 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
821 /* The following records use short names, so they need to be parsed before
822 parse_long_var_name_map() changes short names to long names. */
823 decode_mrsets (r, dict);
825 if (r->extensions[EXT_LONG_STRINGS] != NULL
826 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
829 /* Now rename variables to their long names. */
830 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
832 /* The following records use long names, so they need to follow renaming. */
833 if (r->extensions[EXT_VAR_ATTRS] != NULL)
835 parse_variable_attributes (r, r->extensions[EXT_VAR_ATTRS], dict);
837 /* Roles use the $@Role attribute. */
838 assign_variable_roles (r, dict);
841 if (r->extensions[EXT_LONG_LABELS] != NULL
842 && !parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS],
845 if (r->extensions[EXT_LONG_MISSING] != NULL
846 && !parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
850 /* Warn if the actual amount of data per case differs from the
851 amount that the header claims. SPSS version 13 gets this
852 wrong when very long strings are involved, so don't warn in that case. */
854 if (r->header.nominal_case_size != -1
855 && r->header.nominal_case_size != r->n_vars
856 && r->info.version_major != 13)
857 sys_warn (r, -1, _("File header claims %d variable positions but "
858 "%zu were read from file."),
859 r->header.nominal_case_size, r->n_vars);
861 /* Create an index of dictionary variable widths for
862 sfm_read_case to use. We cannot use the `struct variable's
863 from the dictionary we created, because the caller owns the
864 dictionary and may destroy or modify its variables. */
865 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
866 pool_register (r->pool, free, r->sfm_vars);
867 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
873 memset (&r->info, 0, sizeof r->info);
876 return casereader_create_sequential
878 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
879 &sys_file_casereader_class, r);
888 /* Closes R, which should have been returned by sfm_open() but not already
889 closed with sfm_decode() or this function.
890 Returns true if an I/O error has occurred on READER, false
893 sfm_close (struct any_reader *r_)
895 struct sfm_reader *r = sfm_reader_cast (r_);
900 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
902 msg (ME, _("Error closing system file `%s': %s."),
903 fh_get_file_name (r->fh), strerror (errno));
909 any_read_info_destroy (&r->info);
914 pool_destroy (r->pool);
919 /* Destroys READER. */
921 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
923 struct sfm_reader *r = r_;
924 sfm_close (&r->any_reader);
927 /* Returns 1 if FILE is an SPSS system file, 0 if it is not,
   otherwise a negative errno value.

   FILE is repositioned to its beginning, and its first 4 bytes are compared
   against the three magic strings this reader knows about. */
931 sfm_detect (FILE *file)
/* Always examine the start of the file, wherever FILE was positioned. */
935 if (fseek (file, 0, SEEK_SET) != 0)
/* Hitting end of file before 4 bytes are read just means "not a system
   file" (0); any other failed read is reported as -errno. */
937 if (fread (magic, 4, 1, file) != 1)
938 return feof (file) ? 0 : -errno;
/* The comparison yields 1 if any known magic matches, 0 otherwise. */
941 return (!strcmp (ASCII_MAGIC, magic)
942 || !strcmp (ASCII_ZMAGIC, magic)
943 || !strcmp (EBCDIC_MAGIC, magic));
946 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
   except for the string fields in *INFO, which parse_header() will initialize
   later once the file's encoding is known.

   Along the way this also determines R's integer format, floating-point
   format, compression type, compression bias, and case count. */
950 read_header (struct sfm_reader *r, struct any_read_info *info,
951 struct sfm_header_record *header)
953 uint8_t raw_layout_code[4];
/* The magic number and the eye-catcher are the first two header fields. */
958 if (!read_string (r, header->magic, sizeof header->magic)
959 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
/* The plain ASCII and EBCDIC magic numbers share one branch and the ZLIB
   magic number has its own; anything else is rejected. */
962 if (!strcmp (ASCII_MAGIC, header->magic)
963 || !strcmp (EBCDIC_MAGIC, header->magic))
965 else if (!strcmp (ASCII_ZMAGIC, header->magic))
969 sys_error (r, 0, _("This is not an SPSS system file."));
973 /* Identify integer format. */
974 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
/* The layout code must decode as either 2 or 3, and the format detected for
   it must be INTEGER_MSB_FIRST or INTEGER_LSB_FIRST. */
976 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
978 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
980 || (r->integer_format != INTEGER_MSB_FIRST
981 && r->integer_format != INTEGER_LSB_FIRST))
983 sys_error (r, 64, _("This is not an SPSS system file."));
987 if (!read_int (r, &header->nominal_case_size))
/* A negative or implausibly large nominal case size is replaced by -1. */
990 if (header->nominal_case_size < 0
991 || header->nominal_case_size > INT_MAX / 16)
992 header->nominal_case_size = -1;
/* The compression code is validated in two groups: 0 (none) or 1 (simple) in
   the first, and only 2 (ZLIB) in the second, whose error message identifies
   it as the ZLIB-compressed case. */
994 if (!read_int (r, &compressed))
999 r->compression = ANY_COMP_NONE;
1000 else if (compressed == 1)
1001 r->compression = ANY_COMP_SIMPLE;
1002 else if (compressed != 0)
1004 sys_error (r, 0, "System file header has invalid compression "
1005 "value %d.", compressed);
1011 if (compressed == 2)
1012 r->compression = ANY_COMP_ZLIB;
1015 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1016 "compression value %d.", compressed);
1021 if (!read_int (r, &header->weight_idx))
1024 if (!read_int (r, &r->case_cnt))
1026 if ( r->case_cnt > INT_MAX / 2)
1029 /* Identify floating-point format and obtain compression bias. */
1030 if (!read_bytes (r, raw_bias, sizeof raw_bias))
/* float_identify() is given the expected value 100.0 and the raw bytes; a
   result of 0 is handled below as "format not recognized". */
1032 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1034 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1036 if (memcmp (raw_bias, zero_bias, 8))
1037 sys_warn (r, r->pos - 8,
1038 _("Compression bias is not the usual "
1039 "value of 100, or system file uses unrecognized "
1040 "floating-point format."));
1043 /* Some software is known to write all-zeros to this
1044 field. Such software also writes floating-point
1045 numbers in the format that we expect by default
1046 (it seems that all software most likely does, in
1047 reality), so don't warn in this case. */
/* Fall back to IEEE double in the byte order that matches the integer
   format. */
1050 if (r->integer_format == INTEGER_MSB_FIRST)
1051 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1053 r->float_format = FLOAT_IEEE_DOUBLE_LE;
/* Decode the bias itself into a native double. */
1055 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
/* The remaining string fields are kept raw in *HEADER; 3 more bytes are
   skipped after them. */
1057 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1058 || !read_string (r, header->creation_time, sizeof header->creation_time)
1059 || !read_string (r, header->file_label, sizeof header->file_label)
1060 || !skip_bytes (r, 3))
/* Copy the non-string details into *INFO. */
1063 info->integer_format = r->integer_format;
1064 info->float_format = r->float_format;
1065 info->compression = r->compression;
1066 info->case_cnt = r->case_cnt;
1071 /* Reads a variable (type 2) record from R into RECORD.

   RECORD is zeroed first.  Storage for a variable label, if the record has
   one, is allocated from R's pool. */
1073 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1075 int has_variable_label;
1077 memset (record, 0, sizeof *record);
/* Remember where the record starts; the error messages below report this
   position. */
1079 record->pos = r->pos;
1080 if (!read_int (r, &record->width)
1081 || !read_int (r, &has_variable_label)
1082 || !read_int (r, &record->missing_value_code)
1083 || !read_int (r, &record->print_format)
1084 || !read_int (r, &record->write_format)
1085 || !read_string (r, record->name, sizeof record->name))
/* The label indicator must be exactly 0 or 1; 1 means a length-prefixed
   label follows. */
1088 if (has_variable_label == 1)
1090 enum { MAX_LABEL_LEN = 65536 };
1091 unsigned int len, read_len;
1093 if (!read_uint (r, &len))
1096 /* Read up to MAX_LABEL_LEN bytes of label. */
1097 read_len = MIN (MAX_LABEL_LEN, len);
/* One byte more than READ_LEN is allocated and passed to read_string()
   (presumably room for a terminating null -- confirm against
   read_string()). */
1098 record->label = pool_malloc (r->pool, read_len + 1);
1099 if (!read_string (r, record->label, read_len + 1))
1102 /* Skip unread label bytes. */
1103 if (!skip_bytes (r, len - read_len))
1106 /* Skip label padding up to multiple of 4 bytes. */
1107 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1110 else if (has_variable_label != 0)
1112 sys_error (r, record->pos,
1113 _("Variable label indicator field is not 0 or 1."));
1117 /* Set missing values. */
1118 if (record->missing_value_code != 0)
1120 int code = record->missing_value_code;
/* Width 0 is validated as a numeric variable and any other width as a
   string variable, as the two error messages show. */
1121 if (record->width == 0)
1123 if (code < -3 || code > 3 || code == -1)
1125 sys_error (r, record->pos,
1126 _("Numeric missing value indicator field is not "
1127 "-3, -2, 0, 1, 2, or 3."));
1133 if (code < 1 || code > 3)
1135 sys_error (r, record->pos,
1136 _("String missing value indicator field is not "
/* The missing value data is 8 bytes times the magnitude of the code. */
1142 if (!read_bytes (r, record->missing, 8 * abs (code)))
1149 /* Reads value labels from R into RECORD. */
1151 read_value_label_record (struct sfm_reader *r,
1152 struct sfm_value_label_record *record)
1157 /* Read type 3 record. */
1158 record->pos = r->pos;
1159 if (!read_uint (r, &record->n_labels))
1161 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1163 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1167 record->labels = pool_nmalloc (r->pool, record->n_labels,
1168 sizeof *record->labels);
1169 for (i = 0; i < record->n_labels; i++)
1171 struct sfm_value_label *label = &record->labels[i];
1172 unsigned char label_len;
1175 if (!read_bytes (r, label->value, sizeof label->value))
1178 /* Read label length. */
1179 if (!read_bytes (r, &label_len, sizeof label_len))
1181 padded_len = ROUND_UP (label_len + 1, 8);
1183 /* Read label, padding. */
1184 label->label = pool_malloc (r->pool, padded_len + 1);
1185 if (!read_bytes (r, label->label, padded_len - 1))
1187 label->label[label_len] = '\0';
1190 /* Read record type of type 4 record. */
1191 if (!read_int (r, &type))
1195 sys_error (r, r->pos - 4,
1196 _("Variable index record (type 4) does not immediately "
1197 "follow value label record (type 3) as it should."));
1201 /* Read number of variables associated with value label from type 4
1203 if (!read_uint (r, &record->n_vars))
1205 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1207 sys_error (r, r->pos - 4,
1208 _("Number of variables associated with a value label (%u) "
1209 "is not between 1 and the number of variables (%zu)."),
1210 record->n_vars, r->n_vars);
1214 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1215 for (i = 0; i < record->n_vars; i++)
1216 if (!read_int (r, &record->vars[i]))
1222 /* Reads a document record from R and returns it. */
1223 static struct sfm_document_record *
1224 read_document_record (struct sfm_reader *r)
1226 struct sfm_document_record *record;
1229 record = pool_malloc (r->pool, sizeof *record);
1230 record->pos = r->pos;
1232 if (!read_int (r, &n_lines))
1234 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1236 sys_error (r, record->pos,
1237 _("Number of document lines (%d) "
1238 "must be greater than 0 and less than %d."),
1239 n_lines, INT_MAX / DOC_LINE_LENGTH);
1243 record->n_lines = n_lines;
1244 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1245 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1252 read_extension_record_header (struct sfm_reader *r, int subtype,
1253 struct sfm_extension_record *record)
1255 record->subtype = subtype;
1256 record->pos = r->pos;
1257 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1260 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1261 allows an extra byte for a null terminator, used by some
1262 extension processing routines. */
1263 if (record->size != 0
1264 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1266 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1274 /* Reads an extension record from R into RECORD. */
1276 read_extension_record (struct sfm_reader *r, int subtype,
1277 struct sfm_extension_record **recordp)
1279 struct extension_record_type
1286 static const struct extension_record_type types[] =
1288 /* Implemented record types. */
1289 { EXT_INTEGER, 4, 8 },
1290 { EXT_FLOAT, 8, 3 },
1291 { EXT_MRSETS, 1, 0 },
1292 { EXT_PRODUCT_INFO, 1, 0 },
1293 { EXT_DISPLAY, 4, 0 },
1294 { EXT_LONG_NAMES, 1, 0 },
1295 { EXT_LONG_STRINGS, 1, 0 },
1296 { EXT_NCASES, 8, 2 },
1297 { EXT_FILE_ATTRS, 1, 0 },
1298 { EXT_VAR_ATTRS, 1, 0 },
1299 { EXT_MRSETS2, 1, 0 },
1300 { EXT_ENCODING, 1, 0 },
1301 { EXT_LONG_LABELS, 1, 0 },
1302 { EXT_LONG_MISSING, 1, 0 },
1304 /* Ignored record types. */
1305 { EXT_VAR_SETS, 0, 0 },
1307 { EXT_DATA_ENTRY, 0, 0 },
1308 { EXT_DATAVIEW, 0, 0 },
1311 const struct extension_record_type *type;
1312 struct sfm_extension_record *record;
1316 record = pool_malloc (r->pool, sizeof *record);
1317 if (!read_extension_record_header (r, subtype, record))
1319 n_bytes = record->count * record->size;
1321 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1322 if (subtype == type->subtype)
1324 if (type->size > 0 && record->size != type->size)
1325 sys_warn (r, record->pos,
1326 _("Record type 7, subtype %d has bad size %u "
1327 "(expected %d)."), subtype, record->size, type->size);
1328 else if (type->count > 0 && record->count != type->count)
1329 sys_warn (r, record->pos,
1330 _("Record type 7, subtype %d has bad count %u "
1331 "(expected %d)."), subtype, record->count, type->count);
1332 else if (type->count == 0 && type->size == 0)
1334 /* Ignore this record. */
1338 char *data = pool_malloc (r->pool, n_bytes + 1);
1339 data[n_bytes] = '\0';
1341 record->data = data;
1342 if (!read_bytes (r, record->data, n_bytes))
1351 sys_warn (r, record->pos,
1352 _("Unrecognized record type 7, subtype %d. Please send a "
1353 "copy of this file, and the syntax which created it to %s."),
1354 subtype, PACKAGE_BUGREPORT);
1357 return skip_bytes (r, n_bytes);
1361 skip_extension_record (struct sfm_reader *r, int subtype)
1363 struct sfm_extension_record record;
1365 return (read_extension_record_header (r, subtype, &record)
1366 && skip_bytes (r, record.count * record.size));
1370 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1371 struct any_read_info *info, struct dictionary *dict)
1373 const char *dict_encoding = dict_get_encoding (dict);
1374 struct substring product;
1375 struct substring label;
1378 /* Convert file label to UTF-8 and put it into DICT. */
1379 label = recode_substring_pool ("UTF-8", dict_encoding,
1380 ss_cstr (header->file_label), r->pool);
1381 ss_trim (&label, ss_cstr (" "));
1382 label.string[label.length] = '\0';
1383 fixed_label = fix_line_ends (label.string);
1384 dict_set_label (dict, fixed_label);
1387 /* Put creation date and time in UTF-8 into INFO. */
1388 info->creation_date = recode_string ("UTF-8", dict_encoding,
1389 header->creation_date, -1);
1390 info->creation_time = recode_string ("UTF-8", dict_encoding,
1391 header->creation_time, -1);
1393 /* Put product name into INFO, dropping eye-catcher string if present. */
1394 product = recode_substring_pool ("UTF-8", dict_encoding,
1395 ss_cstr (header->eye_catcher), r->pool);
1396 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1397 ss_trim (&product, ss_cstr (" "));
1398 info->product = ss_xstrdup (product);
1401 /* Reads a variable (type 2) record from R and adds the
1402 corresponding variable to DICT.
1403 Also skips past additional variable records for long string
1406 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1407 struct sfm_var_record *var_recs, size_t n_var_recs)
1409 const char *dict_encoding = dict_get_encoding (dict);
1410 struct sfm_var_record *rec;
1413 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1415 struct variable *var;
1420 name = recode_string_pool ("UTF-8", dict_encoding,
1421 rec->name, -1, r->pool);
1422 name[strcspn (name, " ")] = '\0';
1424 if (!dict_id_is_valid (dict, name, false)
1425 || name[0] == '$' || name[0] == '#')
1427 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1431 if (rec->width < 0 || rec->width > 255)
1433 sys_error (r, rec->pos,
1434 _("Bad width %d for variable %s."), rec->width, name);
1438 var = rec->var = dict_create_var (dict, name, rec->width);
1441 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1442 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1445 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1449 /* Set the short name the same as the long name. */
1450 var_set_short_name (var, 0, name);
1452 /* Get variable label, if any. */
1457 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1458 rec->label, -1, r->pool);
1459 var_set_label (var, utf8_label);
1462 /* Set missing values. */
1463 if (rec->missing_value_code != 0)
1465 int width = var_get_width (var);
1466 struct missing_values mv;
1468 mv_init_pool (r->pool, &mv, width);
1469 if (var_is_numeric (var))
1471 bool has_range = rec->missing_value_code < 0;
1472 int n_discrete = (has_range
1473 ? rec->missing_value_code == -3
1474 : rec->missing_value_code);
1479 double low = parse_float (r, rec->missing, 0);
1480 double high = parse_float (r, rec->missing, 8);
1482 /* Deal with SPSS 21 change in representation. */
1486 mv_add_range (&mv, low, high);
1490 for (i = 0; i < n_discrete; i++)
1492 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1497 for (i = 0; i < rec->missing_value_code; i++)
1498 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1499 var_set_missing_values (var, &mv);
1503 parse_format_spec (r, rec->pos + 12, rec->print_format,
1504 PRINT_FORMAT, var, &n_warnings);
1505 parse_format_spec (r, rec->pos + 16, rec->write_format,
1506 WRITE_FORMAT, var, &n_warnings);
1508 /* Account for values.
1509 Skip long string continuation records, if any. */
1510 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1511 for (i = 1; i < n_values; i++)
1512 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1514 sys_error (r, rec->pos, _("Missing string continuation record."));
1523 /* Translates the format spec from sysfile format to internal
1526 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1527 enum which_format which, struct variable *v,
1530 const int max_warnings = 8;
1531 uint8_t raw_type = format >> 16;
1532 uint8_t w = format >> 8;
1541 ok = (fmt_from_io (raw_type, &f.type)
1542 && fmt_check_output (&f)
1543 && fmt_check_width_compat (&f, var_get_width (v)));
1548 if (which == PRINT_FORMAT)
1549 var_set_print_format (v, &f);
1551 var_set_write_format (v, &f);
1553 else if (format == 0)
1555 /* Actually observed in the wild. No point in warning about it. */
1557 else if (++*n_warnings <= max_warnings)
1559 if (which == PRINT_FORMAT)
1560 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1562 var_get_name (v), var_get_width (v), format);
1564 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1566 var_get_name (v), var_get_width (v), format);
1568 if (*n_warnings == max_warnings)
1569 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1574 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1578 for (p = record->documents;
1579 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1580 p += DOC_LINE_LENGTH)
1582 struct substring line;
1584 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1585 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1586 ss_rtrim (&line, ss_cstr (" "));
1587 line.string[line.length] = '\0';
1589 dict_add_document_line (dict, line.string, false);
1595 /* Parses record type 7, subtype 3. */
1597 parse_machine_integer_info (struct sfm_reader *r,
1598 const struct sfm_extension_record *record,
1599 struct any_read_info *info)
1601 int float_representation, expected_float_format;
1602 int integer_representation, expected_integer_format;
1604 /* Save version info. */
1605 info->version_major = parse_int (r, record->data, 0);
1606 info->version_minor = parse_int (r, record->data, 4);
1607 info->version_revision = parse_int (r, record->data, 8);
1609 /* Check floating point format. */
1610 float_representation = parse_int (r, record->data, 16);
1611 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1612 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1613 expected_float_format = 1;
1614 else if (r->float_format == FLOAT_Z_LONG)
1615 expected_float_format = 2;
1616 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1617 expected_float_format = 3;
1620 if (float_representation != expected_float_format)
1622 sys_error (r, record->pos,
1623 _("Floating-point representation indicated by "
1624 "system file (%d) differs from expected (%d)."),
1625 float_representation, expected_float_format);
1629 /* Check integer format. */
1630 integer_representation = parse_int (r, record->data, 24);
1631 if (r->integer_format == INTEGER_MSB_FIRST)
1632 expected_integer_format = 1;
1633 else if (r->integer_format == INTEGER_LSB_FIRST)
1634 expected_integer_format = 2;
1637 if (integer_representation != expected_integer_format)
1638 sys_warn (r, record->pos,
1639 _("Integer format indicated by system file (%d) "
1640 "differs from expected (%d)."),
1641 integer_representation, expected_integer_format);
1646 /* Parses record type 7, subtype 4. */
1648 parse_machine_float_info (struct sfm_reader *r,
1649 const struct sfm_extension_record *record)
1651 double sysmis = parse_float (r, record->data, 0);
1652 double highest = parse_float (r, record->data, 8);
1653 double lowest = parse_float (r, record->data, 16);
1655 if (sysmis != SYSMIS)
1656 sys_warn (r, record->pos,
1657 _("File specifies unexpected value %g (%a) as %s, "
1658 "instead of %g (%a)."),
1659 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1661 if (highest != HIGHEST)
1662 sys_warn (r, record->pos,
1663 _("File specifies unexpected value %g (%a) as %s, "
1664 "instead of %g (%a)."),
1665 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1667 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1668 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1669 appears in a context (missing values) where SYSMIS cannot. */
1670 if (lowest != LOWEST && lowest != SYSMIS)
1671 sys_warn (r, record->pos,
1672 _("File specifies unexpected value %g (%a) as %s, "
1673 "instead of %g (%a) or %g (%a)."),
1674 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1677 /* Parses record type 7, subtype 10. */
1679 parse_extra_product_info (struct sfm_reader *r,
1680 const struct sfm_extension_record *record,
1681 struct any_read_info *info)
1683 struct text_record *text;
1685 text = open_text_record (r, record, true);
1686 info->product_ext = fix_line_ends (text_get_all (text));
1687 close_text_record (r, text);
1690 /* Parses record type 7, subtype 7 or 19. */
1692 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1693 size_t *allocated_mrsets)
1695 struct text_record *text;
1697 text = open_text_record (r, record, false);
1700 struct sfm_mrset *mrset;
1701 size_t allocated_vars;
1704 /* Skip extra line feeds if present. */
1705 while (text_match (text, '\n'))
1708 if (r->n_mrsets >= *allocated_mrsets)
1709 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1711 mrset = &r->mrsets[r->n_mrsets];
1712 memset(mrset, 0, sizeof *mrset);
1714 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1715 if (mrset->name == NULL)
1718 if (text_match (text, 'C'))
1720 mrset->type = MRSET_MC;
1721 if (!text_match (text, ' '))
1723 sys_warn (r, record->pos,
1724 _("Missing space following `%c' at offset %zu "
1725 "in MRSETS record."), 'C', text_pos (text));
1729 else if (text_match (text, 'D'))
1731 mrset->type = MRSET_MD;
1732 mrset->cat_source = MRSET_VARLABELS;
1734 else if (text_match (text, 'E'))
1738 mrset->type = MRSET_MD;
1739 mrset->cat_source = MRSET_COUNTEDVALUES;
1740 if (!text_match (text, ' '))
1742 sys_warn (r, record->pos,
1743 _("Missing space following `%c' at offset %zu "
1744 "in MRSETS record."), 'E', text_pos (text));
1748 number = text_get_token (text, ss_cstr (" "), NULL);
1749 if (!strcmp (number, "11"))
1750 mrset->label_from_var_label = true;
1751 else if (strcmp (number, "1"))
1752 sys_warn (r, record->pos,
1753 _("Unexpected label source value following `E' "
1754 "at offset %zu in MRSETS record."),
1759 sys_warn (r, record->pos,
1760 _("Missing `C', `D', or `E' at offset %zu "
1761 "in MRSETS record."),
1766 if (mrset->type == MRSET_MD)
1768 mrset->counted = text_parse_counted_string (r, text);
1769 if (mrset->counted == NULL)
1773 mrset->label = text_parse_counted_string (r, text);
1774 if (mrset->label == NULL)
1782 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1785 if (delimiter != '\n')
1786 sys_warn (r, record->pos,
1787 _("Missing new-line parsing variable names "
1788 "at offset %zu in MRSETS record."),
1793 if (mrset->n_vars >= allocated_vars)
1794 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1796 sizeof *mrset->vars);
1797 mrset->vars[mrset->n_vars++] = var;
1799 while (delimiter != '\n');
1803 close_text_record (r, text);
1807 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1809 const struct sfm_mrset *s;
1811 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1813 struct stringi_set var_names;
1814 struct mrset *mrset;
1819 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1822 sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
1829 mrset = xzalloc (sizeof *mrset);
1831 mrset->type = s->type;
1832 mrset->cat_source = s->cat_source;
1833 mrset->label_from_var_label = s->label_from_var_label;
1834 if (s->label[0] != '\0')
1835 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
1837 stringi_set_init (&var_names);
1838 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1840 for (i = 0; i < s->n_vars; i++)
1842 struct variable *var;
1845 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1847 var = dict_lookup_var (dict, var_name);
1853 if (!stringi_set_insert (&var_names, var_name))
1856 _("MRSET %s contains duplicate variable name %s."),
1857 mrset->name, var_name);
1863 if (mrset->label == NULL && mrset->label_from_var_label
1864 && var_has_label (var))
1865 mrset->label = xstrdup (var_get_label (var));
1868 && var_get_type (var) != var_get_type (mrset->vars[0]))
1871 _("MRSET %s contains both string and "
1872 "numeric variables."), mrset->name);
1875 width = MIN (width, var_get_width (var));
1877 mrset->vars[mrset->n_vars++] = var;
1880 if (mrset->n_vars < 2)
1882 if (mrset->n_vars == 0)
1883 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1885 sys_warn (r, -1, _("MRSET %s has only one variable."),
1887 mrset_destroy (mrset);
1888 stringi_set_destroy (&var_names);
1892 if (mrset->type == MRSET_MD)
1894 mrset->width = width;
1895 value_init (&mrset->counted, width);
1897 mrset->counted.f = c_strtod (s->counted, NULL);
1899 value_copy_str_rpad (&mrset->counted, width,
1900 (const uint8_t *) s->counted, ' ');
1903 dict_add_mrset (dict, mrset);
1904 stringi_set_destroy (&var_names);
1908 /* Read record type 7, subtype 11, which specifies how variables
1909 should be displayed in GUI environments. */
1911 parse_display_parameters (struct sfm_reader *r,
1912 const struct sfm_extension_record *record,
1913 struct dictionary *dict)
1915 bool includes_width;
1916 bool warned = false;
1921 n_vars = dict_get_var_cnt (dict);
1922 if (record->count == 3 * n_vars)
1923 includes_width = true;
1924 else if (record->count == 2 * n_vars)
1925 includes_width = false;
1928 sys_warn (r, record->pos,
1929 _("Extension 11 has bad count %u (for %zu variables)."),
1930 record->count, n_vars);
1935 for (i = 0; i < n_vars; ++i)
1937 struct variable *v = dict_get_var (dict, i);
1938 int measure, width, align;
1940 measure = parse_int (r, record->data, ofs);
1945 width = parse_int (r, record->data, ofs);
1951 align = parse_int (r, record->data, ofs);
1954 /* SPSS sometimes seems to set variables' measure to zero. */
1958 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1961 sys_warn (r, record->pos,
1962 _("Invalid variable display parameters for variable "
1963 "%zu (%s). Default parameters substituted."),
1964 i, var_get_name (v));
1969 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1970 : measure == 2 ? MEASURE_ORDINAL
1972 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1973 : align == 1 ? ALIGN_RIGHT
1976 /* Older versions (SPSS 9.0) sometimes set the display
1977 width to zero. This causes confusion in the GUI, so
1978 only set the width if it is nonzero. */
1980 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping VAR's short names.

   Renaming a variable may clear its short names, so they are copied
   beforehand and set again afterward. */
static void
rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
                                 const char *new_name)
{
  size_t n_saved = var_get_short_name_cnt (var);
  char **saved = xnmalloc (n_saved, sizeof *saved);
  size_t k;

  /* Copy the short names; an unset short name stays a null pointer. */
  for (k = 0; k < n_saved; k++)
    {
      const char *short_name = var_get_short_name (var, k);

      if (short_name != NULL)
        saved[k] = xstrdup (short_name);
      else
        saved[k] = NULL;
    }

  /* Set long name. */
  dict_rename_var (dict, var, new_name);

  /* Put the short names back and release the copies. */
  for (k = 0; k < n_saved; k++)
    {
      var_set_short_name (var, k, saved[k]);
      free (saved[k]);
    }
  free (saved);
}
2015 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2016 to each short name. Modifies variable names in DICT accordingly. */
2018 parse_long_var_name_map (struct sfm_reader *r,
2019 const struct sfm_extension_record *record,
2020 struct dictionary *dict)
2022 struct text_record *text;
2023 struct variable *var;
2028 /* There are no long variable names. Use the short variable names,
2029 converted to lowercase, as the long variable names. */
2032 for (i = 0; i < dict_get_var_cnt (dict); i++)
2034 struct variable *var = dict_get_var (dict, i);
2037 new_name = utf8_to_lower (var_get_name (var));
2038 rename_var_and_save_short_names (dict, var, new_name);
2045 /* Rename each of the variables, one by one. (In a correctly constructed
2046 system file, this cannot create any intermediate duplicate variable names,
2047 because all of the new variable names are longer than any of the old
2048 variable names and thus there cannot be any overlaps.) */
2049 text = open_text_record (r, record, true);
2050 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2052 /* Validate long name. */
2053 if (!dict_id_is_valid (dict, long_name, false))
2055 sys_warn (r, record->pos,
2056 _("Long variable mapping from %s to invalid "
2057 "variable name `%s'."),
2058 var_get_name (var), long_name);
2062 /* Identify any duplicates. */
2063 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
2064 && dict_lookup_var (dict, long_name) != NULL)
2066 sys_warn (r, record->pos,
2067 _("Duplicate long variable name `%s'."), long_name);
2071 rename_var_and_save_short_names (dict, var, long_name);
2073 close_text_record (r, text);
2076 /* Reads record type 7, subtype 14, which gives the real length
2077 of each very long string. Rearranges DICT accordingly. */
2079 parse_long_string_map (struct sfm_reader *r,
2080 const struct sfm_extension_record *record,
2081 struct dictionary *dict)
2083 struct text_record *text;
2084 struct variable *var;
2087 text = open_text_record (r, record, true);
2088 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2090 size_t idx = var_get_dict_index (var);
2096 length = strtol (length_s, NULL, 10);
2097 if (length < 1 || length > MAX_STRING)
2099 sys_warn (r, record->pos,
2100 _("%s listed as string of invalid length %s "
2101 "in very long string record."),
2102 var_get_name (var), length_s);
2106 /* Check segments. */
2107 segment_cnt = sfm_width_to_segments (length);
2108 if (segment_cnt == 1)
2110 sys_warn (r, record->pos,
2111 _("%s listed in very long string record with width %s, "
2112 "which requires only one segment."),
2113 var_get_name (var), length_s);
2116 if (idx + segment_cnt > dict_get_var_cnt (dict))
2118 sys_error (r, record->pos,
2119 _("Very long string %s overflows dictionary."),
2120 var_get_name (var));
2124 /* Get the short names from the segments and check their
2126 for (i = 0; i < segment_cnt; i++)
2128 struct variable *seg = dict_get_var (dict, idx + i);
2129 int alloc_width = sfm_segment_alloc_width (length, i);
2130 int width = var_get_width (seg);
2133 var_set_short_name (var, i, var_get_short_name (seg, 0));
2134 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2136 sys_error (r, record->pos,
2137 _("Very long string with width %ld has segment %d "
2138 "of width %d (expected %d)."),
2139 length, i, width, alloc_width);
2143 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2144 var_set_width (var, length);
2146 close_text_record (r, text);
2147 dict_compact_values (dict);
2153 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
2154 const struct sfm_var_record *var_recs, size_t n_var_recs,
2155 const struct sfm_value_label_record *record)
2157 struct variable **vars;
2161 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2162 for (i = 0; i < record->n_labels; i++)
2163 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2164 record->labels[i].label, -1,
2167 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
2168 for (i = 0; i < record->n_vars; i++)
2170 vars[i] = lookup_var_by_index (r, record->pos,
2171 var_recs, n_var_recs, record->vars[i]);
2172 if (vars[i] == NULL)
2176 for (i = 1; i < record->n_vars; i++)
2177 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2179 sys_error (r, record->pos,
2180 _("Variables associated with value label are not all of "
2181 "identical type. Variable %s is %s, but variable "
2183 var_get_name (vars[0]),
2184 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2185 var_get_name (vars[i]),
2186 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2190 for (i = 0; i < record->n_vars; i++)
2192 struct variable *var = vars[i];
2196 width = var_get_width (var);
2199 sys_error (r, record->pos,
2200 _("Value labels may not be added to long string "
2201 "variables (e.g. %s) using records types 3 and 4."),
2202 var_get_name (var));
2206 for (j = 0; j < record->n_labels; j++)
2208 struct sfm_value_label *label = &record->labels[j];
2211 value_init (&value, width);
2213 value.f = parse_float (r, label->value, 0);
2215 memcpy (value_str_rw (&value, width), label->value, width);
2217 if (!var_add_value_label (var, &value, utf8_labels[j]))
2219 if (var_is_numeric (var))
2220 sys_warn (r, record->pos,
2221 _("Duplicate value label for %g on %s."),
2222 value.f, var_get_name (var));
2224 sys_warn (r, record->pos,
2225 _("Duplicate value label for `%.*s' on %s."),
2226 width, value_str (&value, width),
2227 var_get_name (var));
2230 value_destroy (&value, width);
2234 pool_free (r->pool, vars);
2235 for (i = 0; i < record->n_labels; i++)
2236 pool_free (r->pool, utf8_labels[i]);
2237 pool_free (r->pool, utf8_labels);
2242 static struct variable *
2243 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2244 const struct sfm_var_record *var_recs, size_t n_var_recs,
2247 const struct sfm_var_record *rec;
2249 if (idx < 1 || idx > n_var_recs)
2251 sys_error (r, offset,
2252 _("Variable index %d not in valid range 1...%zu."),
2257 rec = &var_recs[idx - 1];
2258 if (rec->var == NULL)
2260 sys_error (r, offset,
2261 _("Variable index %d refers to long string continuation."),
/* Parses a set of custom attributes from TEXT into ATTRS.

   Each attribute is a key followed by `(', then one or more values that
   each end in a new-line, then `)'.  Attributes keep being parsed for as
   long as each one is followed by `/'.

   ATTRS may be a null pointer, in which case the attributes are read but
   discarded. */
static void
parse_attributes (struct sfm_reader *r, struct text_record *text,
                  struct attrset *attrs)
{
  for (;;)
    {
      struct attribute *attr;
      char *key;
      int value_no;

      /* Parse the key.  No key means there is nothing more to parse. */
      key = text_get_token (text, ss_cstr ("("), NULL);
      if (key == NULL)
        return;

      attr = attribute_create (key);
      value_no = 1;
      for (;;)
        {
          char *value;
          size_t n_chars;

          /* Parse the value. */
          value = text_get_token (text, ss_cstr ("\n"), NULL);
          if (value == NULL)
            {
              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
                         key, value_no);
              attribute_destroy (attr);
              return;
            }

          /* Values are expected in single quotes.  Strip them if present;
             otherwise warn and keep the value as it is. */
          n_chars = strlen (value);
          if (n_chars >= 2 && value[0] == '\'' && value[n_chars - 1] == '\'')
            {
              value[n_chars - 1] = '\0';
              attribute_add_value (attr, value + 1);
            }
          else
            {
              text_warn (r, text,
                         _("Attribute value %s[%d] is not quoted: %s."),
                         key, value_no, value);
              attribute_add_value (attr, value);
            }

          /* Was this the last value for this attribute? */
          if (text_match (text, ')'))
            break;
          value_no++;
        }

      if (attrs == NULL)
        attribute_destroy (attr);
      else
        attrset_add (attrs, attr);

      /* Another attribute follows only after a `/'. */
      if (text_match (text, '/'))
        break;
    }
}
/* Reads record type 7, subtype 17, which lists custom attributes on the
   data file, and adds them to DICT's own attribute set. */
static void
parse_data_file_attributes (struct sfm_reader *r,
                            const struct sfm_extension_record *record,
                            struct dictionary *dict)
{
  struct attrset *file_attrs;
  struct text_record *text;

  text = open_text_record (r, record, true);
  file_attrs = dict_get_attributes (dict);
  parse_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
/* Parses record type 7, subtype 18, which lists custom attributes on
   individual variables.

   Each entry is a variable name ended by `:' and followed by that
   variable's attributes.  When the name lookup succeeds but yields no
   variable, the attributes are still parsed, to step past them, but are
   discarded. */
static void
parse_variable_attributes (struct sfm_reader *r,
                           const struct sfm_extension_record *record,
                           struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, record, true);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      struct attrset *target = NULL;

      if (var != NULL)
        target = var_get_attributes (var);
      parse_attributes (r, text, target);
    }
  close_text_record (r, text);
}
2357 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2359 size_t n_warnings = 0;
2362 for (i = 0; i < dict_get_var_cnt (dict); i++)
2364 struct variable *var = dict_get_var (dict, i);
2365 struct attrset *attrs = var_get_attributes (var);
2366 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2369 int value = atoi (attribute_get_value (attr, 0));
2391 role = ROLE_PARTITION;
2400 if (n_warnings++ == 0)
2401 sys_warn (r, -1, _("Invalid role for variable %s."),
2402 var_get_name (var));
2405 var_set_role (var, role);
2410 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2415 check_overflow (struct sfm_reader *r,
2416 const struct sfm_extension_record *record,
2417 size_t ofs, size_t length)
2419 size_t end = record->size * record->count;
2420 if (length >= end || ofs + length > end)
2422 sys_error (r, record->pos + end,
2423 _("Extension record subtype %d ends unexpectedly."),
2431 parse_long_string_value_labels (struct sfm_reader *r,
2432 const struct sfm_extension_record *record,
2433 struct dictionary *dict)
2435 const char *dict_encoding = dict_get_encoding (dict);
2436 size_t end = record->size * record->count;
2443 struct variable *var;
2448 /* Parse variable name length. */
2449 if (!check_overflow (r, record, ofs, 4))
2451 var_name_len = parse_int (r, record->data, ofs);
2454 /* Parse variable name, width, and number of labels. */
2455 if (!check_overflow (r, record, ofs, var_name_len + 8))
2457 var_name = recode_string_pool ("UTF-8", dict_encoding,
2458 (const char *) record->data + ofs,
2459 var_name_len, r->pool);
2460 width = parse_int (r, record->data, ofs + var_name_len);
2461 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2462 ofs += var_name_len + 8;
2464 /* Look up 'var' and validate. */
2465 var = dict_lookup_var (dict, var_name);
2467 sys_warn (r, record->pos + ofs,
2468 _("Ignoring long string value label record for "
2469 "unknown variable %s."), var_name);
2470 else if (var_is_numeric (var))
2472 sys_warn (r, record->pos + ofs,
2473 _("Ignoring long string value label record for "
2474 "numeric variable %s."), var_name);
2477 else if (width != var_get_width (var))
2479 sys_warn (r, record->pos + ofs,
2480 _("Ignoring long string value label record for variable "
2481 "%s because the record's width (%d) does not match the "
2482 "variable's width (%d)."),
2483 var_name, width, var_get_width (var));
2488 value_init_pool (r->pool, &value, width);
2489 for (i = 0; i < n_labels; i++)
2491 size_t value_length, label_length;
2492 bool skip = var == NULL;
2494 /* Parse value length. */
2495 if (!check_overflow (r, record, ofs, 4))
2497 value_length = parse_int (r, record->data, ofs);
2501 if (!check_overflow (r, record, ofs, value_length))
2505 if (value_length == width)
2506 memcpy (value_str_rw (&value, width),
2507 (const uint8_t *) record->data + ofs, width);
2510 sys_warn (r, record->pos + ofs,
2511 _("Ignoring long string value label %zu for "
2512 "variable %s, with width %d, that has bad value "
2514 i, var_get_name (var), width, value_length);
2518 ofs += value_length;
2520 /* Parse label length. */
2521 if (!check_overflow (r, record, ofs, 4))
2523 label_length = parse_int (r, record->data, ofs);
2527 if (!check_overflow (r, record, ofs, label_length))
2533 label = recode_string_pool ("UTF-8", dict_encoding,
2534 (const char *) record->data + ofs,
2535 label_length, r->pool);
2536 if (!var_add_value_label (var, &value, label))
2537 sys_warn (r, record->pos + ofs,
2538 _("Duplicate value label for `%.*s' on %s."),
2539 width, value_str (&value, width),
2540 var_get_name (var));
2541 pool_free (r->pool, label);
2543 ofs += label_length;
/* Parses long string missing values extension RECORD.  Reads a variable name
   and its missing string values from the record and sets them on the matching
   variable in DICT, warning about a bad value count, about an unknown or
   numeric variable, and about values that cannot be added. */
2551 parse_long_string_missing_values (struct sfm_reader *r,
2552 const struct sfm_extension_record *record,
2553 struct dictionary *dict)
2555 const char *dict_encoding = dict_get_encoding (dict);
/* Total payload size in bytes: element size times element count. */
2556 size_t end = record->size * record->count;
2561 struct missing_values mv;
2563 struct variable *var;
2564 int n_missing_values;
2568 /* Parse variable name length. */
2569 if (!check_overflow (r, record, ofs, 4))
2571 var_name_len = parse_int (r, record->data, ofs);
2574 /* Parse variable name. */
/* The extra byte covers the one-byte count of missing values that is read
   immediately after the name below. */
2575 if (!check_overflow (r, record, ofs, var_name_len + 1))
/* NOTE(review): presumably recodes the name from the dictionary encoding to
   UTF-8; the argument order of recode_string_pool() is not shown here. */
2577 var_name = recode_string_pool ("UTF-8", dict_encoding,
2578 (const char *) record->data + ofs,
2579 var_name_len, r->pool);
2580 ofs += var_name_len;
2582 /* Parse number of missing values. */
2583 n_missing_values = ((const uint8_t *) record->data)[ofs];
2584 if (n_missing_values < 1 || n_missing_values > 3)
2585 sys_warn (r, record->pos + ofs,
2586 _("Long string missing values record says variable %s "
2587 "has %d missing values, but only 1 to 3 missing values "
2589 var_name, n_missing_values);
2592 /* Look up 'var' and validate. */
2593 var = dict_lookup_var (dict, var_name);
2595 sys_warn (r, record->pos + ofs,
2596 _("Ignoring long string missing value record for "
2597 "unknown variable %s."), var_name);
2598 else if (var_is_numeric (var))
2600 sys_warn (r, record->pos + ofs,
2601 _("Ignoring long string missing value record for "
2602 "numeric variable %s."), var_name);
/* When VAR is null, a fixed width of 8 is used in place of the variable's
   width. */
2607 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2608 for (i = 0; i < n_missing_values; i++)
2610 size_t value_length;
2612 /* Parse value length. */
2613 if (!check_overflow (r, record, ofs, 4))
/* NOTE(review): parse_int() presumably yields a signed int; a negative length
   stored into size_t VALUE_LENGTH becomes a very large value.  Confirm that
   check_overflow() rejects that case. */
2615 value_length = parse_int (r, record->data, ofs);
2619 if (!check_overflow (r, record, ofs, value_length))
2623 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2625 sys_warn (r, record->pos + ofs,
2626 _("Ignoring long string missing value %zu for variable "
2627 "%s, with width %d, that has bad value width %zu."),
2628 i, var_get_name (var), var_get_width (var),
/* The value's bytes are stepped over whether or not the value was accepted. */
2630 ofs += value_length;
2633 var_set_missing_values (var, &mv);
2641 static void partial_record (struct sfm_reader *);
2643 static void read_error (struct casereader *, const struct sfm_reader *);
2645 static bool read_case_number (struct sfm_reader *, double *);
2646 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2647 static int read_opcode (struct sfm_reader *);
2648 static bool read_compressed_number (struct sfm_reader *, double *);
2649 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2650 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2651 static bool skip_whole_strings (struct sfm_reader *, size_t);
2653 /* Reads and returns one case from READER's file. Returns a null
2654 pointer if not successful.  R_ is used as the struct sfm_reader that holds
   the state of the file being read. */
2655 static struct ccase *
2656 sys_file_casereader_read (struct casereader *reader, void *r_)
2658 struct sfm_reader *r = r_;
/* The new case is created from R's case prototype. */
2666 c = case_create (r->proto);
/* Fill in the case one entry of R's sfm_vars array at a time; each entry names
   the case index that it writes to. */
2668 for (i = 0; i < r->sfm_var_cnt; i++)
2670 struct sfm_var *sv = &r->sfm_vars[i];
2671 union value *v = case_data_rw_idx (c, sv->case_index);
/* A width of 0 is read as a number; the string path below reads
   SEGMENT_WIDTH bytes into the value starting at OFFSET. */
2673 if (sv->var_width == 0)
2674 retval = read_case_number (r, &v->f);
2677 uint8_t *s = value_str_rw (v, sv->var_width);
2678 retval = read_case_string (r, s + sv->offset, sv->segment_width);
/* Skip the whole 8-byte units of padding that follow the segment.
   NOTE(review): skip_whole_strings() is declared to return bool, so RETVAL can
   only be 0 or 1 after this call, never -1. */
2681 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2683 sys_error (r, r->pos, _("File ends in partial string value."));
/* NOTE(review): a case_cnt of -1 presumably means that the number of cases is
   unknown -- confirm against where case_cnt is set. */
2695 if (r->case_cnt != -1)
2696 read_error (reader, r);
2701 /* Issues an error that R ends in a partial record. */
2703 partial_record (struct sfm_reader *r)
2705 sys_error (r, r->pos, _("File ends in partial case."));
2708 /* Issues an error that an unspecified error occurred while reading SFM, and
2711 read_error (struct casereader *r, const struct sfm_reader *sfm)
2713 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2714 casereader_force_error (r);
2717 /* Reads a number from R and stores its value in *D.
2718 If R is compressed, reads a compressed number;
2719 otherwise, reads a number in the regular way.
2720 Returns true if successful, false if end of file is
2721 reached immediately. */
2723 read_case_number (struct sfm_reader *r, double *d)
2725 if (r->compression == ANY_COMP_NONE)
2728 if (!try_read_bytes (r, number, sizeof number))
2730 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2734 return read_compressed_number (r, d);
2737 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2738 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2739 discarded without being written to S. Reads compressed strings if R is
2740 compressed. Returns 1 if successful, 0 if end of file is reached
2741 immediately, or -1 for some kind of error. */
2743 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
/* Split LENGTH into the largest multiple of 8 that fits and the leftover
   tail of fewer than 8 bytes. */
2745 size_t whole = ROUND_DOWN (length, 8);
2746 size_t partial = length % 8;
/* The whole units are read straight into S. */
2750 int retval = read_whole_strings (r, s, whole);
/* The tail is read as a complete unit into the BOUNCE buffer, and only its
   first PARTIAL bytes are then copied to S. */
2758 int retval = read_whole_strings (r, bounce, sizeof bounce);
2770 memcpy (s + whole, bounce, partial);
2776 /* Reads and returns the next compression opcode from R. */
2778 read_opcode (struct sfm_reader *r)
2780 assert (r->compression != ANY_COMP_NONE);
2784 if (r->opcode_idx >= sizeof r->opcodes)
2787 int retval = try_read_compressed_bytes (r, r->opcodes,
2793 opcode = r->opcodes[r->opcode_idx++];
2800 /* Reads a compressed number from R and stores its value in D.
2801 Returns true if successful, false if end of file is
2802 reached immediately. */
2804 read_compressed_number (struct sfm_reader *r, double *d)
2806 int opcode = read_opcode (r);
/* The number is stored as literal data: read it from the compressed stream. */
2814 return read_compressed_float (r, d);
/* Compressed spaces in a numeric field: convert the eight space bytes that the
   opcode stands for.  The source operand must be a full 8-byte image, because
   float_convert() reads a complete value in R's float format from it; a
   shorter string literal would be read past its end. */
2817 float_convert (r->float_format, "        ", FLOAT_NATIVE_DOUBLE, d);
/* Warn about this kind of corruption only once per reader. */
2818 if (!r->corruption_warning)
2820 r->corruption_warning = true;
2821 sys_warn (r, r->pos,
2822 _("Possible compressed data corruption: "
2823 "compressed spaces appear in numeric field."));
/* An opcode that directly encodes a value is converted by subtracting R's
   bias. */
2832 *d = opcode - r->bias;
2839 /* Reads a compressed 8-byte string segment from R and stores it in DST.
   The segment's form is selected by the next compression opcode. */
2841 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2846 opcode = read_opcode (r);
/* The 8 bytes are read directly from the compressed stream.  Anything other
   than a complete read is reported as an error (-1), so running out of data
   at this point is not treated as a clean end of file. */
2854 retval = read_compressed_bytes (r, dst, 8);
2855 return retval == 1 ? 1 : -1;
/* The segment consists entirely of spaces. */
2858 memset (dst, ' ', 8);
/* The opcode is interpreted as a bias-compressed number, and that number is
   stored into DST in the file's floating-point format. */
2863 double value = opcode - r->bias;
2864 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2867 /* This has actually been seen "in the wild". The submitter of the
2868 file that showed that the contents decoded as spaces, but they
2869 were at the end of the field so it's possible that the null
2870 bytes just acted as null terminators. */
/* The corruption warning is issued at most once per reader. */
2872 else if (!r->corruption_warning)
2874 r->corruption_warning = true;
2875 sys_warn (r, r->pos,
2876 _("Possible compressed data corruption: "
2877 "string contains compressed integer (opcode %d)."),
2885 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2886 Reads compressed strings if R is compressed. Returns 1 if successful, 0 if
2887 end of file is reached immediately, or -1 for some kind of error. */
2889 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2891 assert (length % 8 == 0);
/* Uncompressed data is read with a single call. */
2892 if (r->compression == ANY_COMP_NONE)
2893 return try_read_bytes (r, s, length);
/* Compressed data is decoded one 8-byte segment at a time. */
2898 for (ofs = 0; ofs < length; ofs += 8)
2900 int retval = read_compressed_string (r, s + ofs);
2915 /* Skips LENGTH string bytes from R.
2916 LENGTH must be a multiple of 8.
2917 (LENGTH is also limited to 1024, but that's only because the
2918 current caller never needs more than that many bytes.)
2919 Returns true if successful, false if end of file is
2920 reached immediately or if an error occurs. */
2922 skip_whole_strings (struct sfm_reader *r, size_t length)
2924 uint8_t buffer[1024];
/* BUFFER can hold exactly 1024 bytes, so that length is still acceptable. */
2925 assert (length <= sizeof buffer);
/* read_whole_strings() yields 1, 0, or -1.  Only 1 is success; returning the
   raw value through a bool would turn the -1 error result into true. */
2926 return read_whole_strings (r, buffer, length) == 1;
2929 /* Helpers for reading records that contain structured text
2932 /* Maximum number of warnings to issue for a single text
2934 #define MAX_TEXT_WARNINGS 5
2939 struct substring buffer; /* Record contents. */
2940 off_t start; /* Starting offset in file. */
2941 size_t pos; /* Current position in buffer. */
2942 int n_warnings; /* Number of warnings issued or suppressed. */
2943 bool recoded; /* Recoded into UTF-8? */
2946 static struct text_record *
2947 open_text_record (struct sfm_reader *r,
2948 const struct sfm_extension_record *record,
2949 bool recode_to_utf8)
2951 struct text_record *text;
2952 struct substring raw;
2954 text = pool_alloc (r->pool, sizeof *text);
2955 raw = ss_buffer (record->data, record->size * record->count);
2956 text->start = record->pos;
2957 text->buffer = (recode_to_utf8
2958 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2961 text->n_warnings = 0;
2962 text->recoded = recode_to_utf8;
2967 /* Closes TEXT, frees its storage, and issues a final warning
2968 about suppressed warnings if necessary. */
2970 close_text_record (struct sfm_reader *r, struct text_record *text)
/* N_WARNINGS counts suppressed warnings as well as issued ones, so the excess
   over MAX_TEXT_WARNINGS is the number that were suppressed. */
2972 if (text->n_warnings > MAX_TEXT_WARNINGS)
2973 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2974 text->n_warnings - MAX_TEXT_WARNINGS);
/* The buffer's data is returned to R's pool. */
2976 pool_free (r->pool, ss_data (text->buffer));
2979 /* Reads a variable=value pair from TEXT.
2980 Looks up the variable in DICT and stores it into *VAR.
2981 Stores a null-terminated value into *VALUE. */
2983 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2984 struct text_record *text,
2985 struct variable **var, char **value)
2989 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
2992 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
2996 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2997 ss_buffer ("\t\0", 2));
3005 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3006 struct text_record *text, struct substring delimiters,
3007 struct variable **var)
3011 name = text_get_token (text, delimiters, NULL);
3015 *var = dict_lookup_var (dict, name);
3019 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3026 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3027 struct text_record *text, struct substring delimiters,
3028 struct variable **var)
3030 char *short_name = text_get_token (text, delimiters, NULL);
3031 if (short_name == NULL)
3034 *var = dict_lookup_var (dict, short_name);
3036 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3041 /* Displays a warning for the current file position, limiting the
3042 number to MAX_TEXT_WARNINGS for TEXT. */
3044 text_warn (struct sfm_reader *r, struct text_record *text,
3045 const char *format, ...)
3047 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3051 va_start (args, format);
3052 sys_msg (r, text->start + text->pos, MW, format, args);
3058 text_get_token (struct text_record *text, struct substring delimiters,
3061 struct substring token;
3064 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3067 end = &ss_data (token)[ss_length (token)];
3068 if (delimiter != NULL)
3071 return ss_data (token);
3074 /* Reads an integer value expressed in decimal, then a space, then a string that
3075 consists of exactly as many bytes as specified by the integer, then a space,
3076 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3077 buffer (so the caller should not free the string).  A warning is issued
   through R when a digit or space is missing or when the string would extend
   past the end of the record. */
3079 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
/* Accumulate the decimal byte count. */
3087 while (text->pos < text->buffer.length)
3089 int c = text->buffer.string[text->pos];
3090 if (c < '0' || c > '9')
3092 n = (n * 10) + (c - '0');
/* Reaching the end of the record, or finding no digits at all, is an error. */
3095 if (text->pos >= text->buffer.length || start == text->pos)
3097 sys_warn (r, text->start,
3098 _("Expecting digit at offset %zu in MRSETS record."),
3103 if (!text_match (text, ' '))
3105 sys_warn (r, text->start,
3106 _("Expecting space at offset %zu in MRSETS record."),
/* Compare N against the bytes that remain rather than adding it to the
   position: N comes from the file and may be huge, and POS + N could wrap
   around and slip past the check.  POS never exceeds the buffer length here,
   so the subtraction cannot underflow. */
3111 if (n > text->buffer.length - text->pos)
3113 sys_warn (r, text->start,
3114 _("%zu-byte string starting at offset %zu "
3115 "exceeds record length %zu."),
3116 n, text->pos, text->buffer.length);
/* The returned string points directly into TEXT's buffer. */
3120 s = &text->buffer.string[text->pos];
3123 sys_warn (r, text->start,
3124 _("Expecting space at offset %zu following %zu-byte string."),
/* Tests whether the byte at TEXT's current position equals C.  The position
   is checked against the buffer length first, so that a position at the end
   of the record never matches and no byte beyond the record's contents is
   examined. */
3134 text_match (struct text_record *text, char c)
3136 if (text->pos < text->buffer.length && text->buffer.string[text->pos] == c)
3145 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3146 inside the TEXT's string. */
3148 text_pos (const struct text_record *text)
3154 text_get_all (const struct text_record *text)
3156 return text->buffer.string;
3161 /* Displays a corruption message. */
3163 sys_msg (struct sfm_reader *r, off_t offset,
3164 int class, const char *format, va_list args)
3169 ds_init_empty (&text);
3171 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3172 fh_get_file_name (r->fh), (long long int) offset);
3174 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3175 ds_put_vformat (&text, format, args);
3177 m.category = msg_class_to_category (class);
3178 m.severity = msg_class_to_severity (class);
3184 m.text = ds_cstr (&text);
3189 /* Displays a warning for offset OFFSET in the file. */
3191 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3195 va_start (args, format);
3196 sys_msg (r, offset, MW, format, args);
3200 /* Displays an error for the current file position and marks it as in an error
3203 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3207 va_start (args, format);
3208 sys_msg (r, offset, ME, format, args);
3214 /* Reads BYTE_CNT bytes into BUF.
3215 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3216 Returns -1 if an I/O error or a partial read occurs.
3217 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3220 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3221 void *buf, size_t byte_cnt)
3223 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3224 r->pos += bytes_read;
3225 if (bytes_read == byte_cnt)
3227 else if (ferror (r->file))
3229 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3232 else if (!eof_is_ok || bytes_read != 0)
3234 sys_error (r, r->pos, _("Unexpected end of file."));
3241 /* Reads BYTE_CNT bytes into BUF.
3242 Returns true if successful.
3243 Returns false upon I/O error or if end-of-file is encountered. */
3245 read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3247 return read_bytes_internal (r, false, buf, byte_cnt) == 1;
3250 /* Reads BYTE_CNT bytes into BUF.
3251 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3252 Returns 0 if an immediate end-of-file is encountered.
3253 Returns -1 if an I/O error or a partial read occurs. */
3255 try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3257 return read_bytes_internal (r, true, buf, byte_cnt);
3260 /* Reads a 32-bit signed integer from R and stores its value in host format in
3261 *X. Returns true if successful, otherwise false. */
3263 read_int (struct sfm_reader *r, int *x)
3266 if (read_bytes (r, integer, sizeof integer) != 1)
3268 *x = integer_get (r->integer_format, integer, sizeof integer);
3273 read_uint (struct sfm_reader *r, unsigned int *x)
3278 ok = read_int (r, &y);
3283 /* Reads a 64-bit signed integer from R and returns its value in
3286 read_int64 (struct sfm_reader *r, long long int *x)
3289 if (read_bytes (r, integer, sizeof integer) != 1)
3291 *x = integer_get (r->integer_format, integer, sizeof integer);
3295 /* Reads a 64-bit unsigned integer from R and returns its value in
3298 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3303 ok = read_int64 (r, &y);
3309 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3311 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3315 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3317 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3320 /* Reads exactly SIZE - 1 bytes into BUFFER
3321 and stores a null byte into BUFFER[SIZE - 1]. */
3323 read_string (struct sfm_reader *r, char *buffer, size_t size)
3328 ok = read_bytes (r, buffer, size - 1);
3330 buffer[size - 1] = '\0';
3334 /* Skips BYTES bytes forward in R. */
3336 skip_bytes (struct sfm_reader *r, size_t bytes)
3341 size_t chunk = MIN (sizeof buffer, bytes);
3342 if (!read_bytes (r, buffer, chunk))
3350 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3351 been replaced by LFs.
3353 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3354 files that use CR-only line ends in the file label and extra product
3357 fix_line_ends (const char *s)
3361 d = dst = xmalloc (strlen (s) + 1);
3380 read_ztrailer (struct sfm_reader *r,
3381 long long int zheader_ofs,
3382 long long int ztrailer_len);
3385 zalloc (voidpf pool_, uInt items, uInt size)
3387 struct pool *pool = pool_;
3389 return (!size || xalloc_oversized (items, size)
3391 : pool_malloc (pool, items * size));
3395 zfree (voidpf pool_, voidpf address)
3397 struct pool *pool = pool_;
3399 pool_free (pool, address);
3403 read_zheader (struct sfm_reader *r)
3406 long long int zheader_ofs;
3407 long long int ztrailer_ofs;
3408 long long int ztrailer_len;
3410 if (!read_int64 (r, &zheader_ofs)
3411 || !read_int64 (r, &ztrailer_ofs)
3412 || !read_int64 (r, &ztrailer_len))
3415 if (zheader_ofs != pos)
3417 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3418 "(expected %#llx)."),
3419 zheader_ofs, (long long int) pos);
3423 if (ztrailer_ofs < r->pos)
3425 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3430 if (ztrailer_len < 24 || ztrailer_len % 24)
3432 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3436 r->ztrailer_ofs = ztrailer_ofs;
3437 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3440 if (r->zin_buf == NULL)
3442 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3443 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3444 r->zstream.next_in = NULL;
3445 r->zstream.avail_in = 0;
3448 r->zstream.zalloc = zalloc;
3449 r->zstream.zfree = zfree;
3450 r->zstream.opaque = r->pool;
3452 return open_zstream (r);
3456 seek (struct sfm_reader *r, off_t offset)
3458 if (fseeko (r->file, offset, SEEK_SET))
3459 sys_error (r, 0, _("%s: seek failed (%s)."),
3460 fh_get_file_name (r->fh), strerror (errno));
3464 /* Performs some additional consistency checks on the ZLIB compressed data
   trailer, which starts at R's ztrailer_ofs and is ZTRAILER_LEN bytes long.
   The trailer's end is compared against the file size, its bias against the
   file header's bias, and each block descriptor's offsets and sizes against
   the values expected from ZHEADER_OFS and the preceding descriptors.
   Problems are reported through sys_warn() or sys_error().  The checks are
   skipped when R's file is not a regular file, because they need to seek. */
3467 read_ztrailer (struct sfm_reader *r,
3468 long long int zheader_ofs,
3469 long long int ztrailer_len)
3471 long long int expected_uncmp_ofs;
3472 long long int expected_cmp_ofs;
3475 unsigned int block_size;
3476 unsigned int n_blocks;
/* A failing fstat() is reported against the reader R, as every other
   sys_error() call here does; the first argument is the reader, not a message
   class. */
3480 if (fstat (fileno (r->file), &s))
3482 sys_error (r, 0, _("%s: stat failed (%s)."),
3483 fh_get_file_name (r->fh), strerror (errno));
3487 if (!S_ISREG (s.st_mode))
3489 /* We can't seek to the trailer and then back to the data in this file,
3490 so skip doing extra checks. */
3494 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3495 sys_warn (r, r->pos,
3496 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3497 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3499 seek (r, r->ztrailer_ofs);
3501 /* Read fixed header from ZLIB data trailer. */
3502 if (!read_int64 (r, &bias))
/* The trailer stores the bias negated relative to R's bias. */
3504 if (-bias != r->bias)
3506 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3507 "file header bias (%.2f)."),
3512 if (!read_int64 (r, &zero))
3515 sys_warn (r, r->pos,
3516 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3518 if (!read_uint (r, &block_size))
3520 if (block_size != ZBLOCK_SIZE)
3521 sys_warn (r, r->pos,
3522 _("ZLIB trailer specifies unexpected %u-byte block size."),
3525 if (!read_uint (r, &n_blocks))
/* The trailer is a 24-byte fixed part followed by one 24-byte descriptor per
   block. */
3527 if (n_blocks != (ztrailer_len - 24) / 24)
3529 sys_error (r, r->pos,
3530 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3532 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
/* Expected offsets of the first block: compressed data starts 24 bytes after
   ZHEADER_OFS. */
3536 expected_uncmp_ofs = zheader_ofs;
3537 expected_cmp_ofs = zheader_ofs + 24;
3538 for (i = 0; i < n_blocks; i++)
3540 off_t desc_ofs = r->pos;
3541 unsigned long long int uncompressed_ofs;
3542 unsigned long long int compressed_ofs;
3543 unsigned int uncompressed_size;
3544 unsigned int compressed_size;
3546 if (!read_uint64 (r, &uncompressed_ofs)
3547 || !read_uint64 (r, &compressed_ofs)
3548 || !read_uint (r, &uncompressed_size)
3549 || !read_uint (r, &compressed_size))
3552 if (uncompressed_ofs != expected_uncmp_ofs)
3554 sys_error (r, desc_ofs,
3555 _("ZLIB block descriptor %u reported uncompressed data "
3556 "offset %#llx, when %#llx was expected."),
3557 i, uncompressed_ofs, expected_uncmp_ofs);
3561 if (compressed_ofs != expected_cmp_ofs)
3563 sys_error (r, desc_ofs,
3564 _("ZLIB block descriptor %u reported compressed data "
3565 "offset %#llx, when %#llx was expected."),
3566 i, compressed_ofs, expected_cmp_ofs);
/* Every block but the last must be exactly one block in size; the last may be
   shorter. */
3570 if (i < n_blocks - 1)
3572 if (uncompressed_size != block_size)
3573 sys_warn (r, desc_ofs,
3574 _("ZLIB block descriptor %u reported block size %#x, "
3575 "when %#x was expected."),
3576 i, uncompressed_size, block_size);
3580 if (uncompressed_size > block_size)
3581 sys_warn (r, desc_ofs,
3582 _("ZLIB block descriptor %u reported block size %#x, "
3583 "when at most %#x was expected."),
3584 i, uncompressed_size, block_size);
3587 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3588 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3589 This code checks for an expansion of more than 14.3% plus 11 bytes. */
3591 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3593 sys_error (r, desc_ofs,
3594 _("ZLIB block descriptor %u reports compressed size %u "
3595 "and uncompressed size %u."),
3596 i, compressed_size, uncompressed_size);
3600 expected_uncmp_ofs += uncompressed_size;
3601 expected_cmp_ofs += compressed_size;
/* The compressed blocks must end exactly where the trailer begins. */
3604 if (expected_cmp_ofs != r->ztrailer_ofs)
3606 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3607 "would be expected from block descriptors."),
3608 r->ztrailer_ofs, expected_cmp_ofs);
/* Return to the start of the compressed data, just past the 24-byte header. */
3612 seek (r, zheader_ofs + 24);
3617 open_zstream (struct sfm_reader *r)
3621 r->zout_pos = r->zout_end = 0;
3622 error = inflateInit (&r->zstream);
3625 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3633 close_zstream (struct sfm_reader *r)
3637 error = inflateEnd (&r->zstream);
3640 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3648 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3650 uint8_t *buf = buf_;
3659 /* Use already inflated data if there is any. */
3660 if (r->zout_pos < r->zout_end)
3662 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3663 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3672 /* We need to inflate some more data.
3673 Get some more input data if we don't have any. */
3674 if (r->zstream.avail_in == 0)
3676 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3681 int retval = try_read_bytes (r, r->zin_buf, n);
3684 r->zstream.avail_in = n;
3685 r->zstream.next_in = r->zin_buf;
3689 /* Inflate the (remaining) input data. */
3690 r->zstream.avail_out = ZOUT_BUF_SIZE;
3691 r->zstream.next_out = r->zout_buf;
3692 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3694 r->zout_end = r->zstream.next_out - r->zout_buf;
3695 if (r->zout_end == 0)
3697 if (error != Z_STREAM_END)
3699 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3703 else if (!close_zstream (r) || !open_zstream (r))
3708 /* Process the output data and ignore 'error' for now. ZLIB will
3709 present it to us again on the next inflate() call. */
/* Reads BYTE_CNT bytes of compressed case data from R into BUF.  With simple
   compression the bytes come from read_bytes(), whose boolean result is
   returned; otherwise they come from read_bytes_zlib(), and running out of
   ZLIB data is reported as an error.
   NOTE(review): read_bytes_zlib()'s result is kept in an int, so callers
   should treat only a result of 1 as success. */
3715 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3717 if (r->compression == ANY_COMP_SIMPLE)
3718 return read_bytes (r, buf, byte_cnt);
3721 int retval = read_bytes_zlib (r, buf, byte_cnt);
3723 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3729 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3731 if (r->compression == ANY_COMP_SIMPLE)
3732 return try_read_bytes (r, buf, byte_cnt);
3734 return read_bytes_zlib (r, buf, byte_cnt);
3737 /* Reads a 64-bit floating-point number from R and stores its
3738 value, converted to host format, in *D. */
3740 read_compressed_float (struct sfm_reader *r, double *d)
/* Only a result of 1 means that all of the bytes were read
   (read_compressed_string() applies the same test).  Testing with `!' would
   let a negative error result pass as success and convert an unfilled
   buffer. */
3744 if (read_compressed_bytes (r, number, sizeof number) != 1)
3747 *d = float_get_double (r->float_format, number);
3751 static const struct casereader_class sys_file_casereader_class =
3753 sys_file_casereader_read,
3754 sys_file_casereader_destroy,
3759 const struct any_reader_class sys_file_reader_class =
3761 N_("SPSS System File"),