1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/sys-file-reader.h"
20 #include "data/sys-file-private.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/message.h"
49 #include "libpspp/misc.h"
50 #include "libpspp/pool.h"
51 #include "libpspp/str.h"
52 #include "libpspp/stringi-set.h"
54 #include "gl/c-strtod.h"
55 #include "gl/c-ctype.h"
56 #include "gl/inttostr.h"
57 #include "gl/localcharset.h"
58 #include "gl/minmax.h"
59 #include "gl/unlocked-io.h"
60 #include "gl/xalloc.h"
61 #include "gl/xalloc-oversized.h"
65 #define _(msgid) gettext (msgid)
66 #define N_(msgid) (msgid)
70 /* subtypes 0-2 unknown */
71 EXT_INTEGER = 3, /* Machine integer info. */
72 EXT_FLOAT = 4, /* Machine floating-point info. */
73 EXT_VAR_SETS = 5, /* Variable sets. */
74 EXT_DATE = 6, /* DATE. */
75 EXT_MRSETS = 7, /* Multiple response sets. */
76 EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
77 /* subtype 9 unknown */
78 EXT_PRODUCT_INFO = 10, /* Extra product info text. */
79 EXT_DISPLAY = 11, /* Variable display parameters. */
80 /* subtype 12 unknown */
81 EXT_LONG_NAMES = 13, /* Long variable names. */
82 EXT_LONG_STRINGS = 14, /* Long strings. */
83 /* subtype 15 unknown */
84 EXT_NCASES = 16, /* Extended number of cases. */
85 EXT_FILE_ATTRS = 17, /* Data file attributes. */
86 EXT_VAR_ATTRS = 18, /* Variable attributes. */
87 EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
88 EXT_ENCODING = 20, /* Character encoding. */
89 EXT_LONG_LABELS = 21, /* Value labels for long strings. */
90 EXT_LONG_MISSING = 22, /* Missing values for long strings. */
91 EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
94 /* Fields from the top-level header record. */
95 struct sfm_header_record
97 char magic[5]; /* First 4 bytes of file, then null. */
98 int weight_idx; /* 0 if unweighted, otherwise a var index. */
99 int nominal_case_size; /* Number of var positions. */
101 /* These correspond to the members of struct sfm_read_info or a dictionary
102 but in the system file's encoding rather than ASCII. */
103 char creation_date[10]; /* "dd mmm yy". */
104 char creation_time[9]; /* "hh:mm:ss". */
105 char eye_catcher[61]; /* Eye-catcher string, then product name. */
106 char file_label[65]; /* File label. */
109 struct sfm_var_record
116 int missing_value_code;
119 struct variable *var;
122 struct sfm_value_label
128 struct sfm_value_label_record
131 struct sfm_value_label *labels;
132 unsigned int n_labels;
138 struct sfm_document_record
147 const char *name; /* Name. */
148 const char *label; /* Human-readable label for group. */
149 enum mrset_type type; /* Group type. */
150 const char **vars; /* Constituent variables' names. */
151 size_t n_vars; /* Number of constituent variables. */
154 enum mrset_md_cat_source cat_source; /* Source of category labels. */
155 bool label_from_var_label; /* 'label' taken from variable label? */
156 const char *counted; /* Counted value, as string. */
159 struct sfm_extension_record
161 int subtype; /* Record subtype. */
162 off_t pos; /* Starting offset in file. */
163 unsigned int size; /* Size of data elements. */
164 unsigned int count; /* Number of data elements. */
165 void *data; /* Contents. */
168 /* System file reader. */
171 /* Resource tracking. */
172 struct pool *pool; /* All system file state. */
175 struct sfm_read_info info;
176 struct sfm_header_record header;
177 struct sfm_var_record *vars;
179 struct sfm_value_label_record *labels;
181 struct sfm_document_record *document;
182 struct sfm_mrset *mrsets;
184 struct sfm_extension_record *extensions[32];
187 struct file_handle *fh; /* File handle. */
188 struct fh_lock *lock; /* Mutual exclusion for file handle. */
189 FILE *file; /* File stream. */
190 off_t pos; /* Position in file. */
191 bool error; /* I/O or corruption error? */
192 struct caseproto *proto; /* Format of output cases. */
195 enum integer_format integer_format; /* On-disk integer format. */
196 enum float_format float_format; /* On-disk floating point format. */
197 struct sfm_var *sfm_vars; /* Variables. */
198 size_t sfm_var_cnt; /* Number of variables. */
199 int case_cnt; /* Number of cases */
200 const char *encoding; /* String encoding. */
203 enum sfm_compression compression;
204 double bias; /* Compression bias, usually 100.0. */
205 uint8_t opcodes[8]; /* Current block of opcodes. */
206 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
207 bool corruption_warning; /* Warned about possible corruption? */
209 /* ZLIB decompression. */
210 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
211 #define ZIN_BUF_SIZE 4096
212 uint8_t *zin_buf; /* Inflation input buffer. */
213 #define ZOUT_BUF_SIZE 16384
214 uint8_t *zout_buf; /* Inflation output buffer. */
215 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
216 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
217 z_stream zstream; /* ZLIB inflater. */
220 static const struct casereader_class sys_file_casereader_class;
222 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
223 const struct sfm_var_record *,
226 static void sys_msg (struct sfm_reader *r, off_t, int class,
227 const char *format, va_list args)
228 PRINTF_FORMAT (4, 0);
229 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
230 PRINTF_FORMAT (3, 4);
231 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
232 PRINTF_FORMAT (3, 4);
234 static bool read_bytes (struct sfm_reader *, void *, size_t)
236 static int try_read_bytes (struct sfm_reader *, void *, size_t)
238 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
239 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
240 static bool read_int64 (struct sfm_reader *, long long int *)
242 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
244 static bool read_string (struct sfm_reader *, char *, size_t)
246 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
248 /* ZLIB compressed data handling. */
249 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
250 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
251 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
252 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
254 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
256 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
258 static bool read_compressed_float (struct sfm_reader *, double *)
261 static char *fix_line_ends (const char *);
263 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
264 static double parse_float (const struct sfm_reader *,
265 const void *data, size_t ofs);
267 static bool read_variable_record (struct sfm_reader *,
268 struct sfm_var_record *);
269 static bool read_value_label_record (struct sfm_reader *,
270 struct sfm_value_label_record *);
271 static struct sfm_document_record *read_document_record (struct sfm_reader *);
272 static bool read_extension_record (struct sfm_reader *, int subtype,
273 struct sfm_extension_record **);
274 static bool skip_extension_record (struct sfm_reader *, int subtype);
276 static struct text_record *open_text_record (
277 struct sfm_reader *, const struct sfm_extension_record *,
278 bool recode_to_utf8);
279 static void close_text_record (struct sfm_reader *,
280 struct text_record *);
281 static bool read_variable_to_value_pair (struct sfm_reader *,
283 struct text_record *,
284 struct variable **var, char **value);
285 static void text_warn (struct sfm_reader *r, struct text_record *text,
286 const char *format, ...)
287 PRINTF_FORMAT (3, 4);
288 static char *text_get_token (struct text_record *,
289 struct substring delimiters, char *delimiter);
290 static bool text_match (struct text_record *, char c);
291 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
292 struct text_record *,
293 struct substring delimiters,
295 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
296 struct text_record *,
297 struct substring delimiters,
299 static const char *text_parse_counted_string (struct sfm_reader *,
300 struct text_record *);
301 static size_t text_pos (const struct text_record *);
302 static const char *text_get_all (const struct text_record *);
304 /* Dictionary reader. */
312 static bool read_dictionary (struct sfm_reader *);
313 static bool read_record (struct sfm_reader *, int type,
314 size_t *allocated_vars, size_t *allocated_labels);
315 static bool read_header (struct sfm_reader *, struct sfm_read_info *,
316 struct sfm_header_record *);
317 static void parse_header (struct sfm_reader *,
318 const struct sfm_header_record *,
319 struct sfm_read_info *, struct dictionary *);
320 static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
321 struct sfm_var_record *, size_t n);
322 static void parse_format_spec (struct sfm_reader *, off_t pos,
323 unsigned int format, enum which_format,
324 struct variable *, int *format_warning_cnt);
325 static void parse_document (struct dictionary *, struct sfm_document_record *);
326 static void parse_display_parameters (struct sfm_reader *,
327 const struct sfm_extension_record *,
328 struct dictionary *);
329 static bool parse_machine_integer_info (struct sfm_reader *,
330 const struct sfm_extension_record *,
331 struct sfm_read_info *);
332 static void parse_machine_float_info (struct sfm_reader *,
333 const struct sfm_extension_record *);
334 static void parse_extra_product_info (struct sfm_reader *,
335 const struct sfm_extension_record *,
336 struct sfm_read_info *);
337 static void parse_mrsets (struct sfm_reader *,
338 const struct sfm_extension_record *,
339 size_t *allocated_mrsets);
340 static void decode_mrsets (struct sfm_reader *, struct dictionary *);
341 static void parse_long_var_name_map (struct sfm_reader *,
342 const struct sfm_extension_record *,
343 struct dictionary *);
344 static bool parse_long_string_map (struct sfm_reader *,
345 const struct sfm_extension_record *,
346 struct dictionary *);
347 static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
348 const struct sfm_var_record *,
350 const struct sfm_value_label_record *);
351 static void parse_data_file_attributes (struct sfm_reader *,
352 const struct sfm_extension_record *,
353 struct dictionary *);
354 static void parse_variable_attributes (struct sfm_reader *,
355 const struct sfm_extension_record *,
356 struct dictionary *);
357 static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
358 static bool parse_long_string_value_labels (struct sfm_reader *,
359 const struct sfm_extension_record *,
360 struct dictionary *);
361 static bool parse_long_string_missing_values (
362 struct sfm_reader *, const struct sfm_extension_record *,
363 struct dictionary *);
365 /* Frees the strings inside INFO. */
367 sfm_read_info_destroy (struct sfm_read_info *info)
371 free (info->creation_date);
372 free (info->creation_time);
373 free (info->product);
374 free (info->product_ext);
378 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
379 successful, otherwise NULL. */
381 sfm_open (struct file_handle *fh)
383 size_t allocated_mrsets = 0;
384 struct sfm_reader *r;
386 /* Create and initialize reader. */
387 r = xzalloc (sizeof *r);
388 r->pool = pool_create ();
389 pool_register (r->pool, free, r);
391 r->opcode_idx = sizeof r->opcodes;
393 /* TRANSLATORS: this fragment will be interpolated into
394 messages in fh_lock() that identify types of files. */
395 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
399 r->file = fn_open (fh_get_file_name (fh), "rb");
402 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
403 fh_get_file_name (r->fh), strerror (errno));
407 if (!read_dictionary (r))
410 if (r->extensions[EXT_MRSETS] != NULL)
411 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
413 if (r->extensions[EXT_MRSETS2] != NULL)
414 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
423 read_dictionary (struct sfm_reader *r)
425 size_t allocated_vars;
426 size_t allocated_labels;
428 if (!read_header (r, &r->info, &r->header))
432 allocated_labels = 0;
437 if (!read_int (r, &type))
441 if (!read_record (r, type, &allocated_vars, &allocated_labels))
445 if (!skip_bytes (r, 4))
448 if (r->compression == SFM_COMP_ZLIB && !read_zheader (r))
455 read_record (struct sfm_reader *r, int type,
456 size_t *allocated_vars, size_t *allocated_labels)
463 if (r->n_vars >= *allocated_vars)
464 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
466 return read_variable_record (r, &r->vars[r->n_vars++]);
469 if (r->n_labels >= *allocated_labels)
470 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
472 return read_value_label_record (r, &r->labels[r->n_labels++]);
475 /* A Type 4 record is always immediately after a type 3 record,
476 so the code for type 3 records reads the type 4 record too. */
477 sys_error (r, r->pos, _("Misplaced type 4 record."));
481 if (r->document != NULL)
483 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
486 r->document = read_document_record (r);
487 return r->document != NULL;
490 if (!read_int (r, &subtype))
493 || subtype >= sizeof r->extensions / sizeof *r->extensions)
496 _("Unrecognized record type 7, subtype %d. Please "
497 "send a copy of this file, and the syntax which "
498 "created it to %s."),
499 subtype, PACKAGE_BUGREPORT);
500 return skip_extension_record (r, subtype);
502 else if (r->extensions[subtype] != NULL)
505 _("Record type 7, subtype %d found here has the same "
506 "type as the record found near offset 0x%llx. "
507 "Please send a copy of this file, and the syntax "
508 "which created it to %s."),
509 subtype, (long long int) r->extensions[subtype]->pos,
511 return skip_extension_record (r, subtype);
514 return read_extension_record (r, subtype, &r->extensions[subtype]);
517 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
524 /* Returns the character encoding obtained from R, or a null pointer if R
525 doesn't have an indication of its character encoding. */
527 sfm_get_encoding (const struct sfm_reader *r)
529 /* The EXT_ENCODING record is the best way to determine dictionary
531 if (r->extensions[EXT_ENCODING])
532 return r->extensions[EXT_ENCODING]->data;
534 /* But EXT_INTEGER is better than nothing as a fallback. */
535 if (r->extensions[EXT_INTEGER])
537 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
538 const char *encoding;
547 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
548 respectively. However, many files have character code 2 but data
549 which are clearly not ASCII. Therefore, ignore these values. */
556 encoding = sys_get_encoding_from_codepage (codepage);
557 if (encoding != NULL)
563 /* If the file magic number is EBCDIC then its character data is too. */
564 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
570 struct get_strings_aux
581 add_string__ (struct get_strings_aux *aux,
582 const char *string, bool id, char *title)
584 if (aux->n >= aux->allocated)
586 aux->allocated = 2 * (aux->allocated + 1);
587 aux->titles = pool_realloc (aux->pool, aux->titles,
588 aux->allocated * sizeof *aux->titles);
589 aux->strings = pool_realloc (aux->pool, aux->strings,
590 aux->allocated * sizeof *aux->strings);
591 aux->ids = pool_realloc (aux->pool, aux->ids,
592 aux->allocated * sizeof *aux->ids);
595 aux->titles[aux->n] = title;
596 aux->strings[aux->n] = pool_strdup (aux->pool, string);
597 aux->ids[aux->n] = id;
601 static void PRINTF_FORMAT (3, 4)
602 add_string (struct get_strings_aux *aux,
603 const char *string, const char *title, ...)
607 va_start (args, title);
608 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
612 static void PRINTF_FORMAT (3, 4)
613 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
617 va_start (args, title);
618 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
622 /* Retrieves significant string data from R in its raw format, to allow the
623 caller to try to detect the encoding in use.
625 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
626 and *STRINGSP to an array of N elements allocated from POOL. For each I in
627 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
628 whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
629 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
632 sfm_get_strings (const struct sfm_reader *r, struct pool *pool,
633 char ***titlesp, bool **idsp, char ***stringsp)
635 const struct sfm_mrset *mrset;
636 struct get_strings_aux aux;
648 for (i = 0; i < r->n_vars; i++)
649 if (r->vars[i].width != -1)
650 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
653 for (i = 0; i < r->n_vars; i++)
654 if (r->vars[i].width != -1)
657 if (r->vars[i].label)
658 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
663 for (i = 0; i < r->n_labels; i++)
664 for (j = 0; j < r->labels[i].n_labels; j++)
665 add_string (&aux, r->labels[i].labels[j].label,
666 _("Value Label %zu"), k++);
668 add_string (&aux, r->header.creation_date, _("Creation Date"));
669 add_string (&aux, r->header.creation_time, _("Creation Time"));
670 add_string (&aux, r->header.eye_catcher, _("Product"));
671 add_string (&aux, r->header.file_label, _("File Label"));
673 if (r->extensions[EXT_PRODUCT_INFO])
674 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
675 _("Extra Product Info"));
681 for (i = 0; i < r->document->n_lines; i++)
685 memcpy (line, r->document->documents + i * 80, 80);
688 add_string (&aux, line, _("Document Line %zu"), i + 1);
692 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
694 size_t mrset_idx = mrset - r->mrsets + 1;
696 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
698 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
700 /* Skip the variables because they ought to be duplicates. */
703 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
708 /* data file attributes */
709 /* variable attributes */
711 /* long string value labels */
712 /* long string missing values */
714 *titlesp = aux.titles;
716 *stringsp = aux.strings;
720 /* Decodes the dictionary read from R, saving it into *DICT. Character
721 strings in R are decoded using ENCODING, or an encoding obtained from R if
722 ENCODING is null, or the locale encoding if R specifies no encoding.
724 If INFOP is non-null, then it receives additional info about the system
725 file, which the caller must eventually free with sfm_read_info_destroy()
726 when it is no longer needed.
728 This function consumes R. The caller must not use it again later, even to
729 destroy it with sfm_close(). */
731 sfm_decode (struct sfm_reader *r, const char *encoding,
732 struct dictionary **dictp, struct sfm_read_info *infop)
734 struct dictionary *dict;
737 if (encoding == NULL)
739 encoding = sfm_get_encoding (r);
740 if (encoding == NULL)
742 sys_warn (r, -1, _("This system file does not indicate its own "
743 "character encoding. Using default encoding "
744 "%s. For best results, specify an encoding "
745 "explicitly. Use SYSFILE INFO with "
746 "ENCODING=\"DETECT\" to analyze the possible "
749 encoding = locale_charset ();
753 dict = dict_create (encoding);
754 r->encoding = dict_get_encoding (dict);
756 /* These records don't use variables at all. */
757 if (r->document != NULL)
758 parse_document (dict, r->document);
760 if (r->extensions[EXT_INTEGER] != NULL
761 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
764 if (r->extensions[EXT_FLOAT] != NULL)
765 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
767 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
768 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
770 if (r->extensions[EXT_FILE_ATTRS] != NULL)
771 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
773 parse_header (r, &r->header, &r->info, dict);
775 /* Parse the variable records, the basis of almost everything else. */
776 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
779 /* Parse value labels and the weight variable immediately after the variable
780 records. These records use indexes into var_recs[], so we must parse them
781 before those indexes become invalidated by very long string variables. */
782 for (i = 0; i < r->n_labels; i++)
783 if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
785 if (r->header.weight_idx != 0)
787 struct variable *weight_var;
789 weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
790 r->header.weight_idx);
791 if (weight_var != NULL)
793 if (var_is_numeric (weight_var))
794 dict_set_weight (dict, weight_var);
796 sys_warn (r, -1, _("Ignoring string variable `%s' set "
797 "as weighting variable."),
798 var_get_name (weight_var));
802 if (r->extensions[EXT_DISPLAY] != NULL)
803 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
805 /* The following records use short names, so they need to be parsed before
806 parse_long_var_name_map() changes short names to long names. */
807 decode_mrsets (r, dict);
809 if (r->extensions[EXT_LONG_STRINGS] != NULL
810 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
813 /* Now rename variables to their long names. */
814 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
816 /* The following records use long names, so they need to follow renaming. */
817 if (r->extensions[EXT_VAR_ATTRS] != NULL)
819 parse_variable_attributes (r, r->extensions[EXT_VAR_ATTRS], dict);
821 /* Roles use the $@Role attribute. */
822 assign_variable_roles (r, dict);
825 if (r->extensions[EXT_LONG_LABELS] != NULL
826 && !parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS],
829 if (r->extensions[EXT_LONG_MISSING] != NULL
830 && !parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
834 /* Warn if the actual amount of data per case differs from the
835 amount that the header claims. SPSS version 13 gets this
836 wrong when very long strings are involved, so don't warn in
838 if (r->header.nominal_case_size != -1
839 && r->header.nominal_case_size != r->n_vars
840 && r->info.version_major != 13)
841 sys_warn (r, -1, _("File header claims %d variable positions but "
842 "%zu were read from file."),
843 r->header.nominal_case_size, r->n_vars);
845 /* Create an index of dictionary variable widths for
846 sfm_read_case to use. We cannot use the `struct variable's
847 from the dictionary we created, because the caller owns the
848 dictionary and may destroy or modify its variables. */
849 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
850 pool_register (r->pool, free, r->sfm_vars);
851 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
857 memset (&r->info, 0, sizeof r->info);
860 return casereader_create_sequential
862 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
863 &sys_file_casereader_class, r);
872 /* Closes R, which should have been returned by sfm_open() but not already
873 closed with sfm_decode() or this function.
874 Returns true if an I/O error has occurred on READER, false
877 sfm_close (struct sfm_reader *r)
886 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
888 msg (ME, _("Error closing system file `%s': %s."),
889 fh_get_file_name (r->fh), strerror (errno));
895 sfm_read_info_destroy (&r->info);
900 pool_destroy (r->pool);
905 /* Destroys READER. */
907 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
909 struct sfm_reader *r = r_;
913 /* Returns true if FILE is an SPSS system file,
916 sfm_detect (FILE *file)
920 if (fread (magic, 4, 1, file) != 1)
924 return (!strcmp (ASCII_MAGIC, magic)
925 || !strcmp (ASCII_ZMAGIC, magic)
926 || !strcmp (EBCDIC_MAGIC, magic));
929 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
930 except for the string fields in *INFO, which parse_header() will initialize
931 later once the file's encoding is known. */
933 read_header (struct sfm_reader *r, struct sfm_read_info *info,
934 struct sfm_header_record *header)
936 uint8_t raw_layout_code[4];
941 if (!read_string (r, header->magic, sizeof header->magic)
942 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
945 if (!strcmp (ASCII_MAGIC, header->magic)
946 || !strcmp (EBCDIC_MAGIC, header->magic))
948 else if (!strcmp (ASCII_ZMAGIC, header->magic))
952 sys_error (r, 0, _("This is not an SPSS system file."));
956 /* Identify integer format. */
957 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
959 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
961 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
963 || (r->integer_format != INTEGER_MSB_FIRST
964 && r->integer_format != INTEGER_LSB_FIRST))
966 sys_error (r, 64, _("This is not an SPSS system file."));
970 if (!read_int (r, &header->nominal_case_size))
973 if (header->nominal_case_size < 0
974 || header->nominal_case_size > INT_MAX / 16)
975 header->nominal_case_size = -1;
977 if (!read_int (r, &compressed))
982 r->compression = SFM_COMP_NONE;
983 else if (compressed == 1)
984 r->compression = SFM_COMP_SIMPLE;
985 else if (compressed != 0)
987 sys_error (r, 0, "System file header has invalid compression "
988 "value %d.", compressed);
995 r->compression = SFM_COMP_ZLIB;
998 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
999 "compression value %d.", compressed);
1004 if (!read_int (r, &header->weight_idx))
1007 if (!read_int (r, &r->case_cnt))
1009 if ( r->case_cnt > INT_MAX / 2)
1012 /* Identify floating-point format and obtain compression bias. */
1013 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1015 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1017 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1019 if (memcmp (raw_bias, zero_bias, 8))
1020 sys_warn (r, r->pos - 8,
1021 _("Compression bias is not the usual "
1022 "value of 100, or system file uses unrecognized "
1023 "floating-point format."));
1026 /* Some software is known to write all-zeros to this
1027 field. Such software also writes floating-point
1028 numbers in the format that we expect by default
1029 (it seems that all software most likely does, in
1030 reality), so don't warn in this case. */
1033 if (r->integer_format == INTEGER_MSB_FIRST)
1034 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1036 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1038 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1040 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1041 || !read_string (r, header->creation_time, sizeof header->creation_time)
1042 || !read_string (r, header->file_label, sizeof header->file_label)
1043 || !skip_bytes (r, 3))
1046 info->integer_format = r->integer_format;
1047 info->float_format = r->float_format;
1048 info->compression = r->compression;
1049 info->case_cnt = r->case_cnt;
1054 /* Reads a variable (type 2) record from R into RECORD. */
1056 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1058 int has_variable_label;
1060 memset (record, 0, sizeof *record);
1062 record->pos = r->pos;
1063 if (!read_int (r, &record->width)
1064 || !read_int (r, &has_variable_label)
1065 || !read_int (r, &record->missing_value_code)
1066 || !read_int (r, &record->print_format)
1067 || !read_int (r, &record->write_format)
1068 || !read_string (r, record->name, sizeof record->name))
1071 if (has_variable_label == 1)
1073 enum { MAX_LABEL_LEN = 255 };
1074 unsigned int len, read_len;
1076 if (!read_uint (r, &len))
1079 /* Read up to MAX_LABEL_LEN bytes of label. */
1080 read_len = MIN (MAX_LABEL_LEN, len);
1081 record->label = pool_malloc (r->pool, read_len + 1);
1082 if (!read_string (r, record->label, read_len + 1))
1085 /* Skip unread label bytes. */
1086 if (!skip_bytes (r, len - read_len))
1089 /* Skip label padding up to multiple of 4 bytes. */
1090 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1093 else if (has_variable_label != 0)
1095 sys_error (r, record->pos,
1096 _("Variable label indicator field is not 0 or 1."));
1100 /* Set missing values. */
1101 if (record->missing_value_code != 0)
1103 int code = record->missing_value_code;
1104 if (record->width == 0)
1106 if (code < -3 || code > 3 || code == -1)
1108 sys_error (r, record->pos,
1109 _("Numeric missing value indicator field is not "
1110 "-3, -2, 0, 1, 2, or 3."));
1116 if (code < 1 || code > 3)
1118 sys_error (r, record->pos,
1119 _("String missing value indicator field is not "
1125 if (!read_bytes (r, record->missing, 8 * abs (code)))
1132 /* Reads value labels from R into RECORD. */
1134 read_value_label_record (struct sfm_reader *r,
1135 struct sfm_value_label_record *record)
1140 /* Read type 3 record. */
1141 record->pos = r->pos;
1142 if (!read_uint (r, &record->n_labels))
1144 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1146 sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
1150 record->labels = pool_nmalloc (r->pool, record->n_labels,
1151 sizeof *record->labels);
1152 for (i = 0; i < record->n_labels; i++)
1154 struct sfm_value_label *label = &record->labels[i];
1155 unsigned char label_len;
1158 if (!read_bytes (r, label->value, sizeof label->value))
1161 /* Read label length. */
1162 if (!read_bytes (r, &label_len, sizeof label_len))
1164 padded_len = ROUND_UP (label_len + 1, 8);
1166 /* Read label, padding. */
1167 label->label = pool_malloc (r->pool, padded_len + 1);
1168 if (!read_bytes (r, label->label, padded_len - 1))
1170 label->label[label_len] = '\0';
1173 /* Read record type of type 4 record. */
1174 if (!read_int (r, &type))
1178 sys_error (r, r->pos - 4,
1179 _("Variable index record (type 4) does not immediately "
1180 "follow value label record (type 3) as it should."));
1184 /* Read number of variables associated with value label from type 4
1186 if (!read_uint (r, &record->n_vars))
1188 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1190 sys_error (r, r->pos - 4,
1191 _("Number of variables associated with a value label (%u) "
1192 "is not between 1 and the number of variables (%zu)."),
1193 record->n_vars, r->n_vars);
1197 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1198 for (i = 0; i < record->n_vars; i++)
1199 if (!read_int (r, &record->vars[i]))
// Allocates a document record from r->pool, records the current file
// position, reads the number of document lines, validates it, and then reads
// DOC_LINE_LENGTH * n_lines bytes of document text into a pool buffer.
//
// The line count must be positive and below INT_MAX / DOC_LINE_LENGTH, which
// keeps the int product DOC_LINE_LENGTH * n_lines from overflowing.
//
// NOTE(review): the return statements are not visible in this listing;
// presumably NULL is returned on a failed read or check and RECORD on
// success -- confirm.
1205 /* Reads a document record from R and returns it. */
1206 static struct sfm_document_record *
1207 read_document_record (struct sfm_reader *r)
1209 struct sfm_document_record *record;
1212 record = pool_malloc (r->pool, sizeof *record);
1213 record->pos = r->pos;
1215 if (!read_int (r, &n_lines))
1217 if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1219 sys_error (r, record->pos,
1220 _("Number of document lines (%d) "
1221 "must be greater than 0 and less than %d."),
1222 n_lines, INT_MAX / DOC_LINE_LENGTH);
1226 record->n_lines = n_lines;
1227 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1228 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
// Reads the header of an extension record (record type 7) of the given
// SUBTYPE into RECORD: stores SUBTYPE and the current file position, then
// reads the element size and the element count, and finally rejects records
// whose total data size (plus one terminator byte) would be too large.
//
// NOTE(review): the return type and the return statements are not visible in
// this listing; callers in this file use the result as a boolean.
1235 read_extension_record_header (struct sfm_reader *r, int subtype,
1236 struct sfm_extension_record *record)
1238 record->subtype = subtype;
1239 record->pos = r->pos;
1240 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1243 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1244 allows an extra byte for a null terminator, used by some
1245 extension processing routines. */
1246 if (record->size != 0
1247 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
// NOTE(review): unlike the other sys_error messages in this file, this one is
// not wrapped in _(); confirm whether it is meant to be translatable.
1249 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
// Reads the extension record of the given SUBTYPE from R.
//
// After the header is read, SUBTYPE is looked up in a static table of known
// subtypes.  Each entry carries an expected element size and count, where 0
// means that field is not checked.  A size or count mismatch produces a
// warning; an entry with both fields 0 marks a subtype that is ignored;
// otherwise count * size bytes are read into a pool buffer that has one
// extra byte set to NUL.  A subtype that is not in the table gets an
// "unrecognized" warning.
//
// NOTE(review): this listing omits short lines, so how *RECORDP is assigned
// and which paths reach the final skip_bytes call are not visible here --
// confirm against the full file.
1257 /* Reads an extension record from R into RECORD. */
1259 read_extension_record (struct sfm_reader *r, int subtype,
1260 struct sfm_extension_record **recordp)
1262 struct extension_record_type
// Entries are presumably { subtype, size, count }: the struct fields are not
// visible, but EXT_FLOAT (8, 3) matches the three 8-byte values read by
// parse_machine_float_info.
1269 static const struct extension_record_type types[] =
1271 /* Implemented record types. */
1272 { EXT_INTEGER, 4, 8 },
1273 { EXT_FLOAT, 8, 3 },
1274 { EXT_MRSETS, 1, 0 },
1275 { EXT_PRODUCT_INFO, 1, 0 },
1276 { EXT_DISPLAY, 4, 0 },
1277 { EXT_LONG_NAMES, 1, 0 },
1278 { EXT_LONG_STRINGS, 1, 0 },
1279 { EXT_NCASES, 8, 2 },
1280 { EXT_FILE_ATTRS, 1, 0 },
1281 { EXT_VAR_ATTRS, 1, 0 },
1282 { EXT_MRSETS2, 1, 0 },
1283 { EXT_ENCODING, 1, 0 },
1284 { EXT_LONG_LABELS, 1, 0 },
1285 { EXT_LONG_MISSING, 1, 0 },
1287 /* Ignored record types. */
1288 { EXT_VAR_SETS, 0, 0 },
1290 { EXT_DATA_ENTRY, 0, 0 },
1291 { EXT_DATAVIEW, 0, 0 },
1294 const struct extension_record_type *type;
1295 struct sfm_extension_record *record;
1299 record = pool_malloc (r->pool, sizeof *record);
1300 if (!read_extension_record_header (r, subtype, record))
// Total payload size; the header reader has already rejected oversized
// products.
1302 n_bytes = record->count * record->size;
1304 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1305 if (subtype == type->subtype)
// NOTE(review): record->size and record->count are filled by read_uint in
// read_extension_record_header but formatted with %zu below; confirm the
// field types match the conversion specifiers.
1307 if (type->size > 0 && record->size != type->size)
1308 sys_warn (r, record->pos,
1309 _("Record type 7, subtype %d has bad size %zu "
1310 "(expected %d)."), subtype, record->size, type->size);
1311 else if (type->count > 0 && record->count != type->count)
1312 sys_warn (r, record->pos,
1313 _("Record type 7, subtype %d has bad count %zu "
1314 "(expected %d)."), subtype, record->count, type->count);
1315 else if (type->count == 0 && type->size == 0)
1317 /* Ignore this record. */
// One extra byte is allocated and set to NUL so that text-based parsers can
// treat the payload as a string.
1321 char *data = pool_malloc (r->pool, n_bytes + 1);
1322 data[n_bytes] = '\0';
1324 record->data = data;
1325 if (!read_bytes (r, record->data, n_bytes))
1334 sys_warn (r, record->pos,
1335 _("Unrecognized record type 7, subtype %d. Please send a "
1336 "copy of this file, and the syntax which created it to %s."),
1337 subtype, PACKAGE_BUGREPORT);
// Records that were not stored are skipped so that reading can continue with
// the next record.
1340 return skip_bytes (r, n_bytes);
// Reads the header of an extension record of the given SUBTYPE into a local
// record and then skips its count * size bytes of data without storing them.
// The result is true only if both the header read and the skip succeed.
1344 skip_extension_record (struct sfm_reader *r, int subtype)
1346 struct sfm_extension_record record;
1348 return (read_extension_record_header (r, subtype, &record)
1349 && skip_bytes (r, record.count * record.size));
// Transfers the textual fields of the file header record HEADER into DICT and
// INFO.  Each field is recoded from the dictionary encoding to UTF-8:
//   - the file label becomes the dictionary label,
//   - creation date and time are stored in INFO,
//   - the product name, minus the eye-catcher prefix, is stored in INFO.
1353 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1354 struct sfm_read_info *info, struct dictionary *dict)
1356 const char *dict_encoding = dict_get_encoding (dict);
1357 struct substring product;
1358 struct substring label;
1361 /* Convert file label to UTF-8 and put it into DICT. */
1362 label = recode_substring_pool ("UTF-8", dict_encoding,
1363 ss_cstr (header->file_label), r->pool);
1364 ss_trim (&label, ss_cstr (" "));
// NOTE(review): writing the terminator in place assumes the recoded buffer
// has at least one byte after the trimmed substring -- confirm.
1365 label.string[label.length] = '\0';
// NOTE(review): whether fixed_label is released after dict_set_label is not
// visible in this listing -- confirm.
1366 fixed_label = fix_line_ends (label.string);
1367 dict_set_label (dict, fixed_label);
1370 /* Put creation date and time in UTF-8 into INFO. */
1371 info->creation_date = recode_string ("UTF-8", dict_encoding,
1372 header->creation_date, -1);
1373 info->creation_time = recode_string ("UTF-8", dict_encoding,
1374 header->creation_time, -1);
1376 /* Put product name into INFO, dropping eye-catcher string if present. */
1377 product = recode_substring_pool ("UTF-8", dict_encoding,
1378 ss_cstr (header->eye_catcher), r->pool);
1379 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1380 ss_trim (&product, ss_cstr (" "));
// INFO receives its own copy of the trimmed product string.
1381 info->product = ss_xstrdup (product);
// Creates a variable in DICT for each variable record in
// VAR_RECS[0 .. N_VAR_RECS).
//
// For each record the visible code: recodes the name to UTF-8 and cuts it at
// the first space; rejects invalid names and names that start with $ or #;
// rejects widths outside 0...255; creates the variable, falling back to a
// generated unique name with a warning; sets the short name, label, missing
// values, and print and write formats; and finally checks that a string
// wider than 8 bytes is followed by the required continuation records, which
// are marked by width -1.
//
// NOTE(review): this listing omits short lines (braces, returns, else, short
// declarations), so the exact branch structure must be confirmed against the
// full file.
1384 /* Reads a variable (type 2) record from R and adds the
1385 corresponding variable to DICT.
1386 Also skips past additional variable records for long string
1389 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1390 struct sfm_var_record *var_recs, size_t n_var_recs)
1392 const char *dict_encoding = dict_get_encoding (dict);
1393 struct sfm_var_record *rec;
// The loop has no increment expression; presumably rec is advanced in the
// body by the number of 8-byte values computed near the end -- confirm.
1396 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1398 struct variable *var;
1403 name = recode_string_pool ("UTF-8", dict_encoding,
1404 rec->name, -1, r->pool);
// Names are space padded in the record; cut at the first space.
1405 name[strcspn (name, " ")] = '\0';
1407 if (!dict_id_is_valid (dict, name, false)
1408 || name[0] == '$' || name[0] == '#')
1410 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1414 if (rec->width < 0 || rec->width > 255)
1416 sys_error (r, rec->pos,
1417 _("Bad width %d for variable %s."), rec->width, name);
// Presumably dict_create_var fails for a duplicate name (the check itself is
// not visible); the variable is then created under a generated unique name.
1421 var = rec->var = dict_create_var (dict, name, rec->width);
1424 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1425 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1428 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1432 /* Set the short name the same as the long name. */
1433 var_set_short_name (var, 0, name);
1435 /* Get variable label, if any. */
1440 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1441 rec->label, -1, r->pool);
1442 var_set_label (var, utf8_label, false);
1445 /* Set missing values. */
1446 if (rec->missing_value_code != 0)
1448 int width = var_get_width (var);
1449 struct missing_values mv;
1451 mv_init_pool (r->pool, &mv, width);
1452 if (var_is_numeric (var))
// A negative code means a range is present.  With a range, the comparison
// yields 1 only for code -3, that is, one discrete value in addition to the
// range; without a range the code itself is the number of discrete values.
1454 bool has_range = rec->missing_value_code < 0;
1455 int n_discrete = (has_range
1456 ? rec->missing_value_code == -3
1457 : rec->missing_value_code);
// The range bounds occupy the first two 8-byte slots of rec->missing.
1462 double low = parse_float (r, rec->missing, 0);
1463 double high = parse_float (r, rec->missing, 8);
1465 /* Deal with SPSS 21 change in representation. */
1469 mv_add_range (&mv, low, high);
1473 for (i = 0; i < n_discrete; i++)
1475 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
// NOTE(review): value is initialized and set to missing here, but no visible
// line uses it afterward; mv_add_str below reads rec->missing directly.
// Possibly dead code -- confirm.
1483 value_init_pool (r->pool, &value, width);
1484 value_set_missing (&value, width);
// Each string missing value occupies one 8-byte slot; at most 8 bytes of it
// are used.
1485 for (i = 0; i < rec->missing_value_code; i++)
1486 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1488 var_set_missing_values (var, &mv);
// The position arguments rec->pos + 12 and rec->pos + 16 are used when
// reporting invalid print and write formats.
1492 parse_format_spec (r, rec->pos + 12, rec->print_format,
1493 PRINT_FORMAT, var, &n_warnings);
1494 parse_format_spec (r, rec->pos + 16, rec->write_format,
1495 WRITE_FORMAT, var, &n_warnings);
1497 /* Account for values.
1498 Skip long string continuation records, if any. */
// Numeric variables take one value; strings take one value per 8 bytes.
1499 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1500 for (i = 1; i < n_values; i++)
// The bounds test comes first so rec[i] is never read past the array end.
1501 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1503 sys_error (r, rec->pos, _("Missing string continuation record."));
// Decodes the packed format specification FORMAT and, when it is a valid
// output format compatible with the width of V, installs it as the print or
// write format of V according to WHICH.
//
// The format type is taken from bits 16-23 of FORMAT and the width from bits
// 8-15 (each shifted value is truncated to uint8_t).
// NOTE(review): the decimals field and the remaining parameter are not
// visible in this listing; presumably decimals are the low 8 bits and the
// last parameter is the int *n_warnings counter used below -- confirm.
//
// A FORMAT of 0 is ignored silently.  Any other invalid format is reported,
// but only for the first max_warnings occurrences counted in *n_warnings; at
// the limit one extra message announces that further warnings are
// suppressed.
1512 /* Translates the format spec from sysfile format to internal
1515 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1516 enum which_format which, struct variable *v,
1519 const int max_warnings = 8;
1520 uint8_t raw_type = format >> 16;
1521 uint8_t w = format >> 8;
// Valid only if the raw type maps to a known format, the format is usable
// for output, and it is compatible with the width of the variable.
1530 ok = (fmt_from_io (raw_type, &f.type)
1531 && fmt_check_output (&f)
1532 && fmt_check_width_compat (&f, var_get_width (v)));
1537 if (which == PRINT_FORMAT)
1538 var_set_print_format (v, &f);
1540 var_set_write_format (v, &f);
1542 else if (format == 0)
1544 /* Actually observed in the wild. No point in warning about it. */
// The counter is incremented for every invalid format, even once warnings
// are no longer printed.
1546 else if (++*n_warnings <= max_warnings)
1548 if (which == PRINT_FORMAT)
1549 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1551 var_get_name (v), var_get_width (v), format);
1553 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1555 var_get_name (v), var_get_width (v), format);
1557 if (*n_warnings == max_warnings)
1558 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
// Adds the document lines held in RECORD to DICT.
//
// The document text is walked in fixed steps of DOC_LINE_LENGTH bytes.  Each
// line is recoded from the dictionary encoding to UTF-8, stripped of
// trailing spaces, NUL-terminated in place, and appended with
// dict_add_document_line.
//
// NOTE(review): recode_substring_pool is called with a NULL pool, so the
// result is presumably heap-allocated and has to be released per line; no
// release is visible in this listing -- confirm.
1563 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1567 for (p = record->documents;
1568 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1569 p += DOC_LINE_LENGTH)
1571 struct substring line;
1573 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1574 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1575 ss_rtrim (&line, ss_cstr (" "));
1576 line.string[line.length] = '\0';
1578 dict_add_document_line (dict, line.string, false);
// Saves the version numbers stored in RECORD into INFO and cross-checks the
// representation codes in the record against the formats already recorded
// in R.
//
// Offsets read from record->data: 0, 4, 8 are the major, minor, and revision
// version numbers; 16 is the floating-point representation code; 24 is the
// integer representation code.
// Expected float codes: 1 for either IEEE double format, 2 for FLOAT_Z_LONG,
// 3 for either VAX format.  Expected integer codes: 1 for MSB first, 2 for
// LSB first.
// A floating-point mismatch is reported with sys_error; an integer mismatch
// only with sys_warn.
//
// NOTE(review): the fallback branches for other format values and the return
// statements are not visible in this listing -- confirm.
1584 /* Parses record type 7, subtype 3. */
1586 parse_machine_integer_info (struct sfm_reader *r,
1587 const struct sfm_extension_record *record,
1588 struct sfm_read_info *info)
1590 int float_representation, expected_float_format;
1591 int integer_representation, expected_integer_format;
1593 /* Save version info. */
1594 info->version_major = parse_int (r, record->data, 0);
1595 info->version_minor = parse_int (r, record->data, 4);
1596 info->version_revision = parse_int (r, record->data, 8);
1598 /* Check floating point format. */
1599 float_representation = parse_int (r, record->data, 16);
1600 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1601 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1602 expected_float_format = 1;
1603 else if (r->float_format == FLOAT_Z_LONG)
1604 expected_float_format = 2;
1605 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1606 expected_float_format = 3;
1609 if (float_representation != expected_float_format)
1611 sys_error (r, record->pos,
1612 _("Floating-point representation indicated by "
1613 "system file (%d) differs from expected (%d)."),
1614 float_representation, expected_float_format);
1618 /* Check integer format. */
1619 integer_representation = parse_int (r, record->data, 24);
1620 if (r->integer_format == INTEGER_MSB_FIRST)
1621 expected_integer_format = 1;
1622 else if (r->integer_format == INTEGER_LSB_FIRST)
1623 expected_integer_format = 2;
1626 if (integer_representation != expected_integer_format)
1627 sys_warn (r, record->pos,
1628 _("Integer format indicated by system file (%d) "
1629 "differs from expected (%d)."),
1630 integer_representation, expected_integer_format);
// Reads the three floating-point values in RECORD (at offsets 0, 8, and 16)
// that the file uses for SYSMIS, HIGHEST, and LOWEST, and warns about each
// one that differs from the value this program uses.
// Every warning prints the value both with %g and with %a (hexadecimal
// floating point), so the exact bit pattern is visible.
1635 /* Parses record type 7, subtype 4. */
1637 parse_machine_float_info (struct sfm_reader *r,
1638 const struct sfm_extension_record *record)
1640 double sysmis = parse_float (r, record->data, 0);
1641 double highest = parse_float (r, record->data, 8);
1642 double lowest = parse_float (r, record->data, 16);
1644 if (sysmis != SYSMIS)
1645 sys_warn (r, record->pos,
1646 _("File specifies unexpected value %g (%a) as %s, "
1647 "instead of %g (%a)."),
1648 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1650 if (highest != HIGHEST)
1651 sys_warn (r, record->pos,
1652 _("File specifies unexpected value %g (%a) as %s, "
1653 "instead of %g (%a)."),
1654 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1656 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1657 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1658 appears in a context (missing values) where SYSMIS cannot. */
1659 if (lowest != LOWEST && lowest != SYSMIS)
1660 sys_warn (r, record->pos,
1661 _("File specifies unexpected value %g (%a) as %s, "
1662 "instead of %g (%a) or %g (%a)."),
1663 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
// Opens RECORD as a text record, passes its whole text through
// fix_line_ends, stores the result in info->product_ext, and closes the text
// record again.
1666 /* Parses record type 7, subtype 10. */
1668 parse_extra_product_info (struct sfm_reader *r,
1669 const struct sfm_extension_record *record,
1670 struct sfm_read_info *info)
1672 struct text_record *text;
1674 text = open_text_record (r, record, true);
1675 info->product_ext = fix_line_ends (text_get_all (text));
1676 close_text_record (r, text);
// Parses the multiple response set definitions in RECORD and appends them to
// the r->mrsets array, which is grown through r->pool with
// *ALLOCATED_MRSETS tracking its capacity.
//
// Visible layout of one definition: a name terminated by =, then a type
// letter, then counted strings, then variable names separated by spaces and
// ended by a new-line.
//   C  multiple category set; must be followed by a space.
//   D  multiple dichotomy set with categories from variable labels.
//   E  multiple dichotomy set with categories from counted values; must be
//      followed by a space and a number token, where 11 selects the label
//      from a variable label and anything other than 1 or 11 is warned about.
// Dichotomy sets carry a counted string for the counted value; every set
// carries a counted string for the label.
//
// NOTE(review): this listing omits short lines (the outer loop header, break
// statements, the n_mrsets increment), so the exact control flow must be
// confirmed against the full file.
1679 /* Parses record type 7, subtype 7 or 19. */
1681 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1682 size_t *allocated_mrsets)
1684 struct text_record *text;
1686 text = open_text_record (r, record, false);
1689 struct sfm_mrset *mrset;
1690 size_t allocated_vars;
1693 /* Skip extra line feeds if present. */
1694 while (text_match (text, '\n'))
// Grow the array before taking the address of the next free slot.
1697 if (r->n_mrsets >= *allocated_mrsets)
1698 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1700 mrset = &r->mrsets[r->n_mrsets];
1701 memset(mrset, 0, sizeof *mrset);
// A missing name means there are no more definitions.
1703 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1704 if (mrset->name == NULL)
1707 if (text_match (text, 'C'))
1709 mrset->type = MRSET_MC;
1710 if (!text_match (text, ' '))
1712 sys_warn (r, record->pos,
1713 _("Missing space following `%c' at offset %zu "
1714 "in MRSETS record."), 'C', text_pos (text));
1718 else if (text_match (text, 'D'))
1720 mrset->type = MRSET_MD;
1721 mrset->cat_source = MRSET_VARLABELS;
1723 else if (text_match (text, 'E'))
1727 mrset->type = MRSET_MD;
1728 mrset->cat_source = MRSET_COUNTEDVALUES;
1729 if (!text_match (text, ' '))
1731 sys_warn (r, record->pos,
1732 _("Missing space following `%c' at offset %zu "
1733 "in MRSETS record."), 'E', text_pos (text));
// NOTE(review): text_get_token can return NULL (the name check above tests
// for it), yet number is passed to strcmp with no visible NULL check.  A
// record truncated after the E marker could therefore dereference NULL --
// confirm and add a guard.
1737 number = text_get_token (text, ss_cstr (" "), NULL);
1738 if (!strcmp (number, "11"))
1739 mrset->label_from_var_label = true;
1740 else if (strcmp (number, "1"))
1741 sys_warn (r, record->pos,
1742 _("Unexpected label source value following `E' "
1743 "at offset %zu in MRSETS record."),
1748 sys_warn (r, record->pos,
1749 _("Missing `C', `D', or `E' at offset %zu "
1750 "in MRSETS record."),
1755 if (mrset->type == MRSET_MD)
1757 mrset->counted = text_parse_counted_string (r, text);
1758 if (mrset->counted == NULL)
1762 mrset->label = text_parse_counted_string (r, text);
1763 if (mrset->label == NULL)
// Variable names are separated by spaces; the delimiter that ended each token
// tells whether the list is finished.
1771 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1774 if (delimiter != '\n')
1775 sys_warn (r, record->pos,
1776 _("Missing new-line parsing variable names "
1777 "at offset %zu in MRSETS record."),
1782 if (mrset->n_vars >= allocated_vars)
1783 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1785 sizeof *mrset->vars);
1786 mrset->vars[mrset->n_vars++] = var;
1788 while (delimiter != '\n');
1792 close_text_record (r, text);
// Converts each raw multiple response set in r->mrsets into a heap-allocated
// struct mrset and adds it to DICT.
//
// For each set the visible code: recodes the name, label, and variable names
// from r->encoding to UTF-8; looks each variable up in DICT; warns about
// duplicate variable names and about sets that mix string and numeric
// variables; tracks the minimum variable width; discards sets that end up
// with fewer than two variables; and, for multiple dichotomy sets, stores
// the counted value either as a number or as a space-padded string.
//
// NOTE(review): this listing omits short lines (continue statements, frees of
// the recoded names, the name validity check), so skip paths and ownership
// of the temporary strings must be confirmed against the full file.
1796 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1798 const struct sfm_mrset *s;
1800 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1802 struct stringi_set var_names;
1803 struct mrset *mrset;
1808 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1811 sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
1818 mrset = xzalloc (sizeof *mrset);
1820 mrset->type = s->type;
1821 mrset->cat_source = s->cat_source;
1822 mrset->label_from_var_label = s->label_from_var_label;
// An empty label in the file leaves mrset->label null.
1823 if (s->label[0] != '\0')
1824 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
// var_names is used to detect a variable that is listed twice in one set.
1826 stringi_set_init (&var_names);
// Room for every listed variable; n_vars counts only those actually accepted.
1827 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1829 for (i = 0; i < s->n_vars; i++)
1831 struct variable *var;
1834 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1836 var = dict_lookup_var (dict, var_name);
1842 if (!stringi_set_insert (&var_names, var_name))
1845 _("MRSET %s contains duplicate variable name %s."),
1846 mrset->name, var_name);
// When the set asks for its label to come from a variable label and has none
// of its own, the label of a member variable is copied.
1852 if (mrset->label == NULL && mrset->label_from_var_label
1853 && var_has_label (var))
1854 mrset->label = xstrdup (var_get_label (var));
1857 && var_get_type (var) != var_get_type (mrset->vars[0]))
1860 _("MRSET %s contains both string and "
1861 "numeric variables."), mrset->name);
1864 width = MIN (width, var_get_width (var));
1866 mrset->vars[mrset->n_vars++] = var;
1869 if (mrset->n_vars < 2)
1871 if (mrset->n_vars == 0)
1872 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1874 sys_warn (r, -1, _("MRSET %s has only one variable."),
1876 mrset_destroy (mrset);
1877 stringi_set_destroy (&var_names);
1881 if (mrset->type == MRSET_MD)
// The counted value uses the minimum member width tracked above.
1883 mrset->width = width;
1884 value_init (&mrset->counted, width);
1886 mrset->counted.f = c_strtod (s->counted, NULL);
1888 value_copy_str_rpad (&mrset->counted, width,
1889 (const uint8_t *) s->counted, ' ');
1892 dict_add_mrset (dict, mrset);
1893 stringi_set_destroy (&var_names);
// Applies the per-variable display parameters in RECORD to the variables of
// DICT.
//
// The record holds either three integers per variable (measure, width,
// alignment) or two (measure, alignment); which one is decided by comparing
// record->count with the number of variables.  Any other count is reported
// and, presumably, the record is then ignored (the return is not visible).
// Measure must be 1...3 and alignment 0...2; otherwise a warning is issued
// and default parameters are substituted, as the message text states.
//
// NOTE(review): this listing omits short lines (offset increments, the guard
// on the width read, the use of the warned flag, the third measure and
// alignment constants), so confirm details against the full file.
1897 /* Read record type 7, subtype 11, which specifies how variables
1898 should be displayed in GUI environments. */
1900 parse_display_parameters (struct sfm_reader *r,
1901 const struct sfm_extension_record *record,
1902 struct dictionary *dict)
1904 bool includes_width;
1905 bool warned = false;
1910 n_vars = dict_get_var_cnt (dict);
1911 if (record->count == 3 * n_vars)
1912 includes_width = true;
1913 else if (record->count == 2 * n_vars)
1914 includes_width = false;
1917 sys_warn (r, record->pos,
1918 _("Extension 11 has bad count %zu (for %zu variables)."),
1919 record->count, n_vars);
1924 for (i = 0; i < n_vars; ++i)
1926 struct variable *v = dict_get_var (dict, i);
1927 int measure, width, align;
1929 measure = parse_int (r, record->data, ofs);
1934 width = parse_int (r, record->data, ofs);
1940 align = parse_int (r, record->data, ofs);
1943 /* SPSS sometimes seems to set variables' measure to zero. */
1947 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1950 sys_warn (r, record->pos,
1951 _("Invalid variable display parameters for variable "
1952 "%zu (%s). Default parameters substituted."),
1953 i, var_get_name (v));
// Map the file codes to the internal enumerations.
1958 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1959 : measure == 2 ? MEASURE_ORDINAL
1961 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1962 : align == 1 ? ALIGN_RIGHT
1965 /* Older versions (SPSS 9.0) sometimes set the display
1966 width to zero. This causes confusion in the GUI, so
1967 only set the width if it is nonzero. */
1969 var_set_display_width (v, width);
// Renames VAR in DICT to NEW_NAME while keeping all of its short names.
//
// The short names are copied to a temporary array before the rename and set
// again afterward; each copy is freed once it has been restored.  A short
// name that is NULL stays NULL.
//
// NOTE(review): the release of the short_names array itself is not visible in
// this listing -- confirm.
1974 rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
1975 const char *new_name)
1977 size_t n_short_names;
1981 /* Renaming a variable may clear its short names, but we
1982 want to retain them, so we save them and re-set them
1984 n_short_names = var_get_short_name_cnt (var);
1985 short_names = xnmalloc (n_short_names, sizeof *short_names);
1986 for (i = 0; i < n_short_names; i++)
1988 const char *s = var_get_short_name (var, i);
1989 short_names[i] = s != NULL ? xstrdup (s) : NULL;
1992 /* Set long name. */
1993 dict_rename_var (dict, var, new_name);
1995 /* Restore short names. */
1996 for (i = 0; i < n_short_names; i++)
1998 var_set_short_name (var, i, short_names[i]);
1999 free (short_names[i]);
// Two cases are visible.  When there are no long variable names (the test
// itself is not visible, presumably RECORD is null), every variable is
// renamed to the lowercase form of its current name.  Otherwise RECORD is
// read as a list of variable / long-name pairs, and each variable is renamed
// to its long name unless that name is invalid or already taken.
// Short names are preserved across each rename.
//
// NOTE(review): this listing omits short lines (continue statements, the
// release of new_name, the early return), so confirm against the full file.
2004 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2005 to each short name. Modifies variable names in DICT accordingly. */
2007 parse_long_var_name_map (struct sfm_reader *r,
2008 const struct sfm_extension_record *record,
2009 struct dictionary *dict)
2011 struct text_record *text;
2012 struct variable *var;
2017 /* There are no long variable names. Use the short variable names,
2018 converted to lowercase, as the long variable names. */
2021 for (i = 0; i < dict_get_var_cnt (dict); i++)
2023 struct variable *var = dict_get_var (dict, i);
2026 new_name = utf8_to_lower (var_get_name (var));
2027 rename_var_and_save_short_names (dict, var, new_name);
2034 /* Rename each of the variables, one by one. (In a correctly constructed
2035 system file, this cannot create any intermediate duplicate variable names,
2036 because all of the new variable names are longer than any of the old
2037 variable names and thus there cannot be any overlaps.) */
2038 text = open_text_record (r, record, true);
2039 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2041 /* Validate long name. */
2042 if (!dict_id_is_valid (dict, long_name, false))
2044 sys_warn (r, record->pos,
2045 _("Long variable mapping from %s to invalid "
2046 "variable name `%s'."),
2047 var_get_name (var), long_name);
2051 /* Identify any duplicates. */
// A long name that equals the first short name of the variable, ignoring
// case, is not a duplicate; otherwise the name must not exist in DICT yet.
2052 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
2053 && dict_lookup_var (dict, long_name) != NULL)
2055 sys_warn (r, record->pos,
2056 _("Duplicate long variable name `%s'."), long_name);
2060 rename_var_and_save_short_names (dict, var, long_name);
2062 close_text_record (r, text);
// For each variable / length pair in RECORD the visible code: parses the
// length and checks it against 1...MAX_STRING; computes how many segments a
// string of that length needs; checks that the segments fit in the
// dictionary and that each segment variable has the expected width; copies
// the short name of each segment onto the first variable; deletes the extra
// segment variables; and widens the first variable to the full length.
// After all pairs are handled the dictionary values are compacted.
//
// NOTE(review): this listing omits short lines (continue and return
// statements), so confirm which failures skip a pair and which abort.
2065 /* Reads record type 7, subtype 14, which gives the real length
2066 of each very long string. Rearranges DICT accordingly. */
2068 parse_long_string_map (struct sfm_reader *r,
2069 const struct sfm_extension_record *record,
2070 struct dictionary *dict)
2072 struct text_record *text;
2073 struct variable *var;
2076 text = open_text_record (r, record, true);
2077 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2079 size_t idx = var_get_dict_index (var);
// NOTE(review): strtol is called without an end pointer, so trailing junk in
// length_s is accepted silently; only the range check below rejects bad
// input.  Confirm whether stricter parsing is wanted.
2085 length = strtol (length_s, NULL, 10);
2086 if (length < 1 || length > MAX_STRING)
2088 sys_warn (r, record->pos,
2089 _("%s listed as string of invalid length %s "
2090 "in very long string record."),
2091 var_get_name (var), length_s);
2095 /* Check segments. */
2096 segment_cnt = sfm_width_to_segments (length);
2097 if (segment_cnt == 1)
2099 sys_warn (r, record->pos,
2100 _("%s listed in very long string record with width %s, "
2101 "which requires only one segment."),
2102 var_get_name (var), length_s);
// The segments are the variables at dictionary indexes idx ... idx +
// segment_cnt - 1, so they must all exist.
2105 if (idx + segment_cnt > dict_get_var_cnt (dict))
2107 sys_error (r, record->pos,
2108 _("Very long string %s overflows dictionary."),
2109 var_get_name (var));
2113 /* Get the short names from the segments and check their
2115 for (i = 0; i < segment_cnt; i++)
2117 struct variable *seg = dict_get_var (dict, idx + i);
2118 int alloc_width = sfm_segment_alloc_width (length, i);
2119 int width = var_get_width (seg);
2122 var_set_short_name (var, i, var_get_short_name (seg, 0));
// Widths are compared after rounding both up to a multiple of 8.
2123 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2125 sys_error (r, record->pos,
2126 _("Very long string with width %ld has segment %d "
2127 "of width %d (expected %d)."),
2128 length, i, width, alloc_width);
// Drop every segment after the first, then widen the first to full length.
2132 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2133 var_set_width (var, length);
2135 close_text_record (r, text);
2136 dict_compact_values (dict);
// Applies one value label record RECORD to the variables it names.
//
// Visible steps: recode every label to UTF-8; resolve every variable index
// through lookup_var_by_index; require all variables to have the same type;
// report an error for long string variables (the width test itself is not
// visible); then add every label to every variable, warning about
// duplicates.  The temporary arrays and strings are returned to r->pool at
// the end.
//
// NOTE(review): this listing omits short lines (return statements, the width
// condition, the numeric / string branch), so confirm against the full file.
2142 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
2143 const struct sfm_var_record *var_recs, size_t n_var_recs,
2144 const struct sfm_value_label_record *record)
2146 struct variable **vars;
2150 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2151 for (i = 0; i < record->n_labels; i++)
2152 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2153 record->labels[i].label, -1,
2156 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
2157 for (i = 0; i < record->n_vars; i++)
2159 vars[i] = lookup_var_by_index (r, record->pos,
2160 var_recs, n_var_recs, record->vars[i]);
2161 if (vars[i] == NULL)
// Every later variable is compared with the first one.
2165 for (i = 1; i < record->n_vars; i++)
2166 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2168 sys_error (r, record->pos,
2169 _("Variables associated with value label are not all of "
2170 "identical type. Variable %s is %s, but variable "
2172 var_get_name (vars[0]),
2173 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2174 var_get_name (vars[i]),
2175 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2179 for (i = 0; i < record->n_vars; i++)
2181 struct variable *var = vars[i];
2185 width = var_get_width (var);
2188 sys_error (r, record->pos,
2189 _("Value labels may not be added to long string "
2190 "variables (e.g. %s) using records types 3 and 4."),
2191 var_get_name (var));
2195 for (j = 0; j < record->n_labels; j++)
2197 struct sfm_value_label *label = &record->labels[j];
2200 value_init (&value, width);
// Numeric values are decoded with parse_float; string values are copied as
// width raw bytes.
2202 value.f = parse_float (r, label->value, 0);
2204 memcpy (value_str_rw (&value, width), label->value, width);
2206 if (!var_add_value_label (var, &value, utf8_labels[j]))
2208 if (var_is_numeric (var))
2209 sys_warn (r, record->pos,
2210 _("Duplicate value label for %g on %s."),
2211 value.f, var_get_name (var));
2213 sys_warn (r, record->pos,
2214 _("Duplicate value label for `%.*s' on %s."),
2215 width, value_str (&value, width),
2216 var_get_name (var));
2219 value_destroy (&value, width);
2223 pool_free (r->pool, vars);
2224 for (i = 0; i < record->n_labels; i++)
2225 pool_free (r->pool, utf8_labels[i]);
2226 pool_free (r->pool, utf8_labels);
// Maps the 1-based variable record index IDX to the variable created for that
// record.
//
// An error is reported at file position OFFSET when IDX is outside
// 1...N_VAR_RECS or when the selected record has no variable attached, which
// the message describes as a long string continuation record.
//
// NOTE(review): the IDX parameter declaration and the return statements are
// not visible in this listing; parse_value_labels treats a NULL result as
// failure, so presumably NULL is returned after each error and rec->var
// otherwise -- confirm.
2231 static struct variable *
2232 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2233 const struct sfm_var_record *var_recs, size_t n_var_recs,
2236 const struct sfm_var_record *rec;
2238 if (idx < 1 || idx > n_var_recs)
2240 sys_error (r, offset,
2241 _("Variable index %d not in valid range 1...%zu."),
// Convert the 1-based file index to a 0-based array index.
2246 rec = &var_recs[idx - 1];
2247 if (rec->var == NULL)
2249 sys_error (r, offset,
2250 _("Variable index %d refers to long string continuation."),
// Visible grammar: each attribute is a key terminated by an opening
// parenthesis, followed by one value per line; a closing parenthesis after a
// value ends the attribute.  Attributes repeat until a slash is matched.
// A value enclosed in single quotes has the quotes stripped; an unquoted
// value is reported and then added unchanged.
//
// NOTE(review): this listing omits short lines (the outer do, the NULL checks
// on key and value, the test on ATTRS), so the exact control flow must be
// confirmed against the full file.
2258 /* Parses a set of custom attributes from TEXT into ATTRS.
2259 ATTRS may be a null pointer, in which case the attributes are
2260 read but discarded. */
2262 parse_attributes (struct sfm_reader *r, struct text_record *text,
2263 struct attrset *attrs)
2267 struct attribute *attr;
2271 /* Parse the key. */
2272 key = text_get_token (text, ss_cstr ("("), NULL);
2276 attr = attribute_create (key);
// index starts at 1 and is used in the warning messages to identify the
// value.
2277 for (index = 1; ; index++)
2279 /* Parse the value. */
2283 value = text_get_token (text, ss_cstr ("\n"), NULL);
2286 text_warn (r, text, _("Error parsing attribute value %s[%d]."),
2291 length = strlen (value);
2292 if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
// Strip the closing quote in place and skip the opening quote.
2294 value[length - 1] = '\0';
2295 attribute_add_value (attr, value + 1);
2300 _("Attribute value %s[%d] is not quoted: %s."),
2302 attribute_add_value (attr, value);
2305 /* Was this the last value for this attribute? */
2306 if (text_match (text, ')'))
// Presumably the attribute is added when ATTRS is non-null and destroyed
// otherwise, matching the header comment; the test is not visible.
2310 attrset_add (attrs, attr);
2312 attribute_destroy (attr);
2314 while (!text_match (text, '/'));
// Opens RECORD as a text record, parses its attributes directly into the
// attribute set of DICT, and closes the text record.
2317 /* Reads record type 7, subtype 17, which lists custom
2318 attributes on the data file. */
2320 parse_data_file_attributes (struct sfm_reader *r,
2321 const struct sfm_extension_record *record,
2322 struct dictionary *dict)
2324 struct text_record *text = open_text_record (r, record, true);
2325 parse_attributes (r, text, dict_get_attributes (dict));
2326 close_text_record (r, text);
// The record text is a sequence of entries, each a variable name terminated
// by a colon and followed by a set of attributes.  When
// text_read_variable_name leaves VAR null, a null attribute set is passed,
// so parse_attributes is still called to consume that entry but is given
// nowhere to store it.
2329 /* Parses record type 7, subtype 18, which lists custom
2330 attributes on individual variables. */
2332 parse_variable_attributes (struct sfm_reader *r,
2333 const struct sfm_extension_record *record,
2334 struct dictionary *dict)
2336 struct text_record *text;
2337 struct variable *var;
2339 text = open_text_record (r, record, true);
2340 while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
2341 parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
2342 close_text_record (r, text);
// Sets the role of each variable in DICT from its "$@Role" attribute.
//
// The first value of the attribute is converted to an integer and mapped to a
// role.  Only the first variable with an invalid role is named in a warning;
// a final warning reports how many others there were.
//
// NOTE(review): this listing omits most of the mapping (only ROLE_PARTITION
// is visible) and the test for a missing attribute, so confirm the value to
// role table against the full file.
2346 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2348 size_t n_warnings = 0;
2351 for (i = 0; i < dict_get_var_cnt (dict); i++)
2353 struct variable *var = dict_get_var (dict, i);
2354 struct attrset *attrs = var_get_attributes (var);
2355 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
// NOTE(review): atoi yields 0 for text that is not a number, which cannot be
// told apart from a real 0; confirm that this is acceptable for the role
// that 0 maps to.
2358 int value = atoi (attribute_get_value (attr, 0));
2380 role = ROLE_PARTITION;
// The post-increment makes the test true only for the first invalid role.
2389 if (n_warnings++ == 0)
2390 sys_warn (r, -1, _("Invalid role for variable %s."),
2391 var_get_name (var));
2394 var_set_role (var, role);
2399 sys_warn (r, -1, _("%zu other variables had invalid roles."),
// Checks that LENGTH bytes starting at offset OFS lie inside the data of
// RECORD, whose total size is size * count bytes.  If they do not, an error
// saying that the record ends unexpectedly is reported at the end position
// of the record.
//
// The separate LENGTH test presumably rejects huge lengths before the sum
// OFS + LENGTH can wrap around.
// NOTE(review): because that test uses >=, a LENGTH exactly equal to the
// record size is rejected even at OFS 0, where it would fit -- confirm that
// this is intended.
// NOTE(review): the return statements are not visible in this listing;
// callers use the result as a boolean that is true when the range fits.
2404 check_overflow (struct sfm_reader *r,
2405 const struct sfm_extension_record *record,
2406 size_t ofs, size_t length)
2408 size_t end = record->size * record->count;
2409 if (length >= end || ofs + length > end)
2411 sys_error (r, record->pos + end,
2412 _("Extension record subtype %d ends unexpectedly."),
// Parses the value labels for long string variables stored in RECORD and adds
// them to the matching variables in DICT.
//
// Visible layout, repeated until the end of the record: a 4-byte variable
// name length, the name, a 4-byte width, a 4-byte label count; then per
// label a 4-byte value length, the value, a 4-byte label length, and the
// label text.  Every field is bounds checked with check_overflow before it
// is read.
// Entries for unknown variables, numeric variables, or a width that differs
// from the variable width are warned about; their labels are still walked
// so that the offset stays in step.
//
// NOTE(review): this listing omits short lines (offset increments, the null
// checks, assignments that clear var, return statements), so confirm the
// exact flow against the full file.
// NOTE(review): the lengths come from parse_int; the declared types of the
// length variables are only partly visible, so confirm that a negative
// length read from the file cannot reach the offset arithmetic.
2420 parse_long_string_value_labels (struct sfm_reader *r,
2421 const struct sfm_extension_record *record,
2422 struct dictionary *dict)
2424 const char *dict_encoding = dict_get_encoding (dict);
2425 size_t end = record->size * record->count;
2432 struct variable *var;
2437 /* Parse variable name length. */
2438 if (!check_overflow (r, record, ofs, 4))
2440 var_name_len = parse_int (r, record->data, ofs);
2443 /* Parse variable name, width, and number of labels. */
// The name is followed by two 4-byte integers, hence the extra 8 bytes.
2444 if (!check_overflow (r, record, ofs, var_name_len + 8))
2446 var_name = recode_string_pool ("UTF-8", dict_encoding,
2447 (const char *) record->data + ofs,
2448 var_name_len, r->pool);
2449 width = parse_int (r, record->data, ofs + var_name_len);
2450 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2451 ofs += var_name_len + 8;
2453 /* Look up 'var' and validate. */
2454 var = dict_lookup_var (dict, var_name);
2456 sys_warn (r, record->pos + ofs,
2457 _("Ignoring long string value label record for "
2458 "unknown variable %s."), var_name);
2459 else if (var_is_numeric (var))
2461 sys_warn (r, record->pos + ofs,
2462 _("Ignoring long string value label record for "
2463 "numeric variable %s."), var_name);
2466 else if (width != var_get_width (var))
2468 sys_warn (r, record->pos + ofs,
2469 _("Ignoring long string value label record for variable "
2470 "%s because the record's width (%d) does not match the "
2471 "variable's width (%d)."),
2472 var_name, width, var_get_width (var));
2477 value_init_pool (r->pool, &value, width);
2478 for (i = 0; i < n_labels; i++)
2480 size_t value_length, label_length;
// Labels of a rejected variable are parsed but not stored.
2481 bool skip = var == NULL;
2483 /* Parse value length. */
2484 if (!check_overflow (r, record, ofs, 4))
2486 value_length = parse_int (r, record->data, ofs);
2490 if (!check_overflow (r, record, ofs, value_length))
// The value is copied only when its length equals the variable width.
2494 if (value_length == width)
2495 memcpy (value_str_rw (&value, width),
2496 (const uint8_t *) record->data + ofs, width);
2499 sys_warn (r, record->pos + ofs,
2500 _("Ignoring long string value label %zu for "
2501 "variable %s, with width %d, that has bad value "
2503 i, var_get_name (var), width, value_length);
2507 ofs += value_length;
2509 /* Parse label length. */
2510 if (!check_overflow (r, record, ofs, 4))
2512 label_length = parse_int (r, record->data, ofs);
2516 if (!check_overflow (r, record, ofs, label_length))
2522 label = recode_string_pool ("UTF-8", dict_encoding,
2523 (const char *) record->data + ofs,
2524 label_length, r->pool);
2525 if (!var_add_value_label (var, &value, label))
2526 sys_warn (r, record->pos + ofs,
2527 _("Duplicate value label for `%.*s' on %s."),
2528 width, value_str (&value, width),
2529 var_get_name (var));
// The recoded label is only needed for the call above.
2530 pool_free (r->pool, label);
2532 ofs += label_length;
// Parses the missing values for long string variables stored in RECORD and
// sets them on the matching variables in DICT.
//
// Visible layout, repeated until the end of the record: a 4-byte variable
// name length, the name, a one-byte count of missing values; then per value
// a 4-byte value length followed by the value bytes.  Every field is bounds
// checked with check_overflow before it is read.
// Entries for unknown or numeric variables are warned about.
//
// NOTE(review): this listing omits short lines (offset increments, the null
// checks, assignments that clear var, return statements), so confirm the
// exact flow against the full file.
2540 parse_long_string_missing_values (struct sfm_reader *r,
2541 const struct sfm_extension_record *record,
2542 struct dictionary *dict)
2544 const char *dict_encoding = dict_get_encoding (dict);
2545 size_t end = record->size * record->count;
2550 struct missing_values mv;
2552 struct variable *var;
2553 int n_missing_values;
2557 /* Parse variable name length. */
2558 if (!check_overflow (r, record, ofs, 4))
2560 var_name_len = parse_int (r, record->data, ofs);
2563 /* Parse variable name. */
// The extra byte covers the one-byte count that follows the name.
2564 if (!check_overflow (r, record, ofs, var_name_len + 1))
2566 var_name = recode_string_pool ("UTF-8", dict_encoding,
2567 (const char *) record->data + ofs,
2568 var_name_len, r->pool);
2569 ofs += var_name_len;
2571 /* Parse number of missing values. */
2572 n_missing_values = ((const uint8_t *) record->data)[ofs];
// NOTE(review): only a warning is visible for a count outside 1...3, and the
// loop below still iterates n_missing_values times.  Confirm how excess
// values are handled.
2573 if (n_missing_values < 1 || n_missing_values > 3)
2574 sys_warn (r, record->pos + ofs,
2575 _("Long string missing values record says variable %s "
2576 "has %d missing values, but only 1 to 3 missing values "
2578 var_name, n_missing_values);
2581 /* Look up 'var' and validate. */
2582 var = dict_lookup_var (dict, var_name);
2584 sys_warn (r, record->pos + ofs,
2585 _("Ignoring long string missing value record for "
2586 "unknown variable %s."), var_name);
2587 else if (var_is_numeric (var))
2589 sys_warn (r, record->pos + ofs,
2590 _("Ignoring long string missing value record for "
2591 "numeric variable %s."), var_name);
// Width 8 is a placeholder used when the variable is unknown.
2596 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2597 for (i = 0; i < n_missing_values; i++)
2599 size_t value_length;
2601 /* Parse value length. */
2602 if (!check_overflow (r, record, ofs, 4))
2604 value_length = parse_int (r, record->data, ofs);
2608 if (!check_overflow (r, record, ofs, value_length))
2612 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2614 sys_warn (r, record->pos + ofs,
2615 _("Ignoring long string missing value %zu for variable "
2616 "%s, with width %d, that has bad value width %zu."),
2617 i, var_get_name (var), var_get_width (var),
2619 ofs += value_length;
// Presumably guarded by a test that var is non-null; the guard is not visible
// in this listing -- confirm.
2622 var_set_missing_values (var, &mv);
2630 static void partial_record (struct sfm_reader *);
2632 static void read_error (struct casereader *, const struct sfm_reader *);
2634 static bool read_case_number (struct sfm_reader *, double *);
2635 static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
2636 static int read_opcode (struct sfm_reader *);
2637 static bool read_compressed_number (struct sfm_reader *, double *);
2638 static int read_compressed_string (struct sfm_reader *, uint8_t *);
2639 static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
2640 static bool skip_whole_strings (struct sfm_reader *, size_t);
2642 /* Reads and returns one case from READER's file. Returns a null
2643 pointer if not successful. */
/* R_ is the struct sfm_reader for the file.  A new case is created from
   r->proto and filled in one r->sfm_vars[] entry at a time.  An entry whose
   var_width is 0 is read as a number; any other entry is read as a string
   segment of segment_width bytes, stored at the entry's offset within the
   value, after which the entry's padding (rounded down to a multiple of 8)
   is skipped.

   read_error() is called only when r->case_cnt is not -1 (presumably -1
   means that the number of cases is unknown -- confirm). */
2644 static struct ccase *
2645 sys_file_casereader_read (struct casereader *reader, void *r_)
2647 struct sfm_reader *r = r_;
2655 c = case_create (r->proto);
2657 for (i = 0; i < r->sfm_var_cnt; i++)
2659 struct sfm_var *sv = &r->sfm_vars[i];
2660 union value *v = case_data_rw_idx (c, sv->case_index);
2662 if (sv->var_width == 0)
2663 retval = read_case_number (r, &v->f);
2666 uint8_t *s = value_str_rw (v, sv->var_width);
2667 retval = read_case_string (r, s + sv->offset, sv->segment_width);
2670 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2672 sys_error (r, r->pos, _("File ends in partial string value."));
2684 if (r->case_cnt != -1)
2685 read_error (reader, r);
2690 /* Issues an error that R ends in a partial record. */
2692 partial_record (struct sfm_reader *r)
2694 sys_error (r, r->pos, _("File ends in partial case."));
2697 /* Issues an error that an unspecified error occurred SFM, and
2700 read_error (struct casereader *r, const struct sfm_reader *sfm)
2702 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2703 casereader_force_error (r);
2706 /* Reads a number from R and stores its value in *D.
2707 If R is compressed, reads a compressed number;
2708 otherwise, reads a number in the regular way.
2709 Returns true if successful, false if end of file is
2710 reached immediately. */
2712 read_case_number (struct sfm_reader *r, double *d)
2714 if (r->compression == SFM_COMP_NONE)
2717 if (!try_read_bytes (r, number, sizeof number))
2719 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2723 return read_compressed_number (r, d);
2726 /* Reads LENGTH string bytes from R into S. Always reads a multiple of 8
2727 bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
2728 discarded without being written to S. Reads compressed strings if R is
2729 compressed. Returns 1 if successful, 0 if end of file is reached
2730 immediately, or -1 for some kind of error. */
2732 read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
/* WHOLE is the part of LENGTH that is a multiple of 8 and can be read
   straight into S; PARTIAL is the remainder of 0 to 7 bytes. */
2734 size_t whole = ROUND_DOWN (length, 8);
2735 size_t partial = length % 8;
2739 int retval = read_whole_strings (r, s, whole);
/* The trailing PARTIAL bytes are read through a bounce buffer, so that the
   bytes beyond LENGTH never land in S; only PARTIAL bytes are copied out. */
2747 int retval = read_whole_strings (r, bounce, sizeof bounce);
2759 memcpy (s + whole, bounce, partial);
2765 /* Reads and returns the next compression opcode from R. */
/* Must only be called for a compressed file (asserted).  Opcodes are
   consumed one at a time from r->opcodes; once r->opcode_idx has reached the
   end of that array, the array is refilled with
   try_read_compressed_bytes(). */
2767 read_opcode (struct sfm_reader *r)
2769 assert (r->compression != SFM_COMP_NONE);
2773 if (r->opcode_idx >= sizeof r->opcodes)
2776 int retval = try_read_compressed_bytes (r, r->opcodes,
2782 opcode = r->opcodes[r->opcode_idx++];
2789 /* Reads a compressed number from R and stores its value in D.
2790 Returns true if successful, false if end of file is
2791 reached immediately. */
/* The opcode obtained from read_opcode() selects how the value is produced:
   by reading a literal floating-point value with read_compressed_float(), by
   converting a run of spaces (with a one-time corruption warning per reader,
   tracked in r->corruption_warning), or by decoding the opcode itself as
   OPCODE - r->bias. */
2793 read_compressed_number (struct sfm_reader *r, double *d)
2795 int opcode = read_opcode (r);
2803 return read_compressed_float (r, d);
/* NOTE(review): the string literal below is read as raw bytes in
   r->float_format; confirm that it supplies as many bytes as that format
   requires (read_compressed_string() fills 8 bytes with spaces for the
   corresponding case). */
2806 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2807 if (!r->corruption_warning)
2809 r->corruption_warning = true;
2810 sys_warn (r, r->pos,
2811 _("Possible compressed data corruption: "
2812 "compressed spaces appear in numeric field."));
2821 *d = opcode - r->bias;
2828 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
/* Returns 1 if a segment was stored.  For literal data, any result from
   read_compressed_bytes() other than 1 is reported to the caller as -1. */
2830 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2835 opcode = read_opcode (r);
2843 retval = read_compressed_bytes (r, dst, 8);
2844 return retval == 1 ? 1 : -1;
/* Compressed spaces: the whole segment is blank. */
2847 memset (dst, ' ', 8);
/* Otherwise the opcode is a compressed integer, which is not expected in a
   string.  It is still decoded as OPCODE - r->bias and stored in DST in the
   file's float format. */
2852 double value = opcode - r->bias;
2853 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2856 /* This has actually been seen "in the wild". The submitter of the
2857 file that showed that the contents decoded as spaces, but they
2858 were at the end of the field so it's possible that the null
2859 bytes just acted as null terminators. */
2861 else if (!r->corruption_warning)
2863 r->corruption_warning = true;
2864 sys_warn (r, r->pos,
2865 _("Possible compressed data corruption: "
2866 "string contains compressed integer (opcode %d)."),
2874 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2875 Reads compressed strings if R is compressed. Returns 1 if successful, 0 if
2876 end of file is reached immediately, or -1 for some kind of error. */
2878 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2880 assert (length % 8 == 0);
/* Uncompressed data can be read in a single call. */
2881 if (r->compression == SFM_COMP_NONE)
2882 return try_read_bytes (r, s, length);
/* Compressed data is decoded one 8-byte segment at a time. */
2887 for (ofs = 0; ofs < length; ofs += 8)
2889 int retval = read_compressed_string (r, s + ofs);
/* Skips LENGTH string bytes from R.
   LENGTH must be a multiple of 8.
   (LENGTH must also be less than 1024, the size of the scratch buffer, but
   that's only because the current caller never needs more than that many
   bytes.)
   Returns true only if all LENGTH bytes were skipped.  Returns false if end
   of file is reached immediately or if reading fails. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];
  assert (length < sizeof buffer);

  /* read_whole_strings() returns 1, 0, or -1.  Compare against 1 explicitly:
     returning the int directly would convert -1 (an error) to true. */
  return read_whole_strings (r, buffer, length) == 1;
}
2918 /* Helpers for reading records that contain structured text strings. */
2921 /* Maximum number of warnings to issue for a single text record. */
2923 #define MAX_TEXT_WARNINGS 5
2928 struct substring buffer; /* Record contents. */
2929 off_t start; /* Starting offset in file. */
2930 size_t pos; /* Current position in buffer. */
2931 int n_warnings; /* Number of warnings issued or suppressed. */
2932 bool recoded; /* Recoded into UTF-8? */
/* Creates and returns a text_record, allocated from R's pool, for parsing
   the contents of RECORD as text.

   The text is the record->size * record->count bytes at record->data.  If
   RECODE_TO_UTF8 is true, it is first passed through recode_substring_pool()
   with "UTF-8" and r->encoding, and the result is what gets parsed.  The
   record's file position is saved in text->start, which text_warn() adds to
   the current position when reporting a problem. */
2935 static struct text_record *
2936 open_text_record (struct sfm_reader *r,
2937 const struct sfm_extension_record *record,
2938 bool recode_to_utf8)
2940 struct text_record *text;
2941 struct substring raw;
2943 text = pool_alloc (r->pool, sizeof *text);
2944 raw = ss_buffer (record->data, record->size * record->count);
2945 text->start = record->pos;
2946 text->buffer = (recode_to_utf8
2947 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2950 text->n_warnings = 0;
2951 text->recoded = recode_to_utf8;
2956 /* Closes TEXT, frees its storage, and issues a final warning
2957 about suppressed warnings if necessary. */
/* text->n_warnings counts every warning requested through text_warn(), shown
   or not, so the number suppressed is the excess over MAX_TEXT_WARNINGS.
   The final warning is issued with offset -1. */
2959 close_text_record (struct sfm_reader *r, struct text_record *text)
2961 if (text->n_warnings > MAX_TEXT_WARNINGS)
2962 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2963 text->n_warnings - MAX_TEXT_WARNINGS);
2965 pool_free (r->pool, ss_data (text->buffer));
2968 /* Reads a variable=value pair from TEXT.
2969 Looks up the variable in DICT and stores it into *VAR.
2970 Stores a null-terminated value into *VALUE. */
2972 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
2973 struct text_record *text,
2974 struct variable **var, char **value)
/* The variable's short name is delimited by "=". */
2978 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
/* The value is delimited by a tab or a null byte. */
2981 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
/* Advance past the tab and null bytes that follow the value (presumably
   ss_span() measures the leading run of such bytes -- confirm). */
2985 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
2986 ss_buffer ("\t\0", 2));
/* Reads a token from TEXT, ending at one of DELIMITERS, and looks it up as a
   variable name in DICT, storing the result of the lookup in *VAR.  A name
   that refers to an unknown variable is reported through text_warn(). */
2994 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
2995 struct text_record *text, struct substring delimiters,
2996 struct variable **var)
3000 name = text_get_token (text, delimiters, NULL);
3004 *var = dict_lookup_var (dict, name);
3008 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
/* Reads a token from TEXT, ending at one of DELIMITERS, and looks it up in
   DICT with dict_lookup_var(), storing the result of the lookup in *VAR.  A
   name that refers to an unknown variable is reported through text_warn().
   Nothing is looked up if no token could be read. */
3015 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3016 struct text_record *text, struct substring delimiters,
3017 struct variable **var)
3019 char *short_name = text_get_token (text, delimiters, NULL);
3020 if (short_name == NULL)
3023 *var = dict_lookup_var (dict, short_name);
3025 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3030 /* Displays a warning for the current file position, limiting the
3031 number to MAX_TEXT_WARNINGS for TEXT. */
3033 text_warn (struct sfm_reader *r, struct text_record *text,
3034 const char *format, ...)
3036 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3040 va_start (args, format);
3041 sys_msg (r, text->start + text->pos, MW, format, args);
/* Extracts the next token from TEXT, as split off by ss_tokenize() using
   DELIMITERS and starting from text->pos (which ss_tokenize() updates), and
   returns a pointer to the token's first byte inside TEXT's buffer.  END is
   set to point just past the token's last byte.  A non-null DELIMITER
   requests extra output (presumably the delimiter byte found at END --
   confirm). */
3047 text_get_token (struct text_record *text, struct substring delimiters,
3050 struct substring token;
3053 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3056 end = &ss_data (token)[ss_length (token)];
3057 if (delimiter != NULL)
3060 return ss_data (token);
3063 /* Reads a integer value expressed in decimal, then a space, then a string that
3064 consists of exactly as many bytes as specified by the integer, then a space,
3065 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3066 buffer (so the caller should not free the string). */
3068 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3076 while (text->pos < text->buffer.length)
3078 int c = text->buffer.string[text->pos];
3079 if (c < '0' || c > '9')
3081 n = (n * 10) + (c - '0');
3084 if (text->pos >= text->buffer.length || start == text->pos)
3086 sys_warn (r, text->start,
3087 _("Expecting digit at offset %zu in MRSETS record."),
3092 if (!text_match (text, ' '))
3094 sys_warn (r, text->start,
3095 _("Expecting space at offset %zu in MRSETS record."),
3100 if (text->pos + n > text->buffer.length)
3102 sys_warn (r, text->start,
3103 _("%zu-byte string starting at offset %zu "
3104 "exceeds record length %zu."),
3105 n, text->pos, text->buffer.length);
3109 s = &text->buffer.string[text->pos];
3112 sys_warn (r, text->start,
3113 _("Expecting space at offset %zu following %zu-byte string."),
/* Tests whether the byte at TEXT's current position is C.

   NOTE(review): text->pos is not compared against text->buffer.length before
   the buffer is indexed, so the caller must guarantee that the position is
   inside the buffer (text_parse_counted_string() checks this before calling)
   -- confirm for every other caller. */
3123 text_match (struct text_record *text, char c)
3125 if (text->buffer.string[text->pos] == c)
3134 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3135 inside the TEXT's string. */
3137 text_pos (const struct text_record *text)
/* Returns a pointer to the start of TEXT's buffer (text->buffer.string),
   regardless of the current position within it. */
3143 text_get_all (const struct text_record *text)
3145 return text->buffer.string;
3150 /* Displays a corruption message. */
/* The message text is FORMAT, expanded with ARGS, prefixed by the name of
   R's file and, in one of the two prefix forms, by OFFSET printed in
   hexadecimal (presumably the form without an offset is used when OFFSET is
   negative, as in close_text_record()'s call with -1 -- confirm).  CLASS,
   which sys_warn() passes as MW and sys_error() as ME, determines the
   message's category and severity. */
3152 sys_msg (struct sfm_reader *r, off_t offset,
3153 int class, const char *format, va_list args)
3158 ds_init_empty (&text);
3160 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3161 fh_get_file_name (r->fh), (long long int) offset);
3163 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3164 ds_put_vformat (&text, format, args);
3166 m.category = msg_class_to_category (class);
3167 m.severity = msg_class_to_severity (class);
3173 m.text = ds_cstr (&text);
3178 /* Displays a warning for offset OFFSET in the file. */
3180 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3184 va_start (args, format);
3185 sys_msg (r, offset, MW, format, args);
3189 /* Displays an error for the current file position,
3190 marks it as in an error state,
3191 and aborts reading it using longjmp. */
/* NOTE(review): the position reported is the OFFSET argument, which need not
   be the current position (seek() passes 0).  The message itself is issued
   through sys_msg() with class ME.  No longjmp call is visible in this
   function, and read_bytes_zlib() tests open_zstream()'s result even though
   open_zstream() reports its failure through this function, so the longjmp
   wording above looks stale -- confirm. */
3193 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3197 va_start (args, format);
3198 sys_msg (r, offset, ME, format, args);
3204 /* Reads BYTE_CNT bytes into BUF.
3205 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3206 Returns -1 if an I/O error or a partial read occurs.
3207 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
an error. */
3210 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3211 void *buf, size_t byte_cnt)
3213 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3214 r->pos += bytes_read;
3215 if (bytes_read == byte_cnt)
3217 else if (ferror (r->file))
3219 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3222 else if (!eof_is_ok || bytes_read != 0)
3224 sys_error (r, r->pos, _("Unexpected end of file."));
/* Reads exactly BYTE_CNT bytes from R into BUF.
   Returns true if all of them were read.
   Returns false upon I/O error or if end-of-file is encountered; end-of-file
   is not acceptable here, so read_bytes_internal() is told to treat it as an
   error. */
static bool
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  int status = read_bytes_internal (r, false, buf, byte_cnt);

  return status == 1;
}
/* Attempts to read BYTE_CNT bytes from R into BUF, tolerating an end-of-file
   that occurs before any byte is read.
   Returns 1 if exactly BYTE_CNT bytes are successfully read.
   Returns 0 if an immediate end-of-file is encountered.
   Returns -1 if an I/O error or a partial read occurs. */
static int
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;

  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
3250 /* Reads a 32-bit signed integer from R and stores its value in host format in
3251 *X. Returns true if successful, otherwise false. */
3253 read_int (struct sfm_reader *r, int *x)
3256 if (read_bytes (r, integer, sizeof integer) != 1)
3258 *x = integer_get (r->integer_format, integer, sizeof integer);
3263 read_uint (struct sfm_reader *r, unsigned int *x)
3268 ok = read_int (r, &y);
3273 /* Reads a 64-bit signed integer from R and returns its value in
3276 read_int64 (struct sfm_reader *r, long long int *x)
3279 if (read_bytes (r, integer, sizeof integer) != 1)
3281 *x = integer_get (r->integer_format, integer, sizeof integer);
3285 /* Reads a 64-bit unsigned integer from R and stores its value in host
format in *X. */
3288 read_uint64 (struct sfm_reader *r, unsigned long long int *x)
3293 ok = read_int64 (r, &y);
3299 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3301 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3305 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3307 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3310 /* Reads exactly SIZE - 1 bytes into BUFFER
3311 and stores a null byte into BUFFER[SIZE - 1]. */
3313 read_string (struct sfm_reader *r, char *buffer, size_t size)
3318 ok = read_bytes (r, buffer, size - 1);
3320 buffer[size - 1] = '\0';
3324 /* Skips BYTES bytes forward in R. */
3326 skip_bytes (struct sfm_reader *r, size_t bytes)
3331 size_t chunk = MIN (sizeof buffer, bytes);
3332 if (!read_bytes (r, buffer, chunk))
3340 /* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
3341 been replaced by LFs.
3343 (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
3344 files that use CR-only line ends in the file label and extra product
3347 fix_line_ends (const char *s)
3351 d = dst = xmalloc (strlen (s) + 1);
3370 read_ztrailer (struct sfm_reader *r,
3371 long long int zheader_ofs,
3372 long long int ztrailer_len);
/* Memory allocation callback installed as r->zstream.zalloc by
   read_zheader(), which also sets the stream's opaque pointer to r->pool;
   POOL_ is therefore expected to be that pool.  Allocates ITEMS * SIZE bytes
   from the pool, unless SIZE is zero or xalloc_oversized() rejects the
   request, in which case nothing is allocated from the pool. */
3375 zalloc (voidpf pool_, uInt items, uInt size)
3377 struct pool *pool = pool_;
3379 return (!size || xalloc_oversized (items, size)
3381 : pool_malloc (pool, items * size));
3385 zfree (voidpf pool_, voidpf address)
3387 struct pool *pool = pool_;
3389 pool_free (pool, address);
/* Reads the ZLIB data header at R's current position and prepares R for
   inflating the compressed data.

   The header consists of three 64-bit integers: its own file offset, the
   offset of the ZLIB trailer, and the trailer's length.  The first must
   match the position at which the header was found, the trailer may not
   start before the position reached after reading the header, and the
   trailer length must be a positive multiple of 24.  The trailer itself is
   then checked by read_ztrailer().

   On first use (r->zin_buf null) the input and output buffers are allocated
   from R's pool.  The zlib stream is set up to allocate from the same pool
   through zalloc() and zfree(), and is opened with open_zstream(), whose
   result is returned. */
3393 read_zheader (struct sfm_reader *r)
3396 long long int zheader_ofs;
3397 long long int ztrailer_ofs;
3398 long long int ztrailer_len;
3400 if (!read_int64 (r, &zheader_ofs)
3401 || !read_int64 (r, &ztrailer_ofs)
3402 || !read_int64 (r, &ztrailer_len))
3405 if (zheader_ofs != pos)
3407 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3408 "(expected %#llx)."),
3409 zheader_ofs, (long long int) pos);
3413 if (ztrailer_ofs < r->pos)
3415 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3420 if (ztrailer_len < 24 || ztrailer_len % 24)
3422 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3426 r->ztrailer_ofs = ztrailer_ofs;
3427 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3430 if (r->zin_buf == NULL)
3432 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3433 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3434 r->zstream.next_in = NULL;
3435 r->zstream.avail_in = 0;
3438 r->zstream.zalloc = zalloc;
3439 r->zstream.zfree = zfree;
3440 r->zstream.opaque = r->pool;
3442 return open_zstream (r);
3446 seek (struct sfm_reader *r, off_t offset)
3448 if (fseeko (r->file, offset, SEEK_SET))
3449 sys_error (r, 0, _("%s: seek failed (%s)."),
3450 fh_get_file_name (r->fh), strerror (errno));
3454 /* Performs some additional consistency checks on the ZLIB compressed data
3457 read_ztrailer (struct sfm_reader *r,
3458 long long int zheader_ofs,
3459 long long int ztrailer_len)
3461 long long int expected_uncmp_ofs;
3462 long long int expected_cmp_ofs;
3465 unsigned int block_size;
3466 unsigned int n_blocks;
3470 if (fstat (fileno (r->file), &s))
3472 sys_error (ME, 0, _("%s: stat failed (%s)."),
3473 fh_get_file_name (r->fh), strerror (errno));
3477 if (!S_ISREG (s.st_mode))
3479 /* We can't seek to the trailer and then back to the data in this file,
3480 so skip doing extra checks. */
3484 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3485 sys_warn (r, r->pos,
3486 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3487 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3489 seek (r, r->ztrailer_ofs);
3491 /* Read fixed header from ZLIB data trailer. */
3492 if (!read_int64 (r, &bias))
3494 if (-bias != r->bias)
3496 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3497 "file header bias (%.2f)."),
3502 if (!read_int64 (r, &zero))
3505 sys_warn (r, r->pos,
3506 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3508 if (!read_uint (r, &block_size))
3510 if (block_size != ZBLOCK_SIZE)
3511 sys_warn (r, r->pos,
3512 _("ZLIB trailer specifies unexpected %u-byte block size."),
3515 if (!read_uint (r, &n_blocks))
3517 if (n_blocks != (ztrailer_len - 24) / 24)
3519 sys_error (r, r->pos,
3520 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3522 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3526 expected_uncmp_ofs = zheader_ofs;
3527 expected_cmp_ofs = zheader_ofs + 24;
3528 for (i = 0; i < n_blocks; i++)
3530 off_t desc_ofs = r->pos;
3531 unsigned long long int uncompressed_ofs;
3532 unsigned long long int compressed_ofs;
3533 unsigned int uncompressed_size;
3534 unsigned int compressed_size;
3536 if (!read_uint64 (r, &uncompressed_ofs)
3537 || !read_uint64 (r, &compressed_ofs)
3538 || !read_uint (r, &uncompressed_size)
3539 || !read_uint (r, &compressed_size))
3542 if (uncompressed_ofs != expected_uncmp_ofs)
3544 sys_error (r, desc_ofs,
3545 _("ZLIB block descriptor %u reported uncompressed data "
3546 "offset %#llx, when %#llx was expected."),
3547 i, uncompressed_ofs, expected_uncmp_ofs);
3551 if (compressed_ofs != expected_cmp_ofs)
3553 sys_error (r, desc_ofs,
3554 _("ZLIB block descriptor %u reported compressed data "
3555 "offset %#llx, when %#llx was expected."),
3556 i, compressed_ofs, expected_cmp_ofs);
3560 if (i < n_blocks - 1)
3562 if (uncompressed_size != block_size)
3563 sys_warn (r, desc_ofs,
3564 _("ZLIB block descriptor %u reported block size %#x, "
3565 "when %#x was expected."),
3566 i, uncompressed_size, block_size);
3570 if (uncompressed_size > block_size)
3571 sys_warn (r, desc_ofs,
3572 _("ZLIB block descriptor %u reported block size %#x, "
3573 "when at most %#x was expected."),
3574 i, uncompressed_size, block_size);
3577 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3578 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3579 This code checks for an expansion of more than 14.3% plus 11
3581 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3583 sys_error (r, desc_ofs,
3584 _("ZLIB block descriptor %u reports compressed size %u "
3585 "and uncompressed size %u."),
3586 i, compressed_size, uncompressed_size);
3590 expected_uncmp_ofs += uncompressed_size;
3591 expected_cmp_ofs += compressed_size;
3594 if (expected_cmp_ofs != r->ztrailer_ofs)
3596 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3597 "would be expected from block descriptors."),
3598 r->ztrailer_ofs, expected_cmp_ofs);
3602 seek (r, zheader_ofs + 24);
/* Starts a new zlib inflation stream in r->zstream, after marking R's
   buffer of inflated output as empty (r->zout_pos and r->zout_end both 0).
   A failure of inflateInit() is reported with sys_error(), including zlib's
   message. */
3607 open_zstream (struct sfm_reader *r)
3611 r->zout_pos = r->zout_end = 0;
3612 error = inflateInit (&r->zstream);
3615 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
/* Ends the zlib inflation stream in r->zstream with inflateEnd().  A failure
   is reported with sys_error() as an inconsistency at the end of the
   stream. */
3623 close_zstream (struct sfm_reader *r)
3627 error = inflateEnd (&r->zstream);
3630 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
/* Reads BYTE_CNT bytes of inflated (decompressed) data from R into BUF_.

   Data is served from r->zout_buf, between r->zout_pos and r->zout_end.
   When that runs out, more is produced by inflating compressed input from
   r->zin_buf, which is refilled from the file as needed but never from
   beyond r->ztrailer_ofs, where the ZLIB trailer begins. */
3638 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3640 uint8_t *buf = buf_;
3649 /* Use already inflated data if there is any. */
3650 if (r->zout_pos < r->zout_end)
3652 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3653 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3662 /* We need to inflate some more data.
3663 Get some more input data if we don't have any. */
3664 if (r->zstream.avail_in == 0)
/* Read at most one input buffer's worth, stopping short of the trailer. */
3666 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3671 int retval = try_read_bytes (r, r->zin_buf, n);
3674 r->zstream.avail_in = n;
3675 r->zstream.next_in = r->zin_buf;
3679 /* Inflate the (remaining) input data. */
3680 r->zstream.avail_out = ZOUT_BUF_SIZE;
3681 r->zstream.next_out = r->zout_buf;
3682 error = inflate (&r->zstream, Z_SYNC_FLUSH);
/* The amount of output produced is how far zlib advanced next_out. */
3684 r->zout_end = r->zstream.next_out - r->zout_buf;
3685 if (r->zout_end == 0)
/* No output at all: unless zlib reported the end of the stream, the data is
   inconsistent.  At the end of a stream, the stream is closed and a new one
   opened. */
3687 if (error != Z_STREAM_END)
3689 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3693 else if (!close_zstream (r) || !open_zstream (r))
3698 /* Process the output data and ignore 'error' for now. ZLIB will
3699 present it to us again on the next inflate() call. */
/* Reads BYTE_CNT bytes of data from compressed file R into BUF: directly
   from the file with read_bytes() for simple compression (SFM_COMP_SIMPLE),
   or through the inflater with read_bytes_zlib() otherwise.  In the latter
   case, running out of ZLIB compressed data is reported as an error. */
3705 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3707 if (r->compression == SFM_COMP_SIMPLE)
3708 return read_bytes (r, buf, byte_cnt);
3711 int retval = read_bytes_zlib (r, buf, byte_cnt);
3713 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3719 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3721 if (r->compression == SFM_COMP_SIMPLE)
3722 return try_read_bytes (r, buf, byte_cnt);
3724 return read_bytes_zlib (r, buf, byte_cnt);
3727 /* Reads a 64-bit floating-point number from R and returns its
3728 value in host format. */
3730 read_compressed_float (struct sfm_reader *r, double *d)
3734 if (!read_compressed_bytes (r, number, sizeof number))
3737 *d = float_get_double (r->float_format, number);
3741 static const struct casereader_class sys_file_casereader_class =
3743 sys_file_casereader_read,
3744 sys_file_casereader_destroy,