1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
28 #include <libpspp/alloc.h>
29 #include <libpspp/assertion.h>
30 #include <libpspp/message.h>
31 #include <libpspp/compiler.h>
32 #include <libpspp/magic.h>
33 #include <libpspp/misc.h>
34 #include <libpspp/str.h>
35 #include <libpspp/hash.h>
36 #include <libpspp/array.h>
38 #include "sys-file-reader.h"
39 #include "sfm-private.h"
41 #include "dictionary.h"
42 #include "file-handle-def.h"
43 #include "file-name.h"
45 #include "value-labels.h"
50 #define _(msgid) gettext (msgid)
52 /* System file reader. */
55 struct file_handle *fh; /* File handle. */
56 FILE *file; /* File stream. */
58 int reverse_endian; /* 1=file has endianness opposite us. */
59 int value_cnt; /* Number of `union values's per case. */
60 long case_cnt; /* Number of cases, -1 if unknown. */
61 int compressed; /* 1=compressed, 0=not compressed. */
62 double bias; /* Compression bias, usually 100.0. */
63 int weight_idx; /* 0-based index of weighting variable, or -1. */
64 bool ok; /* False after an I/O error or corrupt data. */
65 bool has_vls; /* True if the file has one or more Very Long Strings*/
71 /* File's special constants. */
76 /* Decompression buffer. */
77 flt64 *buf; /* Buffer data. */
78 flt64 *ptr; /* Current location in buffer. */
79 flt64 *end; /* End of buffer data. */
81 /* Compression instruction octet. */
82 unsigned char x[8]; /* Current instruction octet. */
83 unsigned char *y; /* Location in current instruction octet. */
86 /* A variable in a system file. */
89 int width; /* 0=numeric, otherwise string width. */
90 int fv; /* Index into case. */
95 /* Swap bytes *A and *B. */
97 bswap (char *a, char *b)
/* Reverses, in place, the order of the four bytes that make up
   the 32-bit integer *X_. */
static void
bswap_int32 (int32_t *x_)
{
  char *bytes = (char *) x_;
  int lo, hi;

  /* Exchange mirrored byte pairs, working from both ends toward
     the middle: (0,3) then (1,2). */
  for (lo = 0, hi = 3; lo < hi; lo++, hi--)
    bswap (bytes + lo, bytes + hi);
}
113 /* Reverse the byte order of 64-bit floating point *X. */
115 bswap_flt64 (flt64 *x_)
117 char *x = (char *) x_;
118 bswap (x + 0, x + 7);
119 bswap (x + 1, x + 6);
120 bswap (x + 2, x + 5);
121 bswap (x + 3, x + 4);
125 corrupt_msg (int class, const char *format,...)
126 PRINTF_FORMAT (2, 3);
128 /* Displays a corrupt sysfile error. */
130 corrupt_msg (int class, const char *format,...)
136 ds_init_cstr (&text, _("corrupt system file: "));
137 va_start (args, format);
138 ds_put_vformat (&text, format, args);
141 m.category = msg_class_to_category (class);
142 m.severity = msg_class_to_severity (class);
143 m.where.file_name = NULL;
144 m.where.line_number = 0;
145 m.text = ds_cstr (&text);
150 /* Closes a system file after we're done with it. */
152 sfm_close_reader (struct sfm_reader *r)
159 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
160 msg (ME, _("%s: Closing system file: %s."),
161 fh_get_file_name (r->fh), strerror (errno));
166 fh_close (r->fh, "system file", "rs");
173 /* Dictionary reader. */
175 static void buf_unread(struct sfm_reader *r, size_t byte_cnt);
177 static void *buf_read (struct sfm_reader *, void *buf, size_t byte_cnt,
180 static int read_header (struct sfm_reader *,
181 struct dictionary *, struct sfm_read_info *);
182 static int parse_format_spec (struct sfm_reader *, int32_t,
183 struct fmt_spec *, const struct variable *);
184 static int read_value_labels (struct sfm_reader *, struct dictionary *,
185 struct variable **var_by_idx);
186 static int read_variables (struct sfm_reader *,
187 struct dictionary *, struct variable ***var_by_idx);
188 static int read_machine_int32_info (struct sfm_reader *, int size, int count);
189 static int read_machine_flt64_info (struct sfm_reader *, int size, int count);
190 static int read_documents (struct sfm_reader *, struct dictionary *);
192 static int fread_ok (struct sfm_reader *, void *, size_t);
194 /* Displays the message X with corrupt_msg, then jumps to the error
202 /* Calls buf_read with the specified arguments, and jumps to
203 error if the read fails. */
204 #define assertive_buf_read(a,b,c,d) \
206 if (!buf_read (a,b,c,d)) \
218 pair_sn_compare(const void *_p1, const void *_p2, const void *aux UNUSED)
222 const struct name_pair *p1 = _p1;
223 const struct name_pair *p2 = _p2;
225 char buf1[SHORT_NAME_LEN + 1];
226 char buf2[SHORT_NAME_LEN + 1];
228 memset(buf1, 0, SHORT_NAME_LEN + 1);
229 memset(buf2, 0, SHORT_NAME_LEN + 1);
231 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
233 buf1[i] = p1->shortname[i];
234 if ( '\0' == buf1[i])
238 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
240 buf2[i] = p2->shortname[i];
241 if ( '\0' == buf2[i])
245 return strncmp(buf1, buf2, SHORT_NAME_LEN);
249 pair_sn_hash(const void *_p, const void *aux UNUSED)
252 const struct name_pair *p = _p;
253 char buf[SHORT_NAME_LEN + 1];
255 memset(buf, 0, SHORT_NAME_LEN + 1);
256 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
258 buf[i] = p->shortname[i];
263 return hsh_hash_bytes(buf, strlen(buf));
267 pair_sn_free(void *p, const void *aux UNUSED)
274 /* Opens the system file designated by file handle FH for
275 reading. Reads the system file's dictionary into *DICT.
276 If INFO is non-null, then it receives additional info about the
279 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
280 struct sfm_read_info *info)
282 struct sfm_reader *r = NULL;
283 struct variable **var_by_idx = NULL;
285 /* The data in record 7(14) */
286 char *subrec14data = 0;
288 /* A hash table of long variable names indexed by short name */
289 struct hsh_table *short_to_long = NULL;
291 *dict = dict_create ();
292 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
295 /* Create and initialize reader. */
296 r = xmalloc (sizeof *r);
298 r->file = fn_open (fh_get_file_name (fh), "rb");
300 r->reverse_endian = 0;
311 r->sysmis = -FLT64_MAX;
312 r->highest = FLT64_MAX;
313 r->lowest = second_lowest_flt64;
315 r->buf = r->ptr = r->end = NULL;
316 r->y = r->x + sizeof r->x;
318 /* Check that file open succeeded. */
321 msg (ME, _("An error occurred while opening \"%s\" for reading "
322 "as a system file: %s."),
323 fh_get_file_name (r->fh), strerror (errno));
327 /* Read header and variables. */
328 if (!read_header (r, *dict, info) || !read_variables (r, *dict, &var_by_idx))
332 /* Handle weighting. */
333 if (r->weight_idx != -1)
335 struct variable *weight_var;
337 if (r->weight_idx < 0 || r->weight_idx >= r->value_cnt)
338 lose ((ME, _("%s: Index of weighting variable (%d) is not between 0 "
339 "and number of elements per case (%d)."),
340 fh_get_file_name (r->fh), r->weight_idx, r->value_cnt));
343 weight_var = var_by_idx[r->weight_idx];
345 if (weight_var == NULL)
347 _("%s: Weighting variable may not be a continuation of "
348 "a long string variable."), fh_get_file_name (fh)));
349 else if (var_is_alpha (weight_var))
350 lose ((ME, _("%s: Weighting variable may not be a string variable."),
351 fh_get_file_name (fh)));
353 dict_set_weight (*dict, weight_var);
356 dict_set_weight (*dict, NULL);
358 /* Read records of types 3, 4, 6, and 7. */
363 assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
364 if (r->reverse_endian)
365 bswap_int32 (&rec_type);
371 if (!read_value_labels (r, *dict, var_by_idx))
376 lose ((ME, _("%s: Orphaned variable index record (type 4). Type 4 "
377 "records must always immediately follow type 3 "
379 fh_get_file_name (r->fh)));
382 if (!read_documents (r, *dict))
393 } ATTRIBUTE((packed))
399 assertive_buf_read (r, &data, sizeof data, 0);
400 if (r->reverse_endian)
402 bswap_int32 (&data.subtype);
403 bswap_int32 (&data.size);
404 bswap_int32 (&data.count);
406 bytes = data.size * data.count;
408 if (bytes < data.size || bytes < data.count)
409 lose ((ME, "%s: Record type %d subtype %d too large.",
410 fh_get_file_name (r->fh), rec_type, data.subtype));
412 switch (data.subtype)
415 if (!read_machine_int32_info (r, data.size, data.count))
420 if (!read_machine_flt64_info (r, data.size, data.count))
425 case 6: /* ?? Used by SPSS 8.0. */
429 case 11: /* Variable display parameters */
431 const int n_vars = data.count / 3 ;
433 if ( data.count % 3 || n_vars != dict_get_var_cnt(*dict) )
435 msg (MW, _("%s: Invalid subrecord length. "
436 "Record: 7; Subrecord: 11"),
437 fh_get_file_name (r->fh));
442 for ( i = 0 ; i < MIN(n_vars, dict_get_var_cnt(*dict)) ; ++i )
449 } ATTRIBUTE((packed))
454 assertive_buf_read (r, ¶ms, sizeof(params), 0);
456 if ( ! measure_is_valid(params.measure)
458 ! alignment_is_valid(params.align))
461 _("%s: Invalid variable display parameters. Default parameters substituted."),
462 fh_get_file_name(r->fh));
466 v = dict_get_var(*dict, i);
468 var_set_measure (v, params.measure);
469 var_set_display_width (v, params.width);
470 var_set_alignment (v, params.align);
475 case 13: /* SPSS 12.0 Long variable name map */
478 char *save_ptr = NULL;
482 subrec14data = xmalloc (bytes + 1);
483 if (!buf_read (r, subrec14data, bytes, 0))
487 subrec14data[bytes] = '\0';
489 short_to_long = hsh_create(4,
496 for (short_name = strtok_r (subrec14data, "=", &save_ptr), idx = 0;
498 short_name = strtok_r (NULL, "=", &save_ptr), idx++)
500 struct name_pair *pair ;
501 char *long_name = strtok_r (NULL, "\t", &save_ptr);
504 /* Validate long name. */
505 if (long_name == NULL)
507 msg (MW, _("%s: Trailing garbage in long variable "
509 fh_get_file_name (r->fh));
512 if (!var_is_valid_name (long_name, false))
514 msg (MW, _("%s: Long variable mapping to invalid "
515 "variable name `%s'."),
516 fh_get_file_name (r->fh), long_name);
520 /* Find variable using short name. */
521 v = dict_lookup_var (*dict, short_name);
524 msg (MW, _("%s: Long variable mapping for "
525 "nonexistent variable %s."),
526 fh_get_file_name (r->fh), short_name);
530 /* Identify any duplicates. */
531 if ( compare_var_names(short_name, long_name, 0) &&
532 NULL != dict_lookup_var (*dict, long_name))
533 lose ((ME, _("%s: Duplicate long variable name `%s' "
534 "within system file."),
535 fh_get_file_name (r->fh), long_name));
539 Renaming a variable may clear the short
540 name, but we want to retain it, so
541 re-set it explicitly. */
542 dict_rename_var (*dict, v, long_name);
543 var_set_short_name (v, short_name);
545 pair = xmalloc(sizeof *pair);
546 pair->shortname = short_name;
547 pair->longname = long_name;
548 hsh_insert(short_to_long, pair);
550 /* This messes up the processing of subtype 14 (below).
551 I'm not sure if it is needed anyway, so I'm removing it for
552 now. If it's needed, then it will need to be done after all the
553 records have been processed. --- JMD 27 April 2006
556 /* For compatibility, make sure dictionary
557 is in long variable name map order. In
558 the common case, this has no effect,
559 because the dictionary and the long
560 variable name map are already in the
562 dict_reorder_var (*dict, v, idx);
572 bool eq_seen = false;
576 char *buffer = xmalloc (bytes + 1);
577 if (!buf_read (r, buffer, bytes, 0))
582 buffer[bytes] = '\0';
586 /* Note: SPSS v13 terminates this record with 00,
587 whereas SPSS v14 terminates it with 00 09. We must
589 for(i = 0; i < bytes ; ++i)
592 static char name[SHORT_NAME_LEN + 1] = {0};
593 static char len_str[6] ={0};
602 length = strtol(len_str, 0, 10);
603 if ( length != LONG_MAX && length != LONG_MIN)
605 char *lookup_name = name;
612 struct name_pair pair;
615 pair.shortname = name;
616 p = hsh_find(short_to_long, &pair);
618 lookup_name = p->longname;
621 v = dict_lookup_var(*dict, lookup_name);
625 _("%s: No variable called %s but it is listed in length table."),
626 fh_get_file_name (r->fh), lookup_name);
633 if ( var_get_width (v) > EFFECTIVE_LONG_STRING_LENGTH )
634 l -= EFFECTIVE_LONG_STRING_LENGTH;
636 l -= var_get_width (v);
641 struct variable *v_next;
642 v_next = dict_get_var(*dict, idx + 1);
644 if ( var_get_width (v_next) > EFFECTIVE_LONG_STRING_LENGTH )
645 l -= EFFECTIVE_LONG_STRING_LENGTH;
647 l -= var_get_width (v_next);
649 dict_delete_var(*dict, v_next);
652 assert ( length > MAX_LONG_STRING );
654 var_set_width (v, length);
657 memset(name, 0, SHORT_NAME_LEN+1);
658 memset(len_str, 0, 6);
665 len_str[j] = buffer[i];
673 dict_compact_values(*dict);
678 msg (MW, _("%s: Unrecognized record type 7, subtype %d "
679 "encountered in system file."),
680 fh_get_file_name (r->fh), data.subtype);
686 void *x = buf_read (r, NULL, data.size * data.count, 0);
698 assertive_buf_read (r, &filler, sizeof filler, 0);
704 corrupt_msg(MW, _("%s: Unrecognized record type %d."),
705 fh_get_file_name (r->fh), rec_type);
710 /* Come here on successful completion. */
712 /* Create an index of dictionary variable widths for
713 sfm_read_case to use. We cannot use the `struct variables'
714 from the dictionary we created, because the caller owns the
715 dictionary and may destroy or modify its variables. */
719 r->var_cnt = dict_get_var_cnt (*dict);
720 r->vars = xnmalloc (r->var_cnt, sizeof *r->vars);
721 for (i = 0; i < r->var_cnt; i++)
723 struct variable *v = dict_get_var (*dict, i);
724 struct sfm_var *sv = &r->vars[i];
725 sv->width = var_get_width (v);
731 hsh_destroy(short_to_long);
736 /* Come here on unsuccessful completion. */
737 sfm_close_reader (r);
739 hsh_destroy(short_to_long);
743 dict_destroy (*dict);
749 /* Read record type 7, subtype 3. */
751 read_machine_int32_info (struct sfm_reader *r, int size, int count)
758 if (size != sizeof (int32_t) || count != 8)
759 lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
760 "subtype 3. Expected size %d, count 8."),
761 fh_get_file_name (r->fh), size, count, sizeof (int32_t)));
763 assertive_buf_read (r, data, sizeof data, 0);
764 if (r->reverse_endian)
765 for (i = 0; i < 8; i++)
766 bswap_int32 (&data[i]);
770 lose ((ME, _("%s: Floating-point representation in system file is not "
771 "IEEE-754. PSPP cannot convert between floating-point "
773 fh_get_file_name (r->fh)));
775 #error Add support for your floating-point format.
778 #ifdef WORDS_BIGENDIAN
783 if (r->reverse_endian)
785 if (file_bigendian ^ (data[6] == 1))
786 lose ((ME, _("%s: File-indicated endianness (%s) does not match "
787 "endianness intuited from file header (%s)."),
788 fh_get_file_name (r->fh),
789 file_bigendian ? _("big-endian") : _("little-endian"),
790 data[6] == 1 ? _("big-endian") : (data[6] == 2 ? _("little-endian")
793 /* PORTME: Character representation code. */
794 if (data[7] != 2 && data[7] != 3)
795 lose ((ME, _("%s: File-indicated character representation code (%s) is "
797 fh_get_file_name (r->fh),
798 (data[7] == 1 ? "EBCDIC"
799 : (data[7] == 4 ? _("DEC Kanji") : _("Unknown")))));
807 /* Read record type 7, subtype 4. */
809 read_machine_flt64_info (struct sfm_reader *r, int size, int count)
814 if (size != sizeof (flt64) || count != 3)
815 lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
816 "subtype 4. Expected size %d, count 8."),
817 fh_get_file_name (r->fh), size, count, sizeof (flt64)));
819 assertive_buf_read (r, data, sizeof data, 0);
820 if (r->reverse_endian)
821 for (i = 0; i < 3; i++)
822 bswap_flt64 (&data[i]);
824 if (data[0] != SYSMIS || data[1] != FLT64_MAX
825 || data[2] != second_lowest_flt64)
828 r->highest = data[1];
830 msg (MW, _("%s: File-indicated value is different from internal value "
831 "for at least one of the three system values. SYSMIS: "
832 "indicated %g, expected %g; HIGHEST: %g, %g; LOWEST: "
834 fh_get_file_name (r->fh), (double) data[0], (double) SYSMIS,
835 (double) data[1], (double) FLT64_MAX,
836 (double) data[2], (double) second_lowest_flt64);
846 read_header (struct sfm_reader *r,
847 struct dictionary *dict, struct sfm_read_info *info)
849 struct sysfile_header hdr; /* Disk buffer. */
850 char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */
851 int skip_amt = 0; /* Amount of product name to omit. */
854 /* Read header, check magic. */
855 assertive_buf_read (r, &hdr, sizeof hdr, 0);
856 if (strncmp ("$FL2", hdr.rec_type, 4) != 0)
857 lose ((ME, _("%s: Bad magic. Proper system files begin with "
858 "the four characters `$FL2'. This file will not be read."),
859 fh_get_file_name (r->fh)));
861 /* Check eye-catcher string. */
862 memcpy (prod_name, hdr.prod_name, sizeof hdr.prod_name);
863 for (i = 0; i < 60; i++)
864 if (!c_isprint ((unsigned char) prod_name[i]))
866 for (i = 59; i >= 0; i--)
867 if (!c_isgraph ((unsigned char) prod_name[i]))
872 prod_name[60] = '\0';
876 static const char *prefix[N_PREFIXES] =
878 "@(#) SPSS DATA FILE",
884 for (i = 0; i < N_PREFIXES; i++)
885 if (!strncmp (prefix[i], hdr.prod_name, strlen (prefix[i])))
887 skip_amt = strlen (prefix[i]);
892 /* Check endianness. */
893 if (hdr.layout_code == 2)
894 r->reverse_endian = 0;
897 bswap_int32 (&hdr.layout_code);
898 if (hdr.layout_code != 2)
899 lose ((ME, _("%s: File layout code has unexpected value %d. Value "
900 "should be 2, in big-endian or little-endian format."),
901 fh_get_file_name (r->fh), hdr.layout_code));
903 r->reverse_endian = 1;
904 bswap_int32 (&hdr.nominal_case_size);
905 bswap_int32 (&hdr.compress);
906 bswap_int32 (&hdr.weight_idx);
907 bswap_int32 (&hdr.case_cnt);
908 bswap_flt64 (&hdr.bias);
912 /* Copy basic info and verify correctness. */
913 r->value_cnt = hdr.nominal_case_size;
915 /* If value count is ridiculous, then force it to -1 (a
917 if ( r->value_cnt < 0 ||
918 r->value_cnt > (INT_MAX / (int) sizeof (union value) / 2))
921 r->compressed = hdr.compress;
923 r->weight_idx = hdr.weight_idx - 1;
925 r->case_cnt = hdr.case_cnt;
926 if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2)
928 _("%s: Number of cases in file (%ld) is not between -1 and %d."),
929 fh_get_file_name (r->fh), (long) r->case_cnt, INT_MAX / 2));
932 if (r->bias != 100.0)
933 corrupt_msg (MW, _("%s: Compression bias (%g) is not the usual "
935 fh_get_file_name (r->fh), r->bias);
937 /* Make a file label only on the condition that the given label is
938 not all spaces or nulls. */
942 for (i = sizeof hdr.file_label - 1; i >= 0; i--)
944 if (!c_isspace ((unsigned char) hdr.file_label[i])
945 && hdr.file_label[i] != 0)
947 char *label = xmalloc (i + 2);
948 memcpy (label, hdr.file_label, i + 1);
950 dict_set_label (dict, label);
961 memcpy (info->creation_date, hdr.creation_date, 9);
962 info->creation_date[9] = 0;
964 memcpy (info->creation_time, hdr.creation_time, 8);
965 info->creation_time[8] = 0;
967 #ifdef WORDS_BIGENDIAN
968 info->big_endian = !r->reverse_endian;
970 info->big_endian = r->reverse_endian;
973 info->compressed = hdr.compress;
975 info->case_cnt = hdr.case_cnt;
977 for (cp = &prod_name[skip_amt]; cp < &prod_name[60]; cp++)
978 if (c_isgraph ((unsigned char) *cp))
980 strcpy (info->product, cp);
989 /* Reads most of the dictionary from file R; also fills in the
990 associated VAR_BY_IDX array. */
992 read_variables (struct sfm_reader *r,
993 struct dictionary *dict, struct variable ***var_by_idx)
997 struct sysfile_variable sv; /* Disk buffer. */
998 int long_string_count = 0; /* # of long string continuation
999 records still expected. */
1000 int next_value = 0; /* Index to next `value' structure. */
1007 /* Read in the entry for each variable and use the info to
1008 initialize the dictionary. */
1011 struct variable *vv;
1012 char name[SHORT_NAME_LEN + 1];
1015 struct fmt_spec print, write;
1018 assertive_buf_read (r, &sv, sizeof sv, 0);
1020 if (r->reverse_endian)
1022 bswap_int32 (&sv.rec_type);
1023 bswap_int32 (&sv.type);
1024 bswap_int32 (&sv.has_var_label);
1025 bswap_int32 (&sv.n_missing_values);
1026 bswap_int32 (&sv.print);
1027 bswap_int32 (&sv.write);
1030 /* We've come to the end of the variable entries */
1031 if (sv.rec_type != 2)
1033 buf_unread(r, sizeof sv);
1038 *var_by_idx = xnrealloc (*var_by_idx, i + 1, sizeof **var_by_idx);
1040 /* If there was a long string previously, make sure that the
1041 continuations are present; otherwise make sure there aren't
1043 if (long_string_count)
1046 lose ((ME, _("%s: position %d: String variable does not have "
1047 "proper number of continuation records."),
1048 fh_get_file_name (r->fh), i));
1051 (*var_by_idx)[i] = NULL;
1052 long_string_count--;
1055 else if (sv.type == -1)
1056 lose ((ME, _("%s: position %d: Superfluous long string continuation "
1058 fh_get_file_name (r->fh), i));
1060 /* Check fields for validity. */
1061 if (sv.type < 0 || sv.type > 255)
1062 lose ((ME, _("%s: position %d: Bad variable type code %d."),
1063 fh_get_file_name (r->fh), i, sv.type));
1064 if (sv.has_var_label != 0 && sv.has_var_label != 1)
1065 lose ((ME, _("%s: position %d: Variable label indicator field is not "
1066 "0 or 1."), fh_get_file_name (r->fh), i));
1067 if (sv.n_missing_values < -3 || sv.n_missing_values > 3
1068 || sv.n_missing_values == -1)
1069 lose ((ME, _("%s: position %d: Missing value indicator field is not "
1070 "-3, -2, 0, 1, 2, or 3."), fh_get_file_name (r->fh), i));
1072 /* Copy first character of variable name. */
1073 if (sv.name[0] == '@' || sv.name[0] == '#')
1074 lose ((ME, _("%s: position %d: Variable name begins with invalid "
1076 fh_get_file_name (r->fh), i));
1078 name[0] = sv.name[0];
1080 /* Copy remaining characters of variable name. */
1081 for (j = 1; j < SHORT_NAME_LEN; j++)
1083 int c = (unsigned char) sv.name[j];
1092 if ( ! var_is_plausible_name(name, false) )
1093 lose ((ME, _("%s: Invalid variable name `%s' within system file."),
1094 fh_get_file_name (r->fh), name));
1096 /* Create variable. */
1097 vv = (*var_by_idx)[i] = dict_create_var (dict, name, sv.type);
1099 lose ((ME, _("%s: Duplicate variable name `%s' within system file."),
1100 fh_get_file_name (r->fh), name));
1102 /* Set the short name the same as the long name */
1103 var_set_short_name (vv, var_get_name (vv));
1105 /* Case reading data. */
1106 nv = sv.type == 0 ? 1 : DIV_RND_UP (sv.type, sizeof (flt64));
1107 long_string_count = nv - 1;
1110 /* Get variable label, if any. */
1111 if (sv.has_var_label == 1)
1116 /* Read length of label. */
1117 assertive_buf_read (r, &len, sizeof len, 0);
1118 if (r->reverse_endian)
1122 if (len < 0 || len > 255)
1123 lose ((ME, _("%s: Variable %s indicates variable label of invalid "
1125 fh_get_file_name (r->fh), var_get_name (vv), len));
1129 /* Read label into variable structure. */
1131 assertive_buf_read (r, label, ROUND_UP (len, sizeof (int32_t)),
1134 var_set_label (vv, label);
1138 /* Set missing values. */
1139 if (sv.n_missing_values != 0)
1142 int mv_cnt = abs (sv.n_missing_values);
1143 struct missing_values miss;
1145 if (var_get_width (vv) > MAX_SHORT_STRING)
1146 lose ((ME, _("%s: Long string variable %s may not have missing "
1148 fh_get_file_name (r->fh), var_get_name (vv)));
1149 mv_init (&miss, var_get_width (vv));
1151 assertive_buf_read (r, mv, sizeof *mv * mv_cnt, 0);
1153 if (r->reverse_endian && var_is_numeric (vv))
1154 for (j = 0; j < mv_cnt; j++)
1155 bswap_flt64 (&mv[j]);
1157 if (sv.n_missing_values > 0)
1159 for (j = 0; j < sv.n_missing_values; j++)
1160 if (var_is_numeric (vv))
1161 mv_add_num (&miss, mv[j]);
1163 mv_add_str (&miss, (char *) &mv[j]);
1167 if (var_is_alpha (vv))
1168 lose ((ME, _("%s: String variable %s may not have missing "
1169 "values specified as a range."),
1170 fh_get_file_name (r->fh), var_get_name (vv)));
1172 if (mv[0] == r->lowest)
1173 mv_add_num_range (&miss, LOWEST, mv[1]);
1174 else if (mv[1] == r->highest)
1175 mv_add_num_range (&miss, mv[0], HIGHEST);
1177 mv_add_num_range (&miss, mv[0], mv[1]);
1179 if (sv.n_missing_values == -3)
1180 mv_add_num (&miss, mv[2]);
1182 var_set_missing_values (vv, &miss);
1185 if (!parse_format_spec (r, sv.print, &print, vv)
1186 || !parse_format_spec (r, sv.write, &write, vv))
1189 var_set_print_format (vv, &print);
1190 var_set_write_format (vv, &write);
1193 /* Some consistency checks. */
1194 if (long_string_count != 0)
1195 lose ((ME, _("%s: Long string continuation records omitted at end of "
1197 fh_get_file_name (r->fh)));
1199 if (next_value != r->value_cnt)
1200 corrupt_msg(MW, _("%s: System file header indicates %d variable positions but "
1201 "%d were read from file."),
1202 fh_get_file_name (r->fh), r->value_cnt, next_value);
1211 /* Translates the format spec from sysfile format to internal
1214 parse_format_spec (struct sfm_reader *r, int32_t s,
1215 struct fmt_spec *f, const struct variable *v)
1219 if (!fmt_from_io ((s >> 16) & 0xff, &f->type))
1220 lose ((ME, _("%s: Bad format specifier byte (%d)."),
1221 fh_get_file_name (r->fh), (s >> 16) & 0xff));
1222 f->w = (s >> 8) & 0xff;
1225 if (var_is_alpha (v) != fmt_is_string (f->type))
1226 lose ((ME, _("%s: %s variable %s has %s format specifier %s."),
1227 fh_get_file_name (r->fh),
1228 var_is_alpha (v) ? _("String") : _("Numeric"),
1230 fmt_is_string (f->type) ? _("string") : _("numeric"),
1231 fmt_name (f->type)));
1234 ok = fmt_check_output (f) && fmt_check_width_compat (f, var_get_width (v));
1239 char fmt_string[FMT_STRING_LEN_MAX + 1];
1240 msg (ME, _("%s variable %s has invalid format specifier %s."),
1241 var_is_numeric (v) ? _("Numeric") : _("String"),
1242 var_get_name (v), fmt_to_string (f, fmt_string));
1243 *f = (var_is_numeric (v)
1244 ? fmt_for_output (FMT_F, 8, 2)
1245 : fmt_for_output (FMT_A, var_get_width (v), 0));
1253 /* Reads value labels from sysfile R and inserts them into the
1254 associated dictionary. */
1256 read_value_labels (struct sfm_reader *r,
1257 struct dictionary *dict, struct variable **var_by_idx)
1261 char raw_value[8]; /* Value as uninterpreted bytes. */
1262 union value value; /* Value. */
1263 char *label; /* Null-terminated label string. */
1266 struct label *labels = NULL;
1267 int32_t n_labels; /* Number of labels. */
1269 struct variable **var = NULL; /* Associated variables. */
1270 int32_t n_vars; /* Number of associated variables. */
1274 /* First step: read the contents of the type 3 record and record its
1275 contents. Note that we can't do much with the data since we
1276 don't know yet whether it is of numeric or string type. */
1278 /* Read number of labels. */
1279 assertive_buf_read (r, &n_labels, sizeof n_labels, 0);
1280 if (r->reverse_endian)
1281 bswap_int32 (&n_labels);
1283 if ( n_labels >= ((int32_t) ~0) / sizeof *labels)
1285 corrupt_msg(MW, _("%s: Invalid number of labels: %d. Ignoring labels."),
1286 fh_get_file_name (r->fh), n_labels);
1290 /* Allocate memory. */
1291 labels = xcalloc (n_labels, sizeof *labels);
1292 for (i = 0; i < n_labels; i++)
1293 labels[i].label = NULL;
1295 /* Read each value/label tuple into labels[]. */
1296 for (i = 0; i < n_labels; i++)
1298 struct label *label = labels + i;
1299 unsigned char label_len;
1303 assertive_buf_read (r, label->raw_value, sizeof label->raw_value, 0);
1305 /* Read label length. */
1306 assertive_buf_read (r, &label_len, sizeof label_len, 0);
1307 padded_len = ROUND_UP (label_len + 1, sizeof (flt64));
1309 /* Read label, padding. */
1310 label->label = xmalloc (padded_len + 1);
1311 assertive_buf_read (r, label->label, padded_len - 1, 0);
1312 label->label[label_len] = 0;
1315 /* Second step: Read the type 4 record that has the list of
1316 variables to which the value labels are to be applied. */
1318 /* Read record type of type 4 record. */
1322 assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
1323 if (r->reverse_endian)
1324 bswap_int32 (&rec_type);
1327 lose ((ME, _("%s: Variable index record (type 4) does not immediately "
1328 "follow value label record (type 3) as it should."),
1329 fh_get_file_name (r->fh)));
1332 /* Read number of variables associated with value label from type 4
1334 assertive_buf_read (r, &n_vars, sizeof n_vars, 0);
1335 if (r->reverse_endian)
1336 bswap_int32 (&n_vars);
1337 if (n_vars < 1 || n_vars > dict_get_var_cnt (dict))
1338 lose ((ME, _("%s: Number of variables associated with a value label (%d) "
1339 "is not between 1 and the number of variables (%d)."),
1340 fh_get_file_name (r->fh), n_vars, dict_get_var_cnt (dict)));
1342 /* Read the list of variables. */
1343 var = xnmalloc (n_vars, sizeof *var);
1344 for (i = 0; i < n_vars; i++)
1349 /* Read variable index, check range. */
1350 assertive_buf_read (r, &var_idx, sizeof var_idx, 0);
1351 if (r->reverse_endian)
1352 bswap_int32 (&var_idx);
1353 if (var_idx < 1 || var_idx > r->value_cnt)
1354 lose ((ME, _("%s: Variable index associated with value label (%d) is "
1355 "not between 1 and the number of values (%d)."),
1356 fh_get_file_name (r->fh), var_idx, r->value_cnt));
1358 /* Make sure it's a real variable. */
1359 v = var_by_idx[var_idx - 1];
1361 lose ((ME, _("%s: Variable index associated with value label (%d) "
1362 "refers to a continuation of a string variable, not to "
1363 "an actual variable."),
1364 fh_get_file_name (r->fh), var_idx));
1365 if (var_is_long_string (v))
1366 lose ((ME, _("%s: Value labels are not allowed on long string "
1368 fh_get_file_name (r->fh), var_get_name (v)));
1370 /* Add it to the list of variables. */
1374 /* Type check the variables. */
1375 for (i = 1; i < n_vars; i++)
1376 if (var_get_type (var[i]) != var_get_type (var[0]))
1377 lose ((ME, _("%s: Variables associated with value label are not all of "
1378 "identical type. Variable %s has %s type, but variable "
1380 fh_get_file_name (r->fh),
1381 var_get_name (var[0]),
1382 var_is_alpha (var[0]) ? _("string") : _("numeric"),
1383 var_get_name (var[i]),
1384 var_is_alpha (var[i]) ? _("string") : _("numeric")));
1386 /* Fill in labels[].value, now that we know the desired type. */
1387 for (i = 0; i < n_labels; i++)
1389 struct label *label = labels + i;
1391 if (var_is_alpha (var[0]))
1393 const int copy_len = MIN (sizeof label->raw_value,
1394 sizeof label->label);
1395 memcpy (label->value.s, label->raw_value, copy_len);
1398 assert (sizeof f == sizeof label->raw_value);
1399 memcpy (&f, label->raw_value, sizeof f);
1400 if (r->reverse_endian)
1406 /* Assign the value_label's to each variable. */
1407 for (i = 0; i < n_vars; i++)
1409 struct variable *v = var[i];
1412 /* Add each label to the variable. */
1413 for (j = 0; j < n_labels; j++)
1415 struct label *label = labels + j;
1416 if (!val_labs_replace (v->val_labs, label->value, label->label))
1419 if (var_is_numeric (var[0]))
1420 msg (MW, _("%s: File contains duplicate label for value %g for "
1422 fh_get_file_name (r->fh), label->value.f, var_get_name (v));
1424 msg (MW, _("%s: File contains duplicate label for value `%.*s' "
1425 "for variable %s."),
1426 fh_get_file_name (r->fh), var_get_width (v),
1427 label->value.s, var_get_name (v));
1431 for (i = 0; i < n_labels; i++)
1432 free (labels[i].label);
1440 for (i = 0; i < n_labels; i++)
1441 free (labels[i].label);
1448 /* Reads BYTE_CNT bytes from the file represented by R. If BUF is
1449 non-NULL, uses that as the buffer; otherwise allocates at least
1450 MIN_ALLOC bytes. Returns a pointer to the buffer on success, NULL
1453 buf_read (struct sfm_reader *r, void *buf, size_t byte_cnt, size_t min_alloc)
1457 if (buf == NULL && byte_cnt > 0 )
1458 buf = xmalloc (MAX (byte_cnt, min_alloc));
1460 if ( byte_cnt == 0 )
1464 if (1 != fread (buf, byte_cnt, 1, r->file))
1466 if (ferror (r->file))
1467 msg (ME, _("%s: Reading system file: %s."),
1468 fh_get_file_name (r->fh), strerror (errno));
1470 corrupt_msg (ME, _("%s: Unexpected end of file."),
1471 fh_get_file_name (r->fh));
1479 /* Winds the reader BYTE_CNT bytes back in the reader stream.
   BYTE_CNT must be greater than zero (asserted).  If the seek fails,
   the error is reported through msg() together with the strerror()
   text for errno. */
1481 buf_unread(struct sfm_reader *r, size_t byte_cnt)
1483 assert(byte_cnt > 0);
/* fseek() takes a signed long offset.  Convert BYTE_CNT to long before
   negating it: negating the unsigned size_t directly wraps around to a
   huge positive value, and turning that into a negative long relies on
   an implementation-defined conversion. */
1485 if ( 0 != fseek(r->file, -(long) byte_cnt, SEEK_CUR))
1487 msg (ME, _("%s: Seeking system file: %s."),
1488 fh_get_file_name (r->fh), strerror (errno));
1492 /* Reads a document record, type 6, from system file R, and sets up
1493 the documents and n_documents fields in the associated dictionary. */
1496 read_documents (struct sfm_reader *r, struct dictionary *dict)
/* Only one document record is accepted per dictionary. */
1501 if (dict_get_documents (dict) != NULL)
1502 lose ((ME, _("%s: System file contains multiple "
1503 "type 6 (document) records."),
1504 fh_get_file_name (r->fh)));
/* The record begins with its line count.
   NOTE(review): no byte swap of line_cnt is visible here, although
   other values in this file are swapped when r->reverse_endian is
   set -- confirm. */
1506 assertive_buf_read (r, &line_cnt, sizeof line_cnt, 0);
1508 lose ((ME, _("%s: Number of document lines (%ld) "
1509 "must be greater than 0."),
1510 fh_get_file_name (r->fh), (long) line_cnt));
/* Each document line occupies 80 bytes.  The minimum allocation passed
   to buf_read() is one byte larger than the amount read, which leaves
   room for the terminator stored below.
   NOTE(review): the count comes from the file and no upper bound is
   visible -- confirm that 80 * line_cnt cannot overflow. */
1512 documents = buf_read (r, NULL, 80 * line_cnt, line_cnt * 80 + 1);
1513 /* FIXME? Run through asciify. */
1514 if (documents == NULL)
/* NUL-terminate the text just past the last byte read, then hand it
   to the dictionary. */
1516 documents[80 * line_cnt] = '\0';
1517 dict_set_documents (dict, documents);
1527 /* Reads compressed data into R->BUF and sets other pointers
1528 appropriately. Returns nonzero only if both no errors occur and data was read. */
1531 buffer_input (struct sfm_reader *r)
/* The decompression buffer holds 128 elements.
   NOTE(review): confirm that this allocation is guarded so that it is
   not repeated on every call. */
1538 r->buf = xnmalloc (128, sizeof *r->buf);
/* fread() returns the number of whole elements read, which can be
   fewer than 128 near the end of the file. */
1539 amt = fread (r->buf, sizeof *r->buf, 128, r->file);
1540 if (ferror (r->file))
1542 msg (ME, _("%s: Error reading file: %s."),
1543 fh_get_file_name (r->fh), strerror (errno));
/* Mark the end of the valid data: one past the last element read. */
1548 r->end = &r->buf[amt];
1552 /* Reads a single case consisting of compressed data from the
1553 system file of reader R into the array BUF[], and
1554 returns nonzero only if successful. */
1555 /* Data in system files is compressed in this manner. Data
1556 values are grouped into sets of eight ("octets"). Each value
1557 in an octet has one instruction byte, and the instruction bytes of an octet are output together.
1558 Each instruction byte either gives the value directly or indicates
1559 that the value can be found following the instructions. */
1561 read_compressed_data (struct sfm_reader *r, flt64 *buf)
/* NOTE(review): r->x appears to hold the current instruction octet
   (it is refilled by the memcpy into r->x near the end of this
   function) and r->y the position within it at which decoding
   resumes -- confirm against the struct definition. */
1563 const unsigned char *p_end = r->x + sizeof (flt64);
1564 unsigned char *p = r->y;
/* One case consists of r->value_cnt values; BUF_END points one past
   the last of them. */
1566 const flt64 *buf_beg = buf;
1567 const flt64 *buf_end = &buf[r->value_cnt];
1571 for (; p < p_end; p++){
1575 /* Code 0 is ignored. */
1578 /* Code 252 is end of file. */
1581 lose ((ME, _("%s: Compressed data is corrupted. Data ends "
1582 "in partial case."),
1583 fh_get_file_name (r->fh)));
1585 /* Code 253 indicates that the value is stored explicitly
1586 following the instruction bytes. */
/* Refill the data buffer when it has never been filled or has been
   used up, then copy the next raw value from it. */
1587 if (r->ptr == NULL || r->ptr >= r->end)
1588 if (!buffer_input (r))
1589 lose ((ME, _("%s: Unexpected end of file."),
1590 fh_get_file_name (r->fh)));
1591 memcpy (buf++, r->ptr++, sizeof *buf);
1596 /* Code 254 indicates a string that is all blanks. */
1597 memset (buf++, ' ', sizeof *buf);
1602 /* Code 255 indicates the system-missing value. */
1604 if (r->reverse_endian)
1611 /* Codes 1 through 251 inclusive are taken to indicate a
1612 value of (BYTE - BIAS), where BYTE is the byte's value
1613 and BIAS is the compression bias (generally 100.0). */
1614 *buf = *p - r->bias;
1615 if (r->reverse_endian)
1623 /* We have reached the end of this instruction octet. Read
1625 if (r->ptr == NULL || r->ptr >= r->end)
1627 if (!buffer_input (r))
1630 lose ((ME, _("%s: Unexpected end of file."),
1631 fh_get_file_name (r->fh)));
1636 memcpy (r->x, r->ptr++, sizeof *buf);
1643 /* We have filled up an entire record. Update state and return
1654 /* Reads one case from R's file into C. Returns nonzero
1655 only if successful.

   Two strategies are used.  When the file is uncompressed, flt64 has
   the same size as double, and the file has no very long strings, the
   case is read straight into C's storage and patched in place.
   Otherwise the data is read (or decompressed) into a temporary
   bounce buffer of r->value_cnt flt64 values and copied into C one
   variable at a time. */
1657 sfm_read_case (struct sfm_reader *r, struct ccase *c)
1662 if (!r->compressed && sizeof (flt64) == sizeof (double) && ! r->has_vls)
1664 /* Fast path: external and internal representations are the
1665 same, except possibly for endianness or SYSMIS. Read
1666 directly into the case's buffer, then fix up any minor
1667 details as needed. */
1668 if (!fread_ok (r, case_data_all_rw (c),
1669 sizeof (union value) * r->value_cnt))
1672 /* Fix up endianness if needed. */
/* Only variables of width 0 are swapped; their numeric member (->f)
   is the one accessed. */
1673 if (r->reverse_endian)
1677 for (i = 0; i < r->var_cnt; i++)
1678 if (r->vars[i].width == 0)
1679 bswap_flt64 (&case_data_rw (c, r->vars[i].fv)->f);
1682 /* Fix up SYSMIS values if needed.
1683 I don't think this will ever actually kick in. */
1685 if (r->sysmis != SYSMIS)
1689 for (i = 0; i < r->var_cnt; i++)
/* Test the value at the variable's own case index (fv), the same slot
   that is rewritten on the next line.  The loop counter I numbers the
   variables and is not a case index, so using it here would compare
   the wrong value whenever fv differs from I. */
1690 if (r->vars[i].width == 0 && case_num (c, r->vars[i].fv) == r->sysmis)
1691 case_data_rw (c, r->vars[i].fv)->f = SYSMIS;
1696 /* Slow path: internal and external representations differ.
1697 Read into a bounce buffer, then copy to C. */
/* The bounce buffer holds one flt64 per case value and is zeroed
   before use. */
1704 bounce_size = sizeof *bounce * r->value_cnt;
1705 bounce = bounce_cur = local_alloc (bounce_size);
1707 memset(bounce, 0, bounce_size);
/* The buffer is filled either by a plain read or by the decompressor. */
1710 read_ok = fread_ok (r, bounce, bounce_size);
1712 read_ok = read_compressed_data (r, bounce);
1715 local_free (bounce);
1719 for (i = 0; i < r->var_cnt; i++)
1721 struct sfm_var *sv = &r->vars[i];
/* Numeric value: take one flt64 and map the file's system-missing
   value to SYSMIS. */
1725 flt64 f = *bounce_cur++;
1726 if (r->reverse_endian)
1728 case_data_rw (c, sv->fv)->f = f == r->sysmis ? SYSMIS : f;
/* String value: copy it in chunks of at most MAX_LONG_STRING bytes,
   advancing through the bounce buffer in whole flt64 units. */
1732 flt64 *bc_start = bounce_cur;
1734 while (ofs < sv->width )
1736 const int chunk = MIN (MAX_LONG_STRING, sv->width - ofs);
1737 memcpy (case_data_rw (c, sv->fv)->s + ofs, bounce_cur, chunk);
1739 bounce_cur += DIV_RND_UP (chunk, sizeof (flt64));
/* Reposition to the start of the next variable, measured from where
   this string began. */
1743 bounce_cur = bc_start + width_to_bytes(sv->width) / sizeof(flt64);
1747 local_free (bounce);
/* Reads BYTE_CNT bytes from R's file into BUFFER with fread().  A short
   read is diagnosed in two ways: a stream error is reported with the
   strerror() text, and a nonzero short count without a stream error is
   reported as a partial record at the end of the file.  A read of zero
   bytes without a stream error produces no message here.  Callers in
   this file treat the result as a success flag. */
1753 fread_ok (struct sfm_reader *r, void *buffer, size_t byte_cnt)
/* With an element size of 1, fread() returns the number of bytes
   actually read. */
1755 size_t read_bytes = fread (buffer, 1, byte_cnt, r->file);
1757 if (read_bytes == byte_cnt)
1761 if (ferror (r->file))
1763 msg (ME, _("%s: Reading system file: %s."),
1764 fh_get_file_name (r->fh), strerror (errno));
/* Some bytes arrived but not a whole request: the file ends in the
   middle of a record. */
1767 else if (read_bytes != 0)
1769 msg (ME, _("%s: Partial record at end of system file."),
1770 fh_get_file_name (r->fh));
1777 /* Returns true if an I/O error has occurred on READER, false otherwise. */
1780 sfm_read_error (const struct sfm_reader *reader)
1785 /* Returns true if FILE is an SPSS system file, false otherwise. */
1788 sfm_detect (FILE *file)
1790 struct sysfile_header hdr;
1792 if (fread (&hdr, sizeof hdr, 1, file) != 1)
1794 if (strncmp ("$FL2", hdr.rec_type, 4))