1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
28 #include <libpspp/alloc.h>
29 #include <libpspp/message.h>
30 #include <libpspp/compiler.h>
31 #include <libpspp/magic.h>
32 #include <libpspp/misc.h>
33 #include <libpspp/str.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/array.h>
37 #include "sys-file-reader.h"
38 #include "sfm-private.h"
40 #include "dictionary.h"
41 #include "file-handle-def.h"
42 #include "file-name.h"
44 #include "value-labels.h"
49 #define _(msgid) gettext (msgid)
51 /* System file reader. */
54 struct file_handle *fh; /* File handle. */
55 FILE *file; /* File stream. */
57 int reverse_endian; /* 1=file has endianness opposite us. */
58 int fix_specials; /* 1=SYSMIS/HIGHEST/LOWEST differs from us. */
59 int value_cnt; /* Number of `union values's per case. */
60 long case_cnt; /* Number of cases, -1 if unknown. */
61 int compressed; /* 1=compressed, 0=not compressed. */
62 double bias; /* Compression bias, usually 100.0. */
63 int weight_idx; /* 0-based index of weighting variable, or -1. */
64 bool ok; /* False after an I/O error or corrupt data. */
65 bool has_vls; /* True if the file has one or more Very Long Strings*/
68 struct hsh_table *var_hash;
69 struct variable **svars;
71 /* File's special constants. */
76 /* Decompression buffer. */
77 flt64 *buf; /* Buffer data. */
78 flt64 *ptr; /* Current location in buffer. */
79 flt64 *end; /* End of buffer data. */
81 /* Compression instruction octet. */
82 unsigned char x[8]; /* Current instruction octet. */
83 unsigned char *y; /* Location in current instruction octet. */
86 /* A variable in a system file. */
89 char name[SHORT_NAME_LEN + 1]; /* name */
90 int width; /* 0=numeric, otherwise string width. */
91 int fv; /* Index into case. */
96 /* Swap bytes *A and *B. */
98 bswap (char *a, char *b)
105 /* Reverse the byte order of 32-bit integer *X. */
107 bswap_int32 (int32_t *x_)
109 char *x = (char *) x_;
110 bswap (x + 0, x + 3);
111 bswap (x + 1, x + 2);
114 /* Reverse the byte order of 64-bit floating point *X. */
116 bswap_flt64 (flt64 *x_)
118 char *x = (char *) x_;
119 bswap (x + 0, x + 7);
120 bswap (x + 1, x + 6);
121 bswap (x + 2, x + 5);
122 bswap (x + 3, x + 4);
126 corrupt_msg (int class, const char *format,...)
127 PRINTF_FORMAT (2, 3);
129 /* Displays a corrupt sysfile error. */
131 corrupt_msg (int class, const char *format,...)
137 ds_create (&text, _("corrupt system file: "));
138 va_start (args, format);
139 ds_vprintf (&text, format, args);
142 m.category = msg_class_to_category (class);
143 m.severity = msg_class_to_severity (class);
144 m.where.file_name = NULL;
145 m.where.line_number = 0;
146 m.text = ds_c_str (&text);
151 /* Closes a system file after we're done with it. */
153 sfm_close_reader (struct sfm_reader *r)
160 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
161 msg (ME, _("%s: Closing system file: %s."),
162 fh_get_file_name (r->fh), strerror (errno));
167 fh_close (r->fh, "system file", "rs");
169 hsh_destroy(r->var_hash);
174 /* Dictionary reader. */
176 static void buf_unread(struct sfm_reader *r, size_t byte_cnt);
178 static void *buf_read (struct sfm_reader *, void *buf, size_t byte_cnt,
181 static int read_header (struct sfm_reader *,
182 struct dictionary *, struct sfm_read_info *);
183 static int parse_format_spec (struct sfm_reader *, int32_t,
184 struct fmt_spec *, const struct variable *);
185 static int read_value_labels (struct sfm_reader *, struct dictionary *,
186 struct variable **var_by_idx);
187 static int read_variables (struct sfm_reader *,
188 struct dictionary *, struct variable ***var_by_idx);
189 static int read_machine_int32_info (struct sfm_reader *, int size, int count);
190 static int read_machine_flt64_info (struct sfm_reader *, int size, int count);
191 static int read_documents (struct sfm_reader *, struct dictionary *);
193 static int fread_ok (struct sfm_reader *, void *, size_t);
195 /* Displays the message X with corrupt_msg, then jumps to the error
203 /* Calls buf_read with the specified arguments, and jumps to
204 error if the read fails. */
205 #define assertive_buf_read(a,b,c,d) \
207 if (!buf_read (a,b,c,d)) \
219 pair_sn_compare(const void *_p1, const void *_p2, void *aux UNUSED)
223 const struct name_pair *p1 = _p1;
224 const struct name_pair *p2 = _p2;
226 char buf1[SHORT_NAME_LEN + 1];
227 char buf2[SHORT_NAME_LEN + 1];
229 memset(buf1, 0, SHORT_NAME_LEN + 1);
230 memset(buf2, 0, SHORT_NAME_LEN + 1);
232 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
234 buf1[i] = p1->shortname[i];
235 if ( '\0' == buf1[i])
239 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
241 buf2[i] = p2->shortname[i];
242 if ( '\0' == buf2[i])
246 return strncmp(buf1, buf2, SHORT_NAME_LEN);
250 pair_sn_hash(const void *_p, void *aux UNUSED)
253 const struct name_pair *p = _p;
254 char buf[SHORT_NAME_LEN + 1];
256 memset(buf, 0, SHORT_NAME_LEN + 1);
257 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
259 buf[i] = p->shortname[i];
264 return hsh_hash_bytes(buf, strlen(buf));
268 pair_sn_free(void *p, void *aux UNUSED)
275 /* A hsh_compare_func that orders variables A and B by their
278 compare_var_shortnames (const void *a_, const void *b_, void *foo UNUSED)
281 const struct variable *a = a_;
282 const struct variable *b = b_;
284 char buf1[SHORT_NAME_LEN + 1];
285 char buf2[SHORT_NAME_LEN + 1];
287 memset(buf1, 0, SHORT_NAME_LEN + 1);
288 memset(buf2, 0, SHORT_NAME_LEN + 1);
290 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
292 buf1[i] = a->short_name[i];
293 if ( '\0' == buf1[i])
297 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
299 buf2[i] = b->short_name[i];
300 if ( '\0' == buf2[i])
304 return strncmp(buf1, buf2, SHORT_NAME_LEN);
307 /* A hsh_hash_func that hashes variable V based on its short name. */
309 hash_var_shortname (const void *v_, void *foo UNUSED)
312 const struct variable *v = v_;
313 char buf[SHORT_NAME_LEN + 1];
315 memset(buf, 0, SHORT_NAME_LEN + 1);
316 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
318 buf[i] = v->short_name[i];
323 return hsh_hash_bytes(buf, strlen(buf)); /* Only the bytes before the first NUL in buf are hashed. */
328 /* Opens the system file designated by file handle FH for
329 reading. Reads the system file's dictionary into *DICT.
330 If INFO is non-null, then it receives additional info about the
333 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
334 struct sfm_read_info *info)
336 struct sfm_reader *r = NULL;
337 struct variable **var_by_idx = NULL;
339 /* The data in record 7(14) */
340 char *subrec14data = 0;
342 /* A hash table of long variable names indexed by short name */
343 struct hsh_table *short_to_long = NULL;
346 *dict = dict_create ();
347 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
350 /* Create and initialize reader. */
351 r = xmalloc (sizeof *r);
353 r->file = fn_open (fh_get_file_name (fh), "rb");
355 r->reverse_endian = 0;
366 r->var_hash = hsh_create(4, compare_var_shortnames, hash_var_shortname, 0, 0);
368 r->sysmis = -FLT64_MAX;
369 r->highest = FLT64_MAX;
370 r->lowest = second_lowest_flt64;
372 r->buf = r->ptr = r->end = NULL;
373 r->y = r->x + sizeof r->x;
375 /* Check that file open succeeded. */
378 msg (ME, _("An error occurred while opening \"%s\" for reading "
379 "as a system file: %s."),
380 fh_get_file_name (r->fh), strerror (errno));
384 /* Read header and variables. */
385 if (!read_header (r, *dict, info) || !read_variables (r, *dict, &var_by_idx))
389 /* Handle weighting. */
390 if (r->weight_idx != -1)
392 struct variable *weight_var;
394 if (r->weight_idx < 0 || r->weight_idx >= r->value_cnt)
395 lose ((ME, _("%s: Index of weighting variable (%d) is not between 0 "
396 "and number of elements per case (%d)."),
397 fh_get_file_name (r->fh), r->weight_idx, r->value_cnt));
400 weight_var = var_by_idx[r->weight_idx];
402 if (weight_var == NULL)
404 _("%s: Weighting variable may not be a continuation of "
405 "a long string variable."), fh_get_file_name (fh)));
406 else if (weight_var->type == ALPHA)
407 lose ((ME, _("%s: Weighting variable may not be a string variable."),
408 fh_get_file_name (fh)));
410 dict_set_weight (*dict, weight_var);
413 dict_set_weight (*dict, NULL);
415 /* Read records of types 3, 4, 6, and 7. */
420 assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
421 if (r->reverse_endian)
422 bswap_int32 (&rec_type);
428 if (!read_value_labels (r, *dict, var_by_idx))
433 lose ((ME, _("%s: Orphaned variable index record (type 4). Type 4 "
434 "records must always immediately follow type 3 "
436 fh_get_file_name (r->fh)));
439 if (!read_documents (r, *dict))
456 assertive_buf_read (r, &data, sizeof data, 0);
457 if (r->reverse_endian)
459 bswap_int32 (&data.subtype);
460 bswap_int32 (&data.size);
461 bswap_int32 (&data.count);
463 bytes = data.size * data.count;
465 if (bytes < data.size || bytes < data.count)
466 lose ((ME, "%s: Record type %d subtype %d too large.",
467 fh_get_file_name (r->fh), rec_type, data.subtype));
469 switch (data.subtype)
472 if (!read_machine_int32_info (r, data.size, data.count))
477 if (!read_machine_flt64_info (r, data.size, data.count))
482 case 6: /* ?? Used by SPSS 8.0. */
486 case 11: /* Variable display parameters */
488 const int n_vars = data.count / 3 ;
490 if ( data.count % 3 || n_vars != dict_get_var_cnt(*dict) )
492 msg (MW, _("%s: Invalid subrecord length. "
493 "Record: 7; Subrecord: 11"),
494 fh_get_file_name (r->fh));
499 for ( i = 0 ; i < min(n_vars, dict_get_var_cnt(*dict)) ; ++i )
511 assertive_buf_read (r, ¶ms, sizeof(params), 0);
513 v = dict_get_var(*dict, i);
515 v->measure = params.measure;
516 v->display_width = params.width;
517 v->alignment = params.align;
522 case 13: /* SPSS 12.0 Long variable name map */
524 char *short_name, *save_ptr;
530 subrec14data = xmalloc (bytes + 1);
531 if (!buf_read (r, subrec14data, bytes, 0))
535 subrec14data[bytes] = '\0';
537 short_to_long = hsh_create(4,
544 for (short_name = strtok_r (subrec14data, "=", &save_ptr), idx = 0;
546 short_name = strtok_r (NULL, "=", &save_ptr), idx++)
548 struct name_pair *pair ;
549 char *long_name = strtok_r (NULL, "\t", &save_ptr);
552 /* Validate long name. */
553 if (long_name == NULL)
555 msg (MW, _("%s: Trailing garbage in long variable "
557 fh_get_file_name (r->fh));
560 if (!var_is_valid_name (long_name, false))
562 msg (MW, _("%s: Long variable mapping to invalid "
563 "variable name `%s'."),
564 fh_get_file_name (r->fh), long_name);
568 /* Find variable using short name. */
569 v = dict_lookup_var (*dict, short_name);
572 msg (MW, _("%s: Long variable mapping for "
573 "nonexistent variable %s."),
574 fh_get_file_name (r->fh), short_name);
578 /* Identify any duplicates. */
579 if ( compare_var_names(short_name, long_name, 0) &&
580 NULL != dict_lookup_var (*dict, long_name))
581 lose ((ME, _("%s: Duplicate long variable name `%s' "
582 "within system file."),
583 fh_get_file_name (r->fh), long_name));
587 Renaming a variable may clear the short
588 name, but we want to retain it, so
589 re-set it explicitly. */
590 dict_rename_var (*dict, v, long_name);
591 var_set_short_name (v, short_name);
593 pair = xmalloc(sizeof *pair);
594 pair->shortname = short_name;
595 pair->longname = long_name;
596 hsh_insert(short_to_long, pair);
598 /* This messes up the processing of subtype 14 (below).
599 I'm not sure if it is needed anyway, so I'm removing it for
600 now. If it's needed, then it will need to be done after all the
601 records have been processed. --- JMD 27 April 2006
604 /* For compatability, make sure dictionary
605 is in long variable name map order. In
606 the common case, this has no effect,
607 because the dictionary and the long
608 variable name map are already in the
610 dict_reorder_var (*dict, v, idx);
620 bool eq_seen = false;
624 char *buffer = xmalloc (bytes + 1);
625 if (!buf_read (r, buffer, bytes, 0))
630 buffer[bytes] = '\0';
633 /* Note: SPSS v13 terminates this record with 00,
634 whereas SPSS v14 terminates it with 00 09. We must
636 for(i = 0; i < bytes ; ++i)
639 static char name[SHORT_NAME_LEN + 1] = {0};
640 static char len_str[6] ={0};
649 length = strtol(len_str, 0, 10);
650 if ( length != LONG_MAX && length != LONG_MIN)
652 char *lookup_name = name;
659 struct name_pair pair;
662 pair.shortname = name;
663 p = hsh_find(short_to_long, &pair);
665 lookup_name = p->longname;
668 v = dict_lookup_var(*dict, lookup_name);
672 _("%s: No variable called %s but it is listed in length table."),
673 fh_get_file_name (r->fh), lookup_name);
680 if ( v->width > EFFECTIVE_LONG_STRING_LENGTH )
681 l -= EFFECTIVE_LONG_STRING_LENGTH;
688 struct variable *v_next;
689 v_next = dict_get_var(*dict, idx + 1);
691 if ( v_next->width > EFFECTIVE_LONG_STRING_LENGTH )
692 l -= EFFECTIVE_LONG_STRING_LENGTH;
696 hsh_delete(r->var_hash, v_next);
698 dict_delete_var(*dict, v_next);
701 assert ( length > MAX_LONG_STRING );
704 v->print.w = v->width;
705 v->write.w = v->width;
706 v->nv = DIV_RND_UP (length, MAX_SHORT_STRING);
709 memset(name, 0, SHORT_NAME_LEN+1);
710 memset(len_str, 0, 6);
717 len_str[j] = buffer[i];
725 dict_compact_values(*dict);
730 msg (MW, _("%s: Unrecognized record type 7, subtype %d "
731 "encountered in system file."),
732 fh_get_file_name (r->fh), data.subtype);
738 void *x = buf_read (r, NULL, data.size * data.count, 0);
750 assertive_buf_read (r, &filler, sizeof filler, 0);
756 corrupt_msg(MW, _("%s: Unrecognized record type %d."),
757 fh_get_file_name (r->fh), rec_type);
762 /* Come here on successful completion. */
765 hsh_destroy(short_to_long);
770 /* Come here on unsuccessful completion. */
771 sfm_close_reader (r);
773 hsh_destroy(short_to_long);
777 dict_destroy (*dict);
783 /* Read record type 7, subtype 3: eight int32 values of machine info. Rejects any other SIZE or COUNT, byte-swaps the values when R->reverse_endian is set, then checks the endianness field (data[6]) and the character code field (data[7]). */
785 read_machine_int32_info (struct sfm_reader *r, int size, int count)
792 if (size != sizeof (int32_t) || count != 8)
793 lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
794 "subtype 3. Expected size %d, count 8."),
795 fh_get_file_name (r->fh), size, count, (int) sizeof (int32_t))); /* sizeof yields size_t; the cast makes the argument match %d. */
797 assertive_buf_read (r, data, sizeof data, 0);
798 if (r->reverse_endian)
799 for (i = 0; i < 8; i++)
800 bswap_int32 (&data[i]);
804 lose ((ME, _("%s: Floating-point representation in system file is not "
805 "IEEE-754. PSPP cannot convert between floating-point "
807 fh_get_file_name (r->fh)));
809 #error Add support for your floating-point format.
812 #ifdef WORDS_BIGENDIAN
817 if (r->reverse_endian)
819 if (file_bigendian ^ (data[6] == 1))
820 lose ((ME, _("%s: File-indicated endianness (%s) does not match "
821 "endianness intuited from file header (%s)."),
822 fh_get_file_name (r->fh),
823 file_bigendian ? _("big-endian") : _("little-endian"),
824 data[6] == 1 ? _("big-endian") : (data[6] == 2 ? _("little-endian")
827 /* PORTME: Character representation code. */
828 if (data[7] != 2 && data[7] != 3)
829 lose ((ME, _("%s: File-indicated character representation code (%s) is "
831 fh_get_file_name (r->fh),
832 (data[7] == 1 ? "EBCDIC"
833 : (data[7] == 4 ? _("DEC Kanji") : _("Unknown")))));
841 /* Read record type 7, subtype 4: three flt64 values (SYSMIS, HIGHEST, LOWEST as labelled in the warning below). Rejects any other SIZE or COUNT, byte-swaps the values when R->reverse_endian is set, and warns when they differ from the internal constants. */
843 read_machine_flt64_info (struct sfm_reader *r, int size, int count)
848 if (size != sizeof (flt64) || count != 3)
849 lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
850 "subtype 4. Expected size %d, count 3."),
851 fh_get_file_name (r->fh), size, count, (int) sizeof (flt64))); /* The message now names the count the check requires (3); the cast makes the size_t argument match %d. */
853 assertive_buf_read (r, data, sizeof data, 0);
854 if (r->reverse_endian)
855 for (i = 0; i < 3; i++)
856 bswap_flt64 (&data[i]);
858 if (data[0] != SYSMIS || data[1] != FLT64_MAX
859 || data[2] != second_lowest_flt64)
862 r->highest = data[1];
864 msg (MW, _("%s: File-indicated value is different from internal value "
865 "for at least one of the three system values. SYSMIS: "
866 "indicated %g, expected %g; HIGHEST: %g, %g; LOWEST: "
868 fh_get_file_name (r->fh), (double) data[0], (double) SYSMIS,
869 (double) data[1], (double) FLT64_MAX,
870 (double) data[2], (double) second_lowest_flt64);
880 read_header (struct sfm_reader *r,
881 struct dictionary *dict, struct sfm_read_info *info)
883 struct sysfile_header hdr; /* Disk buffer. */
884 char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */
885 int skip_amt = 0; /* Amount of product name to omit. */
888 /* Read header, check magic. */
889 assertive_buf_read (r, &hdr, sizeof hdr, 0);
890 if (strncmp ("$FL2", hdr.rec_type, 4) != 0)
891 lose ((ME, _("%s: Bad magic. Proper system files begin with "
892 "the four characters `$FL2'. This file will not be read."),
893 fh_get_file_name (r->fh)));
895 /* Check eye-catcher string. */
896 memcpy (prod_name, hdr.prod_name, sizeof hdr.prod_name);
897 for (i = 0; i < 60; i++)
898 if (!c_isprint ((unsigned char) prod_name[i]))
900 for (i = 59; i >= 0; i--)
901 if (!c_isgraph ((unsigned char) prod_name[i]))
906 prod_name[60] = '\0';
910 static const char *prefix[N_PREFIXES] =
912 "@(#) SPSS DATA FILE",
918 for (i = 0; i < N_PREFIXES; i++)
919 if (!strncmp (prefix[i], hdr.prod_name, strlen (prefix[i])))
921 skip_amt = strlen (prefix[i]);
926 /* Check endianness. */
927 if (hdr.layout_code == 2)
928 r->reverse_endian = 0;
931 bswap_int32 (&hdr.layout_code);
932 if (hdr.layout_code != 2)
933 lose ((ME, _("%s: File layout code has unexpected value %d. Value "
934 "should be 2, in big-endian or little-endian format."),
935 fh_get_file_name (r->fh), hdr.layout_code));
937 r->reverse_endian = 1;
938 bswap_int32 (&hdr.nominal_case_size);
939 bswap_int32 (&hdr.compress);
940 bswap_int32 (&hdr.weight_idx);
941 bswap_int32 (&hdr.case_cnt);
942 bswap_flt64 (&hdr.bias);
946 /* Copy basic info and verify correctness. */
947 r->value_cnt = hdr.nominal_case_size;
949 /* If value count is ridiculous, then force it to -1 (a
951 if ( r->value_cnt < 0 ||
952 r->value_cnt > (INT_MAX / (int) sizeof (union value) / 2))
955 r->compressed = hdr.compress;
957 r->weight_idx = hdr.weight_idx - 1;
959 r->case_cnt = hdr.case_cnt;
960 if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2)
962 _("%s: Number of cases in file (%ld) is not between -1 and %d."),
963 fh_get_file_name (r->fh), (long) r->case_cnt, INT_MAX / 2));
966 if (r->bias != 100.0)
967 corrupt_msg (MW, _("%s: Compression bias (%g) is not the usual "
969 fh_get_file_name (r->fh), r->bias);
971 /* Make a file label only on the condition that the given label is
972 not all spaces or nulls. */
976 for (i = sizeof hdr.file_label - 1; i >= 0; i--)
978 if (!c_isspace ((unsigned char) hdr.file_label[i])
979 && hdr.file_label[i] != 0)
981 char *label = xmalloc (i + 2);
982 memcpy (label, hdr.file_label, i + 1);
984 dict_set_label (dict, label);
995 memcpy (info->creation_date, hdr.creation_date, 9);
996 info->creation_date[9] = 0;
998 memcpy (info->creation_time, hdr.creation_time, 8);
999 info->creation_time[8] = 0;
1001 #ifdef WORDS_BIGENDIAN
1002 info->big_endian = !r->reverse_endian;
1004 info->big_endian = r->reverse_endian;
1007 info->compressed = hdr.compress;
1009 info->case_cnt = hdr.case_cnt;
1011 for (cp = &prod_name[skip_amt]; cp < &prod_name[60]; cp++)
1012 if (c_isgraph ((unsigned char) *cp))
1014 strcpy (info->product, cp); /* NOTE(review): strcpy is unbounded and cp can point at up to 60 characters of prod_name; confirm info->product is large enough. */
1023 /* Reads most of the dictionary from system file R; also fills in the
1024 associated VAR_BY_IDX array. */
1026 read_variables (struct sfm_reader *r,
1027 struct dictionary *dict, struct variable ***var_by_idx)
1031 struct sysfile_variable sv; /* Disk buffer. */
1032 int long_string_count = 0; /* # of long string continuation
1033 records still expected. */
1034 int next_value = 0; /* Index to next `value' structure. */
1041 /* Read in the entry for each variable and use the info to
1042 initialize the dictionary. */
1045 struct variable *vv;
1046 char name[SHORT_NAME_LEN + 1];
1050 assertive_buf_read (r, &sv, sizeof sv, 0);
1052 if (r->reverse_endian)
1054 bswap_int32 (&sv.rec_type);
1055 bswap_int32 (&sv.type);
1056 bswap_int32 (&sv.has_var_label);
1057 bswap_int32 (&sv.n_missing_values);
1058 bswap_int32 (&sv.print);
1059 bswap_int32 (&sv.write);
1062 /* We've come to the end of the variable entries */
1063 if (sv.rec_type != 2)
1065 buf_unread(r, sizeof sv);
1070 *var_by_idx = xnrealloc (*var_by_idx, i + 1, sizeof **var_by_idx);
1072 /* If there was a long string previously, make sure that the
1073 continuations are present; otherwise make sure there aren't
1075 if (long_string_count)
1078 lose ((ME, _("%s: position %d: String variable does not have "
1079 "proper number of continuation records."),
1080 fh_get_file_name (r->fh), i));
1083 (*var_by_idx)[i] = NULL;
1084 long_string_count--;
1087 else if (sv.type == -1)
1088 lose ((ME, _("%s: position %d: Superfluous long string continuation "
1090 fh_get_file_name (r->fh), i));
1092 /* Check fields for validity. */
1093 if (sv.type < 0 || sv.type > 255)
1094 lose ((ME, _("%s: position %d: Bad variable type code %d."),
1095 fh_get_file_name (r->fh), i, sv.type));
1096 if (sv.has_var_label != 0 && sv.has_var_label != 1)
1097 lose ((ME, _("%s: position %d: Variable label indicator field is not "
1098 "0 or 1."), fh_get_file_name (r->fh), i));
1099 if (sv.n_missing_values < -3 || sv.n_missing_values > 3
1100 || sv.n_missing_values == -1)
1101 lose ((ME, _("%s: position %d: Missing value indicator field is not "
1102 "-3, -2, 0, 1, 2, or 3."), fh_get_file_name (r->fh), i));
1104 /* Copy first character of variable name. */
1105 if (sv.name[0] == '@' || sv.name[0] == '#')
1106 lose ((ME, _("%s: position %d: Variable name begins with invalid "
1108 fh_get_file_name (r->fh), i));
1110 name[0] = sv.name[0];
1112 /* Copy remaining characters of variable name. */
1113 for (j = 1; j < SHORT_NAME_LEN; j++)
1115 int c = (unsigned char) sv.name[j];
1124 if ( ! var_is_plausible_name(name, false) )
1125 lose ((ME, _("%s: Invalid variable name `%s' within system file."),
1126 fh_get_file_name (r->fh), name));
1128 /* Create variable. */
1129 vv = (*var_by_idx)[i] = dict_create_var (dict, name, sv.type);
1131 lose ((ME, _("%s: Duplicate variable name `%s' within system file."),
1132 fh_get_file_name (r->fh), name));
1134 /* Set the short name the same as the long name */
1135 var_set_short_name (vv, vv->name);
1137 /* Case reading data. */
1138 nv = sv.type == 0 ? 1 : DIV_RND_UP (sv.type, sizeof (flt64));
1139 long_string_count = nv - 1;
1142 /* Get variable label, if any. */
1143 if (sv.has_var_label == 1)
1148 /* Read length of label. */
1149 assertive_buf_read (r, &len, sizeof len, 0);
1150 if (r->reverse_endian)
1154 if (len < 0 || len > 255)
1155 lose ((ME, _("%s: Variable %s indicates variable label of invalid "
1157 fh_get_file_name (r->fh), vv->name, len));
1161 /* Read label into variable structure. */
1162 vv->label = buf_read (r, NULL, ROUND_UP (len, sizeof (int32_t)), len + 1);
1163 if (vv->label == NULL)
1165 vv->label[len] = '\0';
1169 /* Set missing values. */
1170 if (sv.n_missing_values != 0)
1173 int mv_cnt = abs (sv.n_missing_values);
1175 if (vv->width > MAX_SHORT_STRING)
1176 lose ((ME, _("%s: Long string variable %s may not have missing "
1178 fh_get_file_name (r->fh), vv->name));
1180 assertive_buf_read (r, mv, sizeof *mv * mv_cnt, 0);
1182 if (r->reverse_endian && vv->type == NUMERIC)
1183 for (j = 0; j < mv_cnt; j++)
1184 bswap_flt64 (&mv[j]);
1186 if (sv.n_missing_values > 0)
1188 for (j = 0; j < sv.n_missing_values; j++)
1189 if (vv->type == NUMERIC)
1190 mv_add_num (&vv->miss, mv[j]);
1192 mv_add_str (&vv->miss, (char *) &mv[j]);
1196 if (vv->type == ALPHA)
1197 lose ((ME, _("%s: String variable %s may not have missing "
1198 "values specified as a range."),
1199 fh_get_file_name (r->fh), vv->name));
1201 if (mv[0] == r->lowest)
1202 mv_add_num_range (&vv->miss, LOWEST, mv[1]);
1203 else if (mv[1] == r->highest)
1204 mv_add_num_range (&vv->miss, mv[0], HIGHEST);
1206 mv_add_num_range (&vv->miss, mv[0], mv[1]);
1208 if (sv.n_missing_values == -3)
1209 mv_add_num (&vv->miss, mv[2]);
1213 if (!parse_format_spec (r, sv.print, &vv->print, vv)
1214 || !parse_format_spec (r, sv.write, &vv->write, vv))
1217 if ( vv->width != -1)
1218 hsh_insert(r->var_hash, vv);
1221 /* Some consistency checks. */
1222 if (long_string_count != 0)
1223 lose ((ME, _("%s: Long string continuation records omitted at end of "
1225 fh_get_file_name (r->fh)));
1227 if (next_value != r->value_cnt)
1228 corrupt_msg(MW, _("%s: System file header indicates %d variable positions but "
1229 "%d were read from file."),
1230 fh_get_file_name (r->fh), r->value_cnt, next_value);
1239 /* Translates the format spec from sysfile format to internal
1242 parse_format_spec (struct sfm_reader *r, int32_t s,
1243 struct fmt_spec *f, const struct variable *v)
1245 f->type = translate_fmt ((s >> 16) & 0xff);
1247 lose ((ME, _("%s: Bad format specifier byte (%d)."),
1248 fh_get_file_name (r->fh), (s >> 16) & 0xff));
1249 f->w = (s >> 8) & 0xff;
1252 if ((v->type == ALPHA) ^ ((formats[f->type].cat & FCAT_STRING) != 0))
1253 lose ((ME, _("%s: %s variable %s has %s format specifier %s."),
1254 fh_get_file_name (r->fh),
1255 v->type == ALPHA ? _("String") : _("Numeric"),
1257 formats[f->type].cat & FCAT_STRING ? _("string") : _("numeric"),
1258 formats[f->type].name));
1260 if (!check_output_specifier (f, false)
1261 || !check_specifier_width (f, v->width, false))
1263 msg (ME, _("%s variable %s has invalid format specifier %s."),
1264 v->type == NUMERIC ? _("Numeric") : _("String"),
1265 v->name, fmt_to_string (f));
1266 *f = v->type == NUMERIC ? f8_2 : make_output_format (FMT_A, v->width, 0);
1274 /* Reads value labels from system file R and inserts them into the
1275 associated dictionary. */
1277 read_value_labels (struct sfm_reader *r,
1278 struct dictionary *dict, struct variable **var_by_idx)
1282 char raw_value[8]; /* Value as uninterpreted bytes. */
1283 union value value; /* Value. */
1284 char *label; /* Null-terminated label string. */
1287 struct label *labels = NULL;
1288 int32_t n_labels; /* Number of labels. */
1290 struct variable **var = NULL; /* Associated variables. */
1291 int32_t n_vars; /* Number of associated variables. */
1295 /* First step: read the contents of the type 3 record and record its
1296 contents. Note that we can't do much with the data since we
1297 don't know yet whether it is of numeric or string type. */
1299 /* Read number of labels. */
1300 assertive_buf_read (r, &n_labels, sizeof n_labels, 0);
1301 if (r->reverse_endian)
1302 bswap_int32 (&n_labels);
1304 if ( n_labels >= ((int32_t) ~0) / sizeof *labels) /* NOTE(review): (int32_t) ~0 is -1, so this bound relies on conversion to the unsigned type of sizeof; confirm it rejects the intended range. */
1306 corrupt_msg(MW, _("%s: Invalid number of labels: %d. Ignoring labels."),
1307 fh_get_file_name (r->fh), n_labels);
1311 /* Allocate memory. */
1312 labels = xcalloc (n_labels, sizeof *labels);
1313 for (i = 0; i < n_labels; i++)
1314 labels[i].label = NULL;
1316 /* Read each value/label tuple into labels[]. */
1317 for (i = 0; i < n_labels; i++)
1319 struct label *label = labels + i;
1320 unsigned char label_len;
1324 assertive_buf_read (r, label->raw_value, sizeof label->raw_value, 0);
1326 /* Read label length. */
1327 assertive_buf_read (r, &label_len, sizeof label_len, 0);
1328 padded_len = ROUND_UP (label_len + 1, sizeof (flt64));
1330 /* Read label, padding. */
1331 label->label = xmalloc (padded_len + 1);
1332 assertive_buf_read (r, label->label, padded_len - 1, 0);
1333 label->label[label_len] = 0;
1336 /* Second step: Read the type 4 record that has the list of
1337 variables to which the value labels are to be applied. */
1339 /* Read record type of type 4 record. */
1343 assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
1344 if (r->reverse_endian)
1345 bswap_int32 (&rec_type);
1348 lose ((ME, _("%s: Variable index record (type 4) does not immediately "
1349 "follow value label record (type 3) as it should."),
1350 fh_get_file_name (r->fh)));
1353 /* Read number of variables associated with value label from type 4
1355 assertive_buf_read (r, &n_vars, sizeof n_vars, 0);
1356 if (r->reverse_endian)
1357 bswap_int32 (&n_vars);
1358 if (n_vars < 1 || n_vars > dict_get_var_cnt (dict))
1359 lose ((ME, _("%s: Number of variables associated with a value label (%d) "
1360 "is not between 1 and the number of variables (%d)."),
1361 fh_get_file_name (r->fh), n_vars, dict_get_var_cnt (dict)));
1363 /* Read the list of variables. */
1364 var = xnmalloc (n_vars, sizeof *var);
1365 for (i = 0; i < n_vars; i++)
1370 /* Read variable index, check range. */
1371 assertive_buf_read (r, &var_idx, sizeof var_idx, 0);
1372 if (r->reverse_endian)
1373 bswap_int32 (&var_idx);
1374 if (var_idx < 1 || var_idx > r->value_cnt)
1375 lose ((ME, _("%s: Variable index associated with value label (%d) is "
1376 "not between 1 and the number of values (%d)."),
1377 fh_get_file_name (r->fh), var_idx, r->value_cnt));
1379 /* Make sure it's a real variable. */
1380 v = var_by_idx[var_idx - 1]; /* NOTE(review): the index is range-checked against r->value_cnt; confirm var_by_idx always holds that many entries. */
1382 lose ((ME, _("%s: Variable index associated with value label (%d) "
1383 "refers to a continuation of a string variable, not to "
1384 "an actual variable."),
1385 fh_get_file_name (r->fh), var_idx));
1386 if (v->type == ALPHA && v->width > MAX_SHORT_STRING)
1387 lose ((ME, _("%s: Value labels are not allowed on long string "
1389 fh_get_file_name (r->fh), v->name));
1391 /* Add it to the list of variables. */
1395 /* Type check the variables. */
1396 for (i = 1; i < n_vars; i++)
1397 if (var[i]->type != var[0]->type)
1398 lose ((ME, _("%s: Variables associated with value label are not all of "
1399 "identical type. Variable %s has %s type, but variable "
1401 fh_get_file_name (r->fh),
1402 var[0]->name, var[0]->type == ALPHA ? _("string") : _("numeric"),
1403 var[i]->name, var[i]->type == ALPHA ? _("string") : _("numeric")));
1405 /* Fill in labels[].value, now that we know the desired type. */
1406 for (i = 0; i < n_labels; i++)
1408 struct label *label = labels + i;
1410 if (var[0]->type == ALPHA)
1412 const int copy_len = min (sizeof label->raw_value,
1413 sizeof label->value.s); /* Bound the copy by the destination, not by the size of the char pointer label->label. */
1414 memcpy (label->value.s, label->raw_value, copy_len);
1417 assert (sizeof f == sizeof label->raw_value);
1418 memcpy (&f, label->raw_value, sizeof f);
1419 if (r->reverse_endian)
1425 /* Assign the value_label's to each variable. */
1426 for (i = 0; i < n_vars; i++)
1428 struct variable *v = var[i];
1431 /* Add each label to the variable. */
1432 for (j = 0; j < n_labels; j++)
1434 struct label *label = labels + j;
1435 if (!val_labs_replace (v->val_labs, label->value, label->label))
1438 if (var[0]->type == NUMERIC)
1439 msg (MW, _("%s: File contains duplicate label for value %g for "
1441 fh_get_file_name (r->fh), label->value.f, v->name);
1443 msg (MW, _("%s: File contains duplicate label for value `%.*s' "
1444 "for variable %s."),
1445 fh_get_file_name (r->fh), v->width, label->value.s, v->name);
1449 for (i = 0; i < n_labels; i++)
1450 free (labels[i].label);
1458 for (i = 0; i < n_labels; i++)
1459 free (labels[i].label);
1466 /* Reads BYTE_CNT bytes from the file represented by R. If BUF is
1467 non-NULL, uses that as the buffer; otherwise allocates at least
1468 MIN_ALLOC bytes. Returns a pointer to the buffer on success, NULL
1471 buf_read (struct sfm_reader *r, void *buf, size_t byte_cnt, size_t min_alloc)
1475 if (buf == NULL && byte_cnt > 0 )
1476 buf = xmalloc (max (byte_cnt, min_alloc));
1478 if ( byte_cnt == 0 )
1482 if (1 != fread (buf, byte_cnt, 1, r->file))
1484 if (ferror (r->file))
1485 msg (ME, _("%s: Reading system file: %s."),
1486 fh_get_file_name (r->fh), strerror (errno));
1488 corrupt_msg (ME, _("%s: Unexpected end of file."),
1489 fh_get_file_name (r->fh));
1497 /* Winds the reader BYTE_CNT bytes back in the reader stream. */
1499 buf_unread(struct sfm_reader *r, size_t byte_cnt)
1501 assert(byte_cnt > 0);
1503 if ( 0 != fseek(r->file, -byte_cnt, SEEK_CUR))
1505 msg (ME, _("%s: Seeking system file: %s."),
1506 fh_get_file_name (r->fh), strerror (errno));
/* read_documents: handles a type 6 (document) record of the system
   file being read by R and stores the document text in DICT.

   Visible steps: a file whose dictionary already has documents is
   rejected through lose(); LINE_CNT is filled in through
   assertive_buf_read(); 80 * LINE_CNT bytes are then fetched with
   buf_read(), which is asked for a buffer one byte larger so that
   the text can be null-terminated before it is handed to
   dict_set_documents().

   NOTE(review): several lines of this function (return type, local
   declarations, the test in front of the second lose() call, the
   return statements and any cleanup) are missing from this listing,
   so return values and buffer ownership cannot be confirmed here.
   NOTE(review): LINE_CNT presumably comes straight from the file;
   no overflow check on 80 * LINE_CNT is visible -- confirm. */
1510 /* Reads a document record, type 6, from system file R, and sets up
1511 the documents and n_documents fields in the associated
1514 read_documents (struct sfm_reader *r, struct dictionary *dict)
1519 if (dict_get_documents (dict) != NULL)
1520 lose ((ME, _("%s: System file contains multiple "
1521 "type 6 (document) records."),
1522 fh_get_file_name (r->fh)));
1524 assertive_buf_read (r, &line_cnt, sizeof line_cnt, 0);
1526 lose ((ME, _("%s: Number of document lines (%ld) "
1527 "must be greater than 0."),
1528 fh_get_file_name (r->fh), (long) line_cnt));
1530 documents = buf_read (r, NULL, 80 * line_cnt, line_cnt * 80 + 1);
1531 /* FIXME? Run through asciify. */
1532 if (documents == NULL)
/* Terminate the text; the extra byte requested from buf_read()
   above makes room for the null. */
1534 documents[80 * line_cnt] = '\0';
1535 dict_set_documents (dict, documents);
/* buffer_input: refills R's input buffer from R's file.  Up to 128
   elements of size `sizeof *r->buf' are read into r->buf, and r->end
   is set to point just past the last element actually read.  A
   stream error detected with ferror() is reported through msg().

   NOTE(review): the buffer involved is r->buf; the `H->BUF' in the
   older comment below refers to a name that is not a parameter
   here.
   NOTE(review): several lines of this function are missing from
   this listing (whatever guards the xnmalloc() call, the statements
   after the error message, and the return statements), so when the
   buffer is allocated and what is returned cannot be confirmed from
   here. */
1545 /* Reads compressed data into H->BUF and sets other pointers
1546 appropriately. Returns nonzero only if both no errors occur and
1549 buffer_input (struct sfm_reader *r)
1556 r->buf = xnmalloc (128, sizeof *r->buf);
1557 amt = fread (r->buf, sizeof *r->buf, 128, r->file);
1558 if (ferror (r->file))
1560 msg (ME, _("%s: Error reading file: %s."),
1561 fh_get_file_name (r->fh), strerror (errno));
1566 r->end = &r->buf[amt];
/* read_compressed_data: decodes one case of compressed data for
   reader R into BUF, which must have room for r->value_cnt values.

   NOTE(review): the `file H' mentioned in the older comment below
   is not a parameter; the file is reached through R.
   NOTE(review): this listing omits a number of lines (the return
   type, the enclosing loop, the switch statement with its case
   labels, and the state update at the end), so the control flow
   between the fragments below has to be inferred from the comments
   attached to each instruction code. */
1570 /* Reads a single case consisting of compressed data from system
1571 file H into the array BUF[] according to reader R, and
1572 returns nonzero only if successful. */
1573 /* Data in system files is compressed in this manner. Data
1574 values are grouped into sets of eight ("octets"). Each value
1575 in an octet has one instruction byte that are output together.
1576 Each instruction byte gives a value for that byte or indicates
1577 that the value can be found following the instructions. */
1579 read_compressed_data (struct sfm_reader *r, flt64 *buf)
/* r->x is scanned as a block of sizeof (flt64) instruction bytes.
   Scanning starts at r->y, presumably the position saved by an
   earlier call -- TODO confirm. */
1581 const unsigned char *p_end = r->x + sizeof (flt64);
1582 unsigned char *p = r->y;
/* buf_beg and buf_end delimit the r->value_cnt values of one case. */
1584 const flt64 *buf_beg = buf;
1585 const flt64 *buf_end = &buf[r->value_cnt];
1589 for (; p < p_end; p++){
1593 /* Code 0 is ignored. */
1596 /* Code 252 is end of file. */
1599 lose ((ME, _("%s: Compressed data is corrupted. Data ends "
1600 "in partial case."),
1601 fh_get_file_name (r->fh)));
1603 /* Code 253 indicates that the value is stored explicitly
1604 following the instruction bytes. */
/* Refill the input buffer if it has not been filled yet or has
   been used up, then copy one value out of it, advancing both the
   output and the input position. */
1605 if (r->ptr == NULL || r->ptr >= r->end)
1606 if (!buffer_input (r))
1607 lose ((ME, _("%s: Unexpected end of file."),
1608 fh_get_file_name (r->fh)));
1609 memcpy (buf++, r->ptr++, sizeof *buf);
1614 /* Code 254 indicates a string that is all blanks. */
/* Every byte of the output value is set to a space. */
1615 memset (buf++, ' ', sizeof *buf);
1620 /* Code 255 indicates the system-missing value. */
1622 if (r->reverse_endian)
1629 /* Codes 1 through 251 inclusive are taken to indicate a
1630 value of (BYTE - BIAS), where BYTE is the byte's value
1631 and BIAS is the compression bias (generally 100.0). */
1632 *buf = *p - r->bias;
1633 if (r->reverse_endian)
1641 /* We have reached the end of this instruction octet. Read
1643 if (r->ptr == NULL || r->ptr >= r->end)
1645 if (!buffer_input (r))
1648 lose ((ME, _("%s: Unexpected end of file."),
1649 fh_get_file_name (r->fh)));
1654 memcpy (r->x, r->ptr++, sizeof *buf);
1661 /* We have filled up an entire record. Update state and return
1674 compare_var_index(const void *_v1, const void *_v2, void *aux UNUSED)
1676 const struct variable *const *v1 = _v1;
1677 const struct variable *const *v2 = _v2;
1679 if ( (*v1)->index < (*v2)->index)
1682 return ( (*v1)->index > (*v2)->index) ;
/* sfm_read_case: reads the next case from the system file behind R
   and stores it in C (the `READER' of the older comment below is
   the parameter R).

   Two strategies are visible.  When the data is uncompressed, flt64
   has the size of double and the file has no very long strings, the
   case is read straight into C and then patched for byte order and
   SYSMIS.  Otherwise the case is read (or decompressed) into a
   temporary "bounce" buffer and copied into C one variable at a
   time.

   NOTE(review): many lines of this function are missing from this
   listing (return type, declarations, several conditions, braces
   and return statements); the comments added here describe only
   what the remaining lines show. */
1686 /* Reads one case from READER's file into C. Returns nonzero
1687 only if successful. */
1689 sfm_read_case (struct sfm_reader *r, struct ccase *c)
/* r->svars is taken from the variable hash table's data and sorted
   with compare_var_index.  NOTE(review): whatever guards this setup
   is not visible here -- confirm that it is done only once. */
1696 r->svars = (struct variable **) hsh_data(r->var_hash);
1697 sort(r->svars, hsh_count(r->var_hash),
1698 sizeof(*r->svars), compare_var_index, 0);
1701 if (!r->compressed && sizeof (flt64) == sizeof (double) && ! r->has_vls)
1703 /* Fast path: external and internal representations are the
1704 same, except possibly for endianness or SYSMIS. Read
1705 directly into the case's buffer, then fix up any minor
1706 details as needed. */
1707 if (!fread_ok (r, case_data_all_rw (c),
1708 sizeof (union value) * r->value_cnt))
1711 /* Fix up endianness if needed. */
1712 if (r->reverse_endian)
1716 for (i = 0; i < hsh_count(r->var_hash); i++)
1718 struct variable *v = r->svars[i];
/* NOTE(review): the condition selecting which variables are
   byte-swapped is not visible in this listing. */
1720 bswap_flt64 (&case_data_rw (c, v->fv)->f);
/* NOTE(review): in the SYSMIS fix-up below, the test reads value I
   of the case (the loop position within r->svars) while the
   assignment writes value v->fv.  The two agree only if each
   variable's fv equals its position in r->svars; this looks like it
   was meant to be case_num (c, v->fv) -- confirm. */
1724 /* Fix up SYSMIS values if needed.
1725 I don't think this will ever actually kick in, but it
1727 if (r->sysmis != SYSMIS)
1730 for (i = 0; i < hsh_count(r->var_hash); i++)
1732 struct variable *v = r->svars[i];
1733 if (v->width == 0 && case_num (c, i) == r->sysmis)
1734 case_data_rw (c, v->fv)->f = SYSMIS;
1740 /* Slow path: internal and external representations differ.
1741 Read into a bounce buffer, then copy to C. */
/* The bounce buffer has one element per value in the case
   (r->value_cnt) and is zeroed before use. */
1748 bounce_size = sizeof *bounce * r->value_cnt;
1749 bounce = bounce_cur = local_alloc (bounce_size);
1751 memset(bounce, 0, bounce_size);
/* Either a plain read or decompression fills the bounce buffer;
   the condition choosing between the two is not visible here. */
1754 read_ok = fread_ok (r, bounce, bounce_size);
1756 read_ok = read_compressed_data (r, bounce);
1759 local_free (bounce);
/* Copy the bounce buffer into C, one variable at a time, in the
   order of r->svars. */
1763 for (i = 0; i < hsh_count(r->var_hash); i++)
1765 struct variable *tv = r->svars[i];
/* A numeric value: taken from the bounce buffer and stored as
   SYSMIS when it equals the file's own system-missing value. */
1769 flt64 f = *bounce_cur++;
1770 if (r->reverse_endian)
1772 case_data_rw (c, tv->fv)->f = f == r->sysmis ? SYSMIS : f;
1774 else if (tv->width != -1)
1776 flt64 *bc_start = bounce_cur;
/* String data is copied in chunks of at most MAX_LONG_STRING
   bytes; after each chunk the read position advances by the chunk
   size rounded up to whole flt64 elements. */
1778 while (ofs < tv->width )
1780 const int chunk = MIN (MAX_LONG_STRING, tv->width - ofs);
1781 memcpy (case_data_rw (c, tv->fv)->s + ofs, bounce_cur, chunk);
1783 bounce_cur += DIV_RND_UP (chunk, sizeof (flt64));
/* Reposition relative to where this variable's data began, using
   the size reported by width_to_bytes(). */
1787 bounce_cur = bc_start + width_to_bytes(tv->width) / sizeof(flt64);
1791 local_free (bounce);
1797 fread_ok (struct sfm_reader *r, void *buffer, size_t byte_cnt)
1799 size_t read_bytes = fread (buffer, 1, byte_cnt, r->file);
1801 if (read_bytes == byte_cnt)
1805 if (ferror (r->file))
1807 msg (ME, _("%s: Reading system file: %s."),
1808 fh_get_file_name (r->fh), strerror (errno));
1811 else if (read_bytes != 0)
1813 msg (ME, _("%s: Partial record at end of system file."),
1814 fh_get_file_name (r->fh));
/* sfm_read_error: reports whether reading through READER has run
   into an I/O error.
   NOTE(review): the end of the older comment below and the whole
   function body are missing from this listing; presumably the
   result is derived from READER's `ok' flag, which is documented as
   false after an I/O error or corrupt data -- confirm. */
1821 /* Returns true if an I/O error has occurred on READER, false
1824 sfm_read_error (const struct sfm_reader *reader)
1829 /* Returns true if FILE is an SPSS system file,
1832 sfm_detect (FILE *file)
1834 struct sysfile_header hdr;
1836 if (fread (&hdr, sizeof hdr, 1, file) != 1)
1838 if (strncmp ("$FL2", hdr.rec_type, 4))