1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include "sys-file-reader.h"
23 #include "sfm-private.h"
24 #include "sys-file-private.h"
32 #include <libpspp/alloc.h>
33 #include <libpspp/assertion.h>
34 #include <libpspp/message.h>
35 #include <libpspp/compiler.h>
36 #include <libpspp/magic.h>
37 #include <libpspp/misc.h>
38 #include <libpspp/str.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/array.h>
43 #include "dictionary.h"
44 #include "file-handle-def.h"
45 #include "file-name.h"
47 #include "missing-values.h"
48 #include "value-labels.h"
53 #define _(msgid) gettext (msgid)
55 /* System file reader. */
/* State kept for one open system file.  NOTE(review): this excerpt omits
   the struct header and some members that later code assigns (sysmis,
   highest, lowest, vars, var_cnt). */
58 struct file_handle *fh; /* File handle. */
59 FILE *file; /* File stream. */
61 int reverse_endian; /* 1=file has endianness opposite us. */
62 int value_cnt; /* Number of `union values's per case. */
63 long case_cnt; /* Number of cases, -1 if unknown. */
64 int compressed; /* 1=compressed, 0=not compressed. */
65 double bias; /* Compression bias, usually 100.0. */
66 int weight_idx; /* 0-based index of weighting variable, or -1. */
67 bool ok; /* False after an I/O error or corrupt data. */
68 bool has_vls; /* True if the file has one or more Very Long Strings*/
74 /* File's special constants. */
79 /* Decompression buffer. */
80 flt64 *buf; /* Buffer data. */
81 flt64 *ptr; /* Current location in buffer. */
82 flt64 *end; /* End of buffer data. */
84 /* Compression instruction octet. */
/* sfm_open_reader initializes y to x + sizeof x, i.e. one past the last
   byte of x. */
85 unsigned char x[8]; /* Current instruction octet. */
86 unsigned char *y; /* Location in current instruction octet. */
89 /* A variable in a system file. */
/* sfm_open_reader fills in one of these per dictionary variable, taking
   width from var_get_width and fv from var_get_case_index. */
92 int width; /* 0=numeric, otherwise string width. */
93 int fv; /* Index into case. */
98 /* Swap bytes *A and *B. */
100 bswap (char *a, char *b)
107 /* Reverse the byte order of 32-bit integer *X. */
/* Works in place on the four bytes of *X_: bytes 0 and 3 are exchanged,
   then bytes 1 and 2. */
109 bswap_int32 (int32_t *x_)
111 char *x = (char *) x_;
112 bswap (x + 0, x + 3);
113 bswap (x + 1, x + 2);
116 /* Reverse the byte order of 64-bit floating point *X. */
/* Works in place on the eight bytes of *X_, exchanging mirrored pairs
   from the outside in: (0,7), (1,6), (2,5), (3,4). */
118 bswap_flt64 (flt64 *x_)
120 char *x = (char *) x_;
121 bswap (x + 0, x + 7);
122 bswap (x + 1, x + 6);
123 bswap (x + 2, x + 5);
124 bswap (x + 3, x + 4);
/* Forward declaration carrying the PRINTF_FORMAT (2, 3) annotation;
   presumably this lets the compiler check FORMAT (argument 2) against
   the variadic arguments (starting at argument 3) -- confirm against the
   definition of PRINTF_FORMAT. */
128 corrupt_msg (int class, const char *format,...)
129 PRINTF_FORMAT (2, 3);
131 /* Displays a corrupt sysfile error. */
/* The message text is the translated prefix for a corrupt system file
   followed by FORMAT expanded with the variadic arguments. */
133 corrupt_msg (int class, const char *format,...)
139 ds_init_cstr (&text, _("corrupt system file: "));
140 va_start (args, format);
141 ds_put_vformat (&text, format, args);
/* CLASS selects both the category and the severity.  No source file
   name or line number is attached to the message. */
144 m.category = msg_class_to_category (class);
145 m.severity = msg_class_to_severity (class);
146 m.where.file_name = NULL;
147 m.where.line_number = 0;
148 m.text = ds_cstr (&text);
153 /* Closes a system file after we're done with it. */
155 sfm_close_reader (struct sfm_reader *r)
/* A failure to close the stream (fn_close returning EOF) is reported
   with the file name and the strerror text for errno. */
162 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
163 msg (ME, _("%s: Closing system file: %s."),
164 fh_get_file_name (r->fh), strerror (errno));
/* Releases the file handle using the same type string and mode that
   sfm_open_reader passes to fh_open. */
169 fh_close (r->fh, "system file", "rs");
176 /* Dictionary reader. */
178 static void buf_unread(struct sfm_reader *r, size_t byte_cnt);
180 static void *buf_read (struct sfm_reader *, void *buf, size_t byte_cnt,
183 static int read_header (struct sfm_reader *,
184 struct dictionary *, struct sfm_read_info *);
185 static int parse_format_spec (struct sfm_reader *, int32_t,
186 struct fmt_spec *, const struct variable *);
187 static int read_value_labels (struct sfm_reader *, struct dictionary *,
188 struct variable **var_by_idx);
189 static int read_variables (struct sfm_reader *,
190 struct dictionary *, struct variable ***var_by_idx);
191 static int read_machine_int32_info (struct sfm_reader *, int size, int count);
192 static int read_machine_flt64_info (struct sfm_reader *, int size, int count);
193 static int read_documents (struct sfm_reader *, struct dictionary *);
195 static int fread_ok (struct sfm_reader *, void *, size_t);
197 /* Displays the message X with corrupt_msg, then jumps to the error
205 /* Calls buf_read with the specified arguments, and jumps to
206 error if the read fails. */
207 #define assertive_buf_read(a,b,c,d) \
209 if (!buf_read (a,b,c,d)) \
/* Comparison callback for the short-name hash table: orders two
   struct name_pair entries by their shortname members.  Each shortname
   is copied byte by byte into a zero-filled local buffer of
   SHORT_NAME_LEN + 1 bytes, and the buffers are then compared over at
   most SHORT_NAME_LEN bytes with strncmp.  AUX is unused. */
221 pair_sn_compare(const void *_p1, const void *_p2, const void *aux UNUSED)
225 const struct name_pair *p1 = _p1;
226 const struct name_pair *p2 = _p2;
228 char buf1[SHORT_NAME_LEN + 1];
229 char buf2[SHORT_NAME_LEN + 1];
231 memset(buf1, 0, SHORT_NAME_LEN + 1);
232 memset(buf2, 0, SHORT_NAME_LEN + 1);
234 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
236 buf1[i] = p1->shortname[i];
237 if ( '\0' == buf1[i])
241 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
243 buf2[i] = p2->shortname[i];
244 if ( '\0' == buf2[i])
248 return strncmp(buf1, buf2, SHORT_NAME_LEN);
/* Hash callback for the short-name hash table: copies the shortname of
   the struct name_pair _P into a zero-filled local buffer and hashes
   strlen (buf) bytes of it.  AUX is unused.
   NOTE(review): the copy loop runs up to index SHORT_NAME_LEN inclusive,
   so a shortname with no NUL in its first SHORT_NAME_LEN + 1 bytes would
   leave BUF unterminated before strlen is applied -- confirm that such
   names cannot reach this function. */
252 pair_sn_hash(const void *_p, const void *aux UNUSED)
255 const struct name_pair *p = _p;
256 char buf[SHORT_NAME_LEN + 1];
258 memset(buf, 0, SHORT_NAME_LEN + 1);
259 for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
261 buf[i] = p->shortname[i];
266 return hsh_hash_bytes(buf, strlen(buf));
270 pair_sn_free(void *p, const void *aux UNUSED)
/* Outline of the visible steps: create the dictionary and open the file
   handle; allocate and initialize the reader; read the header and the
   variable records; resolve the weighting variable; loop over the
   remaining records (value labels, documents and the type 7 extension
   subrecords); finally build the per-variable width/case-index table in
   r->vars.  The error path closes the reader, destroys the short-name
   hash table and destroys *DICT. */
277 /* Opens the system file designated by file handle FH for
278 reading. Reads the system file's dictionary into *DICT.
279 If INFO is non-null, then it receives additional info about the
282 sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
283 struct sfm_read_info *info)
285 struct sfm_reader *r = NULL;
286 struct variable **var_by_idx = NULL;
288 /* The data in record 7(14) */
289 char *subrec14data = 0;
291 /* A hash table of long variable names indexed by short name */
292 struct hsh_table *short_to_long = NULL;
294 *dict = dict_create ();
295 if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
298 /* Create and initialize reader. */
299 r = xmalloc (sizeof *r);
301 r->file = fn_open (fh_get_file_name (fh), "rb");
303 r->reverse_endian = 0;
314 r->sysmis = -FLT64_MAX;
315 r->highest = FLT64_MAX;
316 r->lowest = second_lowest_flt64;
318 r->buf = r->ptr = r->end = NULL;
319 r->y = r->x + sizeof r->x;
321 /* Check that file open succeeded. */
324 msg (ME, _("An error occurred while opening \"%s\" for reading "
325 "as a system file: %s."),
326 fh_get_file_name (r->fh), strerror (errno));
330 /* Read header and variables. */
331 if (!read_header (r, *dict, info) || !read_variables (r, *dict, &var_by_idx))
335 /* Handle weighting. */
336 if (r->weight_idx != -1)
338 struct variable *weight_var;
340 if (r->weight_idx < 0 || r->weight_idx >= r->value_cnt)
341 lose ((ME, _("%s: Index of weighting variable (%d) is not between 0 "
342 "and number of elements per case (%d)."),
343 fh_get_file_name (r->fh), r->weight_idx, r->value_cnt));
346 weight_var = var_by_idx[r->weight_idx];
348 if (weight_var == NULL)
350 _("%s: Weighting variable may not be a continuation of "
351 "a long string variable."), fh_get_file_name (fh)));
352 else if (var_is_alpha (weight_var))
353 lose ((ME, _("%s: Weighting variable may not be a string variable."),
354 fh_get_file_name (fh)));
356 dict_set_weight (*dict, weight_var);
359 dict_set_weight (*dict, NULL);
361 /* Read records of types 3, 4, 6, and 7. */
366 assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
367 if (r->reverse_endian)
368 bswap_int32 (&rec_type);
374 if (!read_value_labels (r, *dict, var_by_idx))
379 lose ((ME, _("%s: Orphaned variable index record (type 4). Type 4 "
380 "records must always immediately follow type 3 "
382 fh_get_file_name (r->fh)));
385 if (!read_documents (r, *dict))
396 } ATTRIBUTE((packed))
402 assertive_buf_read (r, &data, sizeof data, 0);
403 if (r->reverse_endian)
405 bswap_int32 (&data.subtype);
406 bswap_int32 (&data.size);
407 bswap_int32 (&data.count);
/* Payload length of the subrecord: element size times element count.
   The test that follows rejects a product smaller than either factor. */
409 bytes = data.size * data.count;
411 if (bytes < data.size || bytes < data.count)
412 lose ((ME, "%s: Record type %d subtype %d too large.",
413 fh_get_file_name (r->fh), rec_type, data.subtype));
415 switch (data.subtype)
418 if (!read_machine_int32_info (r, data.size, data.count))
423 if (!read_machine_flt64_info (r, data.size, data.count))
428 case 6: /* ?? Used by SPSS 8.0. */
432 case 11: /* Variable display parameters */
434 const int n_vars = data.count / 3 ;
436 if ( data.count % 3 || n_vars != dict_get_var_cnt(*dict) )
438 msg (MW, _("%s: Invalid subrecord length. "
439 "Record: 7; Subrecord: 11"),
440 fh_get_file_name (r->fh));
445 for ( i = 0 ; i < MIN(n_vars, dict_get_var_cnt(*dict)) ; ++i )
452 } ATTRIBUTE((packed))
457 assertive_buf_read (r, ¶ms, sizeof(params), 0);
459 if ( ! measure_is_valid(params.measure)
461 ! alignment_is_valid(params.align))
464 _("%s: Invalid variable display parameters. Default parameters substituted."),
465 fh_get_file_name(r->fh));
469 v = dict_get_var(*dict, i);
471 var_set_measure (v, params.measure);
472 var_set_display_width (v, params.width);
473 var_set_alignment (v, params.align);
478 case 13: /* SPSS 12.0 Long variable name map */
481 char *save_ptr = NULL;
485 subrec14data = xmalloc (bytes + 1);
486 if (!buf_read (r, subrec14data, bytes, 0))
490 subrec14data[bytes] = '\0';
492 short_to_long = hsh_create(4,
499 for (short_name = strtok_r (subrec14data, "=", &save_ptr), idx = 0;
501 short_name = strtok_r (NULL, "=", &save_ptr), idx++)
503 struct name_pair *pair ;
504 char *long_name = strtok_r (NULL, "\t", &save_ptr);
507 /* Validate long name. */
508 if (long_name == NULL)
510 msg (MW, _("%s: Trailing garbage in long variable "
512 fh_get_file_name (r->fh));
515 if (!var_is_valid_name (long_name, false))
517 msg (MW, _("%s: Long variable mapping to invalid "
518 "variable name `%s'."),
519 fh_get_file_name (r->fh), long_name);
523 /* Find variable using short name. */
524 v = dict_lookup_var (*dict, short_name);
527 msg (MW, _("%s: Long variable mapping for "
528 "nonexistent variable %s."),
529 fh_get_file_name (r->fh), short_name);
533 /* Identify any duplicates. */
534 if ( strcasecmp (short_name, long_name) &&
535 NULL != dict_lookup_var (*dict, long_name))
536 lose ((ME, _("%s: Duplicate long variable name `%s' "
537 "within system file."),
538 fh_get_file_name (r->fh), long_name));
542 Renaming a variable may clear the short
543 name, but we want to retain it, so
544 re-set it explicitly. */
545 dict_rename_var (*dict, v, long_name);
546 var_set_short_name (v, short_name);
/* Record the short-to-long mapping.  The pair points into subrec14data
   rather than owning copies; the later length-table pass looks variables
   up through this table. */
548 pair = xmalloc(sizeof *pair);
549 pair->shortname = short_name;
550 pair->longname = long_name;
551 hsh_insert(short_to_long, pair);
553 /* This messes up the processing of subtype 14 (below).
554 I'm not sure if it is needed anyway, so I'm removing it for
555 now. If it's needed, then it will need to be done after all the
556 records have been processed. --- JMD 27 April 2006
559 /* For compatibility, make sure dictionary
560 is in long variable name map order. In
561 the common case, this has no effect,
562 because the dictionary and the long
563 variable name map are already in the
565 dict_reorder_var (*dict, v, idx);
575 bool eq_seen = false;
579 char *buffer = xmalloc (bytes + 1);
580 if (!buf_read (r, buffer, bytes, 0))
585 buffer[bytes] = '\0';
589 /* Note: SPSS v13 terminates this record with 00,
590 whereas SPSS v14 terminates it with 00 09. We must
592 for(i = 0; i < bytes ; ++i)
595 static char name[SHORT_NAME_LEN + 1] = {0};
596 static char len_str[6] ={0};
605 length = strtol(len_str, 0, 10);
606 if ( length != LONG_MAX && length != LONG_MIN)
608 char *lookup_name = name;
615 struct name_pair pair;
618 pair.shortname = name;
619 p = hsh_find(short_to_long, &pair);
621 lookup_name = p->longname;
624 v = dict_lookup_var(*dict, lookup_name);
628 _("%s: No variable called %s but it is listed in length table."),
629 fh_get_file_name (r->fh), lookup_name);
636 if ( var_get_width (v) > EFFECTIVE_LONG_STRING_LENGTH )
637 l -= EFFECTIVE_LONG_STRING_LENGTH;
639 l -= var_get_width (v);
641 idx = var_get_dict_index (v);
644 struct variable *v_next;
645 v_next = dict_get_var(*dict, idx + 1);
647 if ( var_get_width (v_next) > EFFECTIVE_LONG_STRING_LENGTH )
648 l -= EFFECTIVE_LONG_STRING_LENGTH;
650 l -= var_get_width (v_next);
652 dict_delete_var(*dict, v_next);
655 assert ( length >= MIN_VERY_LONG_STRING );
657 var_set_width (v, length);
660 memset(name, 0, SHORT_NAME_LEN+1);
661 memset(len_str, 0, 6);
668 len_str[j] = buffer[i];
676 dict_compact_values(*dict);
681 msg (MW, _("%s: Unrecognized record type 7, subtype %d "
682 "encountered in system file."),
683 fh_get_file_name (r->fh), data.subtype);
689 void *x = buf_read (r, NULL, data.size * data.count, 0);
701 assertive_buf_read (r, &filler, sizeof filler, 0);
707 corrupt_msg(MW, _("%s: Unrecognized record type %d."),
708 fh_get_file_name (r->fh), rec_type);
713 /* Come here on successful completion. */
715 /* Create an index of dictionary variable widths for
716 sfm_read_case to use. We cannot use the `struct variables'
717 from the dictionary we created, because the caller owns the
718 dictionary and may destroy or modify its variables. */
722 r->var_cnt = dict_get_var_cnt (*dict);
723 r->vars = xnmalloc (r->var_cnt, sizeof *r->vars);
724 for (i = 0; i < r->var_cnt; i++)
726 struct variable *v = dict_get_var (*dict, i);
727 struct sfm_var *sv = &r->vars[i];
728 sv->width = var_get_width (v);
729 sv->fv = var_get_case_index (v);
734 hsh_destroy(short_to_long);
739 /* Come here on unsuccessful completion. */
740 sfm_close_reader (r);
742 hsh_destroy(short_to_long);
746 dict_destroy (*dict);
752 /* Read record type 7, subtype 3. */
/* Validates the machine integer info subrecord.  SIZE must equal
   sizeof (int32_t) and COUNT must be 8; the eight integers are then
   read into DATA and byte-swapped when the file is reverse-endian.
   The visible checks use data[6] as the file's endianness code
   (1 = big-endian, 2 = little-endian) and data[7] as the character
   representation code (2 and 3 are accepted).  Callers treat a zero
   return as failure. */
754 read_machine_int32_info (struct sfm_reader *r, int size, int count)
761 if (size != sizeof (int32_t) || count != 8)
762 lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
763 "subtype 3. Expected size %d, count 8."),
/* sizeof yields a size_t; cast it so the argument matches the %d
   conversion in the format above. */
764 fh_get_file_name (r->fh), size, count, (int) sizeof (int32_t)));
766 assertive_buf_read (r, data, sizeof data, 0);
767 if (r->reverse_endian)
768 for (i = 0; i < 8; i++)
769 bswap_int32 (&data[i]);
773 lose ((ME, _("%s: Floating-point representation in system file is not "
774 "IEEE-754. PSPP cannot convert between floating-point "
776 fh_get_file_name (r->fh)));
778 #error Add support for your floating-point format.
781 #ifdef WORDS_BIGENDIAN
786 if (r->reverse_endian)
/* The endianness claimed by the record must agree with the one deduced
   from the header's layout code. */
788 if (file_bigendian ^ (data[6] == 1))
789 lose ((ME, _("%s: File-indicated endianness (%s) does not match "
790 "endianness intuited from file header (%s)."),
791 fh_get_file_name (r->fh),
792 file_bigendian ? _("big-endian") : _("little-endian"),
793 data[6] == 1 ? _("big-endian") : (data[6] == 2 ? _("little-endian")
796 /* PORTME: Character representation code. */
797 if (data[7] != 2 && data[7] != 3)
798 lose ((ME, _("%s: File-indicated character representation code (%s) is "
800 fh_get_file_name (r->fh),
801 (data[7] == 1 ? "EBCDIC"
802 : (data[7] == 4 ? _("DEC Kanji") : _("Unknown")))));
810 /* Read record type 7, subtype 4. */
/* Validates the machine floating-point info subrecord.  SIZE must equal
   sizeof (flt64) and COUNT must be 3; the three values are read into
   DATA and byte-swapped when the file is reverse-endian.  They are
   compared with SYSMIS, FLT64_MAX and second_lowest_flt64; when any of
   them differs, the file's values are adopted (the assignment of
   r->highest is visible here) and a warning lists indicated versus
   expected values.  Callers treat a zero return as failure. */
812 read_machine_flt64_info (struct sfm_reader *r, int size, int count)
817 if (size != sizeof (flt64) || count != 3)
/* The message states the count actually required by the test above (3),
   and sizeof is cast to int to match the %d conversion. */
818 lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
819 "subtype 4. Expected size %d, count 3."),
820 fh_get_file_name (r->fh), size, count, (int) sizeof (flt64)));
822 assertive_buf_read (r, data, sizeof data, 0);
823 if (r->reverse_endian)
824 for (i = 0; i < 3; i++)
825 bswap_flt64 (&data[i]);
827 if (data[0] != SYSMIS || data[1] != FLT64_MAX
828 || data[2] != second_lowest_flt64)
831 r->highest = data[1];
833 msg (MW, _("%s: File-indicated value is different from internal value "
834 "for at least one of the three system values. SYSMIS: "
835 "indicated %g, expected %g; HIGHEST: %g, %g; LOWEST: "
837 fh_get_file_name (r->fh), (double) data[0], (double) SYSMIS,
838 (double) data[1], (double) FLT64_MAX,
839 (double) data[2], (double) second_lowest_flt64);
/* Reads the file header record into HDR and validates it.  Visible
   steps: check the $FL2 magic; sanitize the product name; detect byte
   order from the layout code, swapping the other header fields when the
   file is reverse-endian; copy the case size, compression flag, weight
   index (converted to 0-based by subtracting 1) and case count into R;
   set the dictionary label from the file label when it contains a
   character that is neither space nor NUL; and copy creation date and
   time, endianness, compression flag, case count and product name into
   INFO.  NOTE(review): the guard that presumably skips the INFO part
   when INFO is null is not visible in this excerpt.  Callers treat a
   zero return as failure. */
849 read_header (struct sfm_reader *r,
850 struct dictionary *dict, struct sfm_read_info *info)
852 struct sysfile_header hdr; /* Disk buffer. */
853 char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */
854 int skip_amt = 0; /* Amount of product name to omit. */
857 /* Read header, check magic. */
858 assertive_buf_read (r, &hdr, sizeof hdr, 0);
859 if (strncmp ("$FL2", hdr.rec_type, 4) != 0)
860 lose ((ME, _("%s: Bad magic. Proper system files begin with "
861 "the four characters `$FL2'. This file will not be read."),
862 fh_get_file_name (r->fh)));
864 /* Check eye-catcher string. */
865 memcpy (prod_name, hdr.prod_name, sizeof hdr.prod_name);
866 for (i = 0; i < 60; i++)
867 if (!c_isprint ((unsigned char) prod_name[i]))
869 for (i = 59; i >= 0; i--)
870 if (!c_isgraph ((unsigned char) prod_name[i]))
875 prod_name[60] = '\0';
/* A recognized prefix at the start of the product name sets skip_amt to
   the length of that prefix. */
879 static const char *prefix[N_PREFIXES] =
881 "@(#) SPSS DATA FILE",
887 for (i = 0; i < N_PREFIXES; i++)
888 if (!strncmp (prefix[i], hdr.prod_name, strlen (prefix[i])))
890 skip_amt = strlen (prefix[i]);
895 /* Check endianness. */
/* A layout code of 2 as read means the file matches host byte order.
   Otherwise the code is swapped and must then equal 2, in which case the
   file is marked reverse-endian and the other header fields are swapped
   as well. */
896 if (hdr.layout_code == 2)
897 r->reverse_endian = 0;
900 bswap_int32 (&hdr.layout_code);
901 if (hdr.layout_code != 2)
902 lose ((ME, _("%s: File layout code has unexpected value %d. Value "
903 "should be 2, in big-endian or little-endian format."),
904 fh_get_file_name (r->fh), hdr.layout_code));
906 r->reverse_endian = 1;
907 bswap_int32 (&hdr.nominal_case_size);
908 bswap_int32 (&hdr.compress);
909 bswap_int32 (&hdr.weight_idx);
910 bswap_int32 (&hdr.case_cnt);
911 bswap_flt64 (&hdr.bias);
915 /* Copy basic info and verify correctness. */
916 r->value_cnt = hdr.nominal_case_size;
918 /* If value count is ridiculous, then force it to -1 (a
920 if ( r->value_cnt < 0 ||
921 r->value_cnt > (INT_MAX / (int) sizeof (union value) / 2))
924 r->compressed = hdr.compress;
926 r->weight_idx = hdr.weight_idx - 1;
928 r->case_cnt = hdr.case_cnt;
929 if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2)
931 _("%s: Number of cases in file (%ld) is not between -1 and %d."),
932 fh_get_file_name (r->fh), (long) r->case_cnt, INT_MAX / 2));
935 if (r->bias != 100.0)
936 corrupt_msg (MW, _("%s: Compression bias (%g) is not the usual "
938 fh_get_file_name (r->fh), r->bias);
940 /* Make a file label only on the condition that the given label is
941 not all spaces or nulls. */
945 for (i = sizeof hdr.file_label - 1; i >= 0; i--)
947 if (!c_isspace ((unsigned char) hdr.file_label[i])
948 && hdr.file_label[i] != 0)
950 char *label = xmalloc (i + 2);
951 memcpy (label, hdr.file_label, i + 1);
953 dict_set_label (dict, label);
964 memcpy (info->creation_date, hdr.creation_date, 9);
965 info->creation_date[9] = 0;
967 memcpy (info->creation_time, hdr.creation_time, 8);
968 info->creation_time[8] = 0;
970 #ifdef WORDS_BIGENDIAN
971 info->big_endian = !r->reverse_endian;
973 info->big_endian = r->reverse_endian;
976 info->compressed = hdr.compress;
978 info->case_cnt = hdr.case_cnt;
/* Starting after the recognized prefix, the product text is copied from
   the first graphic character onward. */
980 for (cp = &prod_name[skip_amt]; cp < &prod_name[60]; cp++)
981 if (c_isgraph ((unsigned char) *cp))
983 strcpy (info->product, cp);
992 /* Reads most of the dictionary from file H; also fills in the
993 associated VAR_BY_IDX array. */
/* One record is consumed per loop iteration; the first record whose
   type is not 2 is pushed back with buf_unread and ends the loop.
   *VAR_BY_IDX grows by one slot per record and holds NULL for the
   continuation records of a long string.  Callers treat a zero return
   as failure. */
995 read_variables (struct sfm_reader *r,
996 struct dictionary *dict, struct variable ***var_by_idx)
1000 struct sysfile_variable sv; /* Disk buffer. */
1001 int long_string_count = 0; /* # of long string continuation
1002 records still expected. */
1003 int next_value = 0; /* Index to next `value' structure. */
1010 /* Read in the entry for each variable and use the info to
1011 initialize the dictionary. */
1014 struct variable *vv;
1015 char name[SHORT_NAME_LEN + 1];
1018 struct fmt_spec print, write;
1021 assertive_buf_read (r, &sv, sizeof sv, 0);
1023 if (r->reverse_endian)
1025 bswap_int32 (&sv.rec_type);
1026 bswap_int32 (&sv.type);
1027 bswap_int32 (&sv.has_var_label);
1028 bswap_int32 (&sv.n_missing_values);
1029 bswap_int32 (&sv.print);
1030 bswap_int32 (&sv.write);
1033 /* We've come to the end of the variable entries */
1034 if (sv.rec_type != 2)
1036 buf_unread(r, sizeof sv);
1041 *var_by_idx = xnrealloc (*var_by_idx, i + 1, sizeof **var_by_idx);
1043 /* If there was a long string previously, make sure that the
1044 continuations are present; otherwise make sure there aren't
1046 if (long_string_count)
1049 lose ((ME, _("%s: position %d: String variable does not have "
1050 "proper number of continuation records."),
1051 fh_get_file_name (r->fh), i));
1054 (*var_by_idx)[i] = NULL;
1055 long_string_count--;
1058 else if (sv.type == -1)
1059 lose ((ME, _("%s: position %d: Superfluous long string continuation "
1061 fh_get_file_name (r->fh), i));
1063 /* Check fields for validity. */
1064 if (sv.type < 0 || sv.type > 255)
1065 lose ((ME, _("%s: position %d: Bad variable type code %d."),
1066 fh_get_file_name (r->fh), i, sv.type));
1067 if (sv.has_var_label != 0 && sv.has_var_label != 1)
1068 lose ((ME, _("%s: position %d: Variable label indicator field is not "
1069 "0 or 1."), fh_get_file_name (r->fh), i));
1070 if (sv.n_missing_values < -3 || sv.n_missing_values > 3
1071 || sv.n_missing_values == -1)
1072 lose ((ME, _("%s: position %d: Missing value indicator field is not "
1073 "-3, -2, 0, 1, 2, or 3."), fh_get_file_name (r->fh), i));
1075 /* Copy first character of variable name. */
1076 if (sv.name[0] == '@' || sv.name[0] == '#')
1077 lose ((ME, _("%s: position %d: Variable name begins with invalid "
1079 fh_get_file_name (r->fh), i));
1081 name[0] = sv.name[0];
1083 /* Copy remaining characters of variable name. */
1084 for (j = 1; j < SHORT_NAME_LEN; j++)
1086 int c = (unsigned char) sv.name[j];
1095 if ( ! var_is_plausible_name(name, false) )
1096 lose ((ME, _("%s: Invalid variable name `%s' within system file."),
1097 fh_get_file_name (r->fh), name));
1099 /* Create variable. */
1100 vv = (*var_by_idx)[i] = dict_create_var (dict, name, sv.type);
1102 lose ((ME, _("%s: Duplicate variable name `%s' within system file."),
1103 fh_get_file_name (r->fh), name));
1105 /* Set the short name the same as the long name */
1106 var_set_short_name (vv, var_get_name (vv));
1108 /* Case reading data. */
/* A numeric variable (type 0) takes one flt64-sized slot; a string of
   width sv.type takes DIV_RND_UP (sv.type, sizeof (flt64)) slots.  Every
   slot after the first is expected as a continuation record. */
1109 nv = sv.type == 0 ? 1 : DIV_RND_UP (sv.type, sizeof (flt64));
1110 long_string_count = nv - 1;
1113 /* Get variable label, if any. */
1114 if (sv.has_var_label == 1)
1119 /* Read length of label. */
1120 assertive_buf_read (r, &len, sizeof len, 0);
1121 if (r->reverse_endian)
1125 if (len < 0 || len > 255)
1126 lose ((ME, _("%s: Variable %s indicates variable label of invalid "
1128 fh_get_file_name (r->fh), var_get_name (vv), len));
1132 /* Read label into variable structure. */
/* The label is read in a length rounded up to a multiple of
   sizeof (int32_t). */
1134 assertive_buf_read (r, label, ROUND_UP (len, sizeof (int32_t)),
1137 var_set_label (vv, label);
1141 /* Set missing values. */
/* A positive count gives that many discrete values.  A negative count
   gives a numeric range mv[0]..mv[1], where an endpoint equal to the
   file's lowest or highest constant becomes LOWEST or HIGHEST; a count
   of -3 adds mv[2] as one more discrete value. */
1142 if (sv.n_missing_values != 0)
1145 int mv_cnt = abs (sv.n_missing_values);
1146 struct missing_values miss;
1148 if (var_get_width (vv) > MAX_SHORT_STRING)
1149 lose ((ME, _("%s: Long string variable %s may not have missing "
1151 fh_get_file_name (r->fh), var_get_name (vv)));
1152 mv_init (&miss, var_get_width (vv));
1154 assertive_buf_read (r, mv, sizeof *mv * mv_cnt, 0);
1156 if (r->reverse_endian && var_is_numeric (vv))
1157 for (j = 0; j < mv_cnt; j++)
1158 bswap_flt64 (&mv[j]);
1160 if (sv.n_missing_values > 0)
1162 for (j = 0; j < sv.n_missing_values; j++)
1163 if (var_is_numeric (vv))
1164 mv_add_num (&miss, mv[j]);
1166 mv_add_str (&miss, (char *) &mv[j]);
1170 if (var_is_alpha (vv))
1171 lose ((ME, _("%s: String variable %s may not have missing "
1172 "values specified as a range."),
1173 fh_get_file_name (r->fh), var_get_name (vv)));
1175 if (mv[0] == r->lowest)
1176 mv_add_num_range (&miss, LOWEST, mv[1]);
1177 else if (mv[1] == r->highest)
1178 mv_add_num_range (&miss, mv[0], HIGHEST);
1180 mv_add_num_range (&miss, mv[0], mv[1]);
1182 if (sv.n_missing_values == -3)
1183 mv_add_num (&miss, mv[2]);
1185 var_set_missing_values (vv, &miss);
1188 if (!parse_format_spec (r, sv.print, &print, vv)
1189 || !parse_format_spec (r, sv.write, &write, vv))
1192 var_set_print_format (vv, &print);
1193 var_set_write_format (vv, &write);
1196 /* Some consistency checks. */
1197 if (long_string_count != 0)
1198 lose ((ME, _("%s: Long string continuation records omitted at end of "
1200 fh_get_file_name (r->fh)));
/* A mismatch with the header's value count is reported only as a
   warning. */
1202 if (next_value != r->value_cnt)
1203 corrupt_msg(MW, _("%s: System file header indicates %d variable positions but "
1204 "%d were read from file."),
1205 fh_get_file_name (r->fh), r->value_cnt, next_value);
/* Decodes the packed format word S for variable V into *F.  Bits 16-23
   select the format type through fmt_from_io and bits 8-15 give the
   width.  The format must be a string format exactly when V is a string
   variable.  If the result fails fmt_check_output or
   fmt_check_width_compat, an error is reported and *F is replaced by a
   default: F with width 8 and 2 decimals for numeric variables, or A
   with V's width for string variables.  Callers treat a zero return as
   failure. */
1214 /* Translates the format spec from sysfile format to internal
1217 parse_format_spec (struct sfm_reader *r, int32_t s,
1218 struct fmt_spec *f, const struct variable *v)
1222 if (!fmt_from_io ((s >> 16) & 0xff, &f->type))
1223 lose ((ME, _("%s: Bad format specifier byte (%d)."),
1224 fh_get_file_name (r->fh), (s >> 16) & 0xff));
1225 f->w = (s >> 8) & 0xff;
1228 if (var_is_alpha (v) != fmt_is_string (f->type))
1229 lose ((ME, _("%s: %s variable %s has %s format specifier %s."),
1230 fh_get_file_name (r->fh),
1231 var_is_alpha (v) ? _("String") : _("Numeric"),
1233 fmt_is_string (f->type) ? _("string") : _("numeric"),
1234 fmt_name (f->type)));
1237 ok = fmt_check_output (f) && fmt_check_width_compat (f, var_get_width (v));
1242 char fmt_string[FMT_STRING_LEN_MAX + 1];
1243 msg (ME, _("%s variable %s has invalid format specifier %s."),
1244 var_is_numeric (v) ? _("Numeric") : _("String"),
1245 var_get_name (v), fmt_to_string (f, fmt_string));
1246 *f = (var_is_numeric (v)
1247 ? fmt_for_output (FMT_F, 8, 2)
1248 : fmt_for_output (FMT_A, var_get_width (v), 0));
1256 /* Reads value labels from sysfile H and inserts them into the
1257 associated dictionary. */
/* Works in two passes because the type 3 record does not say whether
   its values are numeric or string: the raw 8-byte values and their
   labels are stored first, then the type 4 record names the variables,
   and only then are the raw values converted and attached.  VAR_BY_IDX
   maps 1-based value positions to variables (NULL for string
   continuations).  Every label string is freed on both the success and
   the failure path.  Callers treat a zero return as failure. */
1259 read_value_labels (struct sfm_reader *r,
1260 struct dictionary *dict, struct variable **var_by_idx)
1264 char raw_value[8]; /* Value as uninterpreted bytes. */
1265 union value value; /* Value. */
1266 char *label; /* Null-terminated label string. */
1269 struct label *labels = NULL;
1270 int32_t n_labels; /* Number of labels. */
1272 struct variable **var = NULL; /* Associated variables. */
1273 int32_t n_vars; /* Number of associated variables. */
1277 /* First step: read the contents of the type 3 record and record its
1278 contents. Note that we can't do much with the data since we
1279 don't know yet whether it is of numeric or string type. */
1281 /* Read number of labels. */
1282 assertive_buf_read (r, &n_labels, sizeof n_labels, 0);
1283 if (r->reverse_endian)
1284 bswap_int32 (&n_labels);
1286 if ( n_labels >= ((int32_t) ~0) / sizeof *labels)
1288 corrupt_msg(MW, _("%s: Invalid number of labels: %d. Ignoring labels."),
1289 fh_get_file_name (r->fh), n_labels);
1293 /* Allocate memory. */
1294 labels = xcalloc (n_labels, sizeof *labels);
1295 for (i = 0; i < n_labels; i++)
1296 labels[i].label = NULL;
1298 /* Read each value/label tuple into labels[]. */
1299 for (i = 0; i < n_labels; i++)
1301 struct label *label = labels + i;
1302 unsigned char label_len;
1306 assertive_buf_read (r, label->raw_value, sizeof label->raw_value, 0);
1308 /* Read label length. */
1309 assertive_buf_read (r, &label_len, sizeof label_len, 0);
1310 padded_len = ROUND_UP (label_len + 1, sizeof (flt64));
1312 /* Read label, padding. */
1313 label->label = xmalloc (padded_len + 1);
1314 assertive_buf_read (r, label->label, padded_len - 1, 0);
1315 label->label[label_len] = 0;
1318 /* Second step: Read the type 4 record that has the list of
1319 variables to which the value labels are to be applied. */
1321 /* Read record type of type 4 record. */
1325 assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
1326 if (r->reverse_endian)
1327 bswap_int32 (&rec_type);
1330 lose ((ME, _("%s: Variable index record (type 4) does not immediately "
1331 "follow value label record (type 3) as it should."),
1332 fh_get_file_name (r->fh)));
1335 /* Read number of variables associated with value label from type 4
1337 assertive_buf_read (r, &n_vars, sizeof n_vars, 0);
1338 if (r->reverse_endian)
1339 bswap_int32 (&n_vars);
1340 if (n_vars < 1 || n_vars > dict_get_var_cnt (dict))
1341 lose ((ME, _("%s: Number of variables associated with a value label (%d) "
1342 "is not between 1 and the number of variables (%d)."),
1343 fh_get_file_name (r->fh), n_vars, dict_get_var_cnt (dict)));
1345 /* Read the list of variables. */
1346 var = xnmalloc (n_vars, sizeof *var);
1347 for (i = 0; i < n_vars; i++)
1352 /* Read variable index, check range. */
1353 assertive_buf_read (r, &var_idx, sizeof var_idx, 0);
1354 if (r->reverse_endian)
1355 bswap_int32 (&var_idx);
1356 if (var_idx < 1 || var_idx > r->value_cnt)
1357 lose ((ME, _("%s: Variable index associated with value label (%d) is "
1358 "not between 1 and the number of values (%d)."),
1359 fh_get_file_name (r->fh), var_idx, r->value_cnt));
1361 /* Make sure it's a real variable. */
1362 v = var_by_idx[var_idx - 1];
1364 lose ((ME, _("%s: Variable index associated with value label (%d) "
1365 "refers to a continuation of a string variable, not to "
1366 "an actual variable."),
1367 fh_get_file_name (r->fh), var_idx));
1368 if (var_is_long_string (v))
1369 lose ((ME, _("%s: Value labels are not allowed on long string "
1371 fh_get_file_name (r->fh), var_get_name (v)));
1373 /* Add it to the list of variables. */
1377 /* Type check the variables. */
1378 for (i = 1; i < n_vars; i++)
1379 if (var_get_type (var[i]) != var_get_type (var[0]))
1380 lose ((ME, _("%s: Variables associated with value label are not all of "
1381 "identical type. Variable %s has %s type, but variable "
1383 fh_get_file_name (r->fh),
1384 var_get_name (var[0]),
1385 var_is_alpha (var[0]) ? _("string") : _("numeric"),
1386 var_get_name (var[i]),
1387 var_is_alpha (var[i]) ? _("string") : _("numeric")));
1389 /* Fill in labels[].value, now that we know the desired type. */
1390 for (i = 0; i < n_labels; i++)
1392 struct label *label = labels + i;
1394 if (var_is_alpha (var[0]))
/* Bound the copy by the destination value.s, not by label->label, which
   is a char pointer whose size is unrelated to the value. */
1396 const int copy_len = MIN (sizeof label->raw_value,
1397 sizeof label->value.s);
1398 memcpy (label->value.s, label->raw_value, copy_len);
1401 assert (sizeof f == sizeof label->raw_value);
1402 memcpy (&f, label->raw_value, sizeof f);
1403 if (r->reverse_endian)
1409 /* Assign the value_label's to each variable. */
1410 for (i = 0; i < n_vars; i++)
1412 struct variable *v = var[i];
1415 /* Add each label to the variable. */
1416 for (j = 0; j < n_labels; j++)
1418 struct label *label = labels + j;
1419 if (var_add_value_label (v, &label->value, label->label))
1422 if (var_is_numeric (var[0]))
1423 msg (MW, _("%s: File contains duplicate label for value %g for "
1425 fh_get_file_name (r->fh), label->value.f, var_get_name (v));
1427 msg (MW, _("%s: File contains duplicate label for value `%.*s' "
1428 "for variable %s."),
1429 fh_get_file_name (r->fh), var_get_width (v),
1430 label->value.s, var_get_name (v));
1434 for (i = 0; i < n_labels; i++)
1435 free (labels[i].label);
1443 for (i = 0; i < n_labels; i++)
1444 free (labels[i].label);
/* Summary of the visible behavior: when BUF is null and BYTE_CNT is
   nonzero, a buffer of MAX (BYTE_CNT, MIN_ALLOC) bytes is allocated with
   xmalloc.  The data is requested as a single fread item of BYTE_CNT
   bytes.  A short read is reported as an I/O error when ferror is set on
   the stream, otherwise as an unexpected end of file.  A BYTE_CNT of
   zero is handled separately before the read. */
1451 /* Reads BYTE_CNT bytes from the file represented by H. If BUF is
1452 non-NULL, uses that as the buffer; otherwise allocates at least
1453 MIN_ALLOC bytes. Returns a pointer to the buffer on success, NULL
1456 buf_read (struct sfm_reader *r, void *buf, size_t byte_cnt, size_t min_alloc)
1460 if (buf == NULL && byte_cnt > 0 )
1461 buf = xmalloc (MAX (byte_cnt, min_alloc));
1463 if ( byte_cnt == 0 )
1467 if (1 != fread (buf, byte_cnt, 1, r->file))
1469 if (ferror (r->file))
1470 msg (ME, _("%s: Reading system file: %s."),
1471 fh_get_file_name (r->fh), strerror (errno));
1473 corrupt_msg (ME, _("%s: Unexpected end of file."),
1474 fh_get_file_name (r->fh));
1482 /* Winds the reader BYTE_CNT bytes back in the reader stream. */
1484 buf_unread(struct sfm_reader *r, size_t byte_cnt)
1486 assert(byte_cnt > 0);
1488 if ( 0 != fseek(r->file, -byte_cnt, SEEK_CUR))
1490 msg (ME, _("%s: Seeking system file: %s."),
1491 fh_get_file_name (r->fh), strerror (errno));
1495 /* Reads a document record, type 6, from system file R, and sets up
1496 the documents and n_documents fields in the associated
1499 read_documents (struct sfm_reader *r, struct dictionary *dict)
1504 if (dict_get_documents (dict) != NULL)
1505 lose ((ME, _("%s: System file contains multiple "
1506 "type 6 (document) records."),
1507 fh_get_file_name (r->fh)));
1509 assertive_buf_read (r, &line_cnt, sizeof line_cnt, 0);
1511 lose ((ME, _("%s: Number of document lines (%ld) "
1512 "must be greater than 0."),
1513 fh_get_file_name (r->fh), (long) line_cnt));
1515 documents = buf_read (r, NULL, 80 * line_cnt, line_cnt * 80 + 1);
1516 /* FIXME? Run through asciify. */
1517 if (documents == NULL)
1519 documents[80 * line_cnt] = '\0';
1520 dict_set_documents (dict, documents);
1530 /* Reads compressed data into R->buf and sets R's other buffer
1531 pointers appropriately. Returns nonzero only if no error occurs and
1534 buffer_input (struct sfm_reader *r)
/* Room for 128 elements of R's input buffer.
   NOTE(review): any guard on this allocation is not visible in
   this listing; confirm that the buffer is allocated only once. */
1541 r->buf = xnmalloc (128, sizeof *r->buf);
/* Ask for a full buffer; AMT is the number of whole elements
   actually read, which may be fewer than 128. */
1542 amt = fread (r->buf, sizeof *r->buf, 128, r->file);
/* A short read is reported only when the stream's error indicator
   is set, using the errno text. */
1543 if (ferror (r->file))
1545 msg (ME, _("%s: Error reading file: %s."),
1546 fh_get_file_name (r->fh), strerror (errno));
/* R->end points one past the last element that was read. */
1551 r->end = &r->buf[amt];
1555 /* Reads a single case consisting of compressed data from reader
1556 R's system file into the array BUF[], which must have room for
1557 R->value_cnt values, and returns nonzero only if successful. */
1558 /* Data in system files is compressed in this manner. Data
1559 values are grouped into sets of eight ("octets"). Each value
1560 in an octet has one instruction byte, and the eight instruction
1561 bytes are output together. Each instruction byte gives a value
1562 directly or indicates that it follows the instructions. */
1564 read_compressed_data (struct sfm_reader *r, flt64 *buf)
/* R->x holds the current octet of instruction bytes (it is refilled
   from the input buffer further down); P_END is one past its last
   byte. */
1566 const unsigned char *p_end = r->x + sizeof (flt64);
/* Start from the instruction byte position held in R->y. */
1567 unsigned char *p = r->y;
/* BUF_BEG and BUF_END bound the output: the start of BUF and one
   past the R->value_cnt values that make up a case. */
1569 const flt64 *buf_beg = buf;
1570 const flt64 *buf_end = &buf[r->value_cnt];
/* Interpret each remaining instruction byte of the current octet. */
1574 for (; p < p_end; p++){
1578 /* Code 0 is ignored. */
1581 /* Code 252 is end of file. */
1584 lose ((ME, _("%s: Compressed data is corrupted. Data ends "
1585 "in partial case."),
1586 fh_get_file_name (r->fh)));
1588 /* Code 253 indicates that the value is stored explicitly
1589 following the instruction bytes. */
/* Refill the input buffer first if it is unset or exhausted. */
1590 if (r->ptr == NULL || r->ptr >= r->end)
1591 if (!buffer_input (r))
1592 lose ((ME, _("%s: Unexpected end of file."),
1593 fh_get_file_name (r->fh)));
/* Copy one element verbatim from the input buffer to the output,
   advancing both. */
1594 memcpy (buf++, r->ptr++, sizeof *buf);
1599 /* Code 254 indicates a string that is all blanks. */
1600 memset (buf++, ' ', sizeof *buf);
1605 /* Code 255 indicates the system-missing value. */
/* NOTE(review): the statement guarded by the next test is not
   visible in this listing -- confirm what is done for files of
   opposite endianness. */
1607 if (r->reverse_endian)
1614 /* Codes 1 through 251 inclusive are taken to indicate a
1615 value of (BYTE - BIAS), where BYTE is the byte's value
1616 and BIAS is the compression bias (generally 100.0). */
1617 *buf = *p - r->bias;
1618 if (r->reverse_endian)
1626 /* We have reached the end of this instruction octet. Read
1628 if (r->ptr == NULL || r->ptr >= r->end)
1630 if (!buffer_input (r))
1633 lose ((ME, _("%s: Unexpected end of file."),
1634 fh_get_file_name (r->fh)));
1639 memcpy (r->x, r->ptr++, sizeof *buf);
1646 /* We have filled up an entire record. Update state and return
1657 /* Reads one case from READER's file into C. Returns nonzero
1658 only if successful. */
1660 sfm_read_case (struct sfm_reader *r, struct ccase *c)
1665 if (!r->compressed && sizeof (flt64) == sizeof (double) && ! r->has_vls)
1667 /* Fast path: external and internal representations are the
1668 same, except possibly for endianness or SYSMIS. Read
1669 directly into the case's buffer, then fix up any minor
1670 details as needed. */
1671 if (!fread_ok (r, case_data_all_rw (c),
1672 sizeof (union value) * r->value_cnt))
1675 /* Fix up endianness if needed. */
1676 if (r->reverse_endian)
1680 for (i = 0; i < r->var_cnt; i++)
1681 if (r->vars[i].width == 0)
1682 bswap_flt64 (&case_data_rw_idx (c, r->vars[i].fv)->f);
1685 /* Fix up SYSMIS values if needed.
1686 I don't think this will ever actually kick in, but it
1688 if (r->sysmis != SYSMIS)
1692 for (i = 0; i < r->var_cnt; i++)
1693 if (r->vars[i].width == 0 && case_num_idx (c, i) == r->sysmis)
1694 case_data_rw_idx (c, r->vars[i].fv)->f = SYSMIS;
1699 /* Slow path: internal and external representations differ.
1700 Read into a bounce buffer, then copy to C. */
1707 bounce_size = sizeof *bounce * r->value_cnt;
1708 bounce = bounce_cur = local_alloc (bounce_size);
1710 memset(bounce, 0, bounce_size);
1713 read_ok = fread_ok (r, bounce, bounce_size);
1715 read_ok = read_compressed_data (r, bounce);
1718 local_free (bounce);
1722 for (i = 0; i < r->var_cnt; i++)
1724 struct sfm_var *sv = &r->vars[i];
1728 flt64 f = *bounce_cur++;
1729 if (r->reverse_endian)
1731 case_data_rw_idx (c, sv->fv)->f = f == r->sysmis ? SYSMIS : f;
1735 flt64 *bc_start = bounce_cur;
1737 while (ofs < sv->width )
1739 const int chunk = MIN (MIN_VERY_LONG_STRING - 1,
1741 memcpy (case_data_rw_idx (c, sv->fv)->s + ofs,
1744 bounce_cur += DIV_RND_UP (chunk, sizeof (flt64));
1748 bounce_cur = bc_start + sfm_width_to_bytes (sv->width) / sizeof(flt64);
1752 local_free (bounce);
/* Reads BYTE_CNT bytes from reader R's file into BUFFER.  A read of
   exactly BYTE_CNT bytes is the success case.  Otherwise a stream
   error is reported with the errno text, and a short but nonempty
   read without a stream error is reported as a partial record; a
   read of zero bytes without a stream error produces no message
   here.
   NOTE(review): the return statements are not visible in this
   listing; callers in this file treat the result as nonzero on
   success -- confirm. */
1758 fread_ok (struct sfm_reader *r, void *buffer, size_t byte_cnt)
/* Elements of size 1 are requested so that fread's return value is
   the number of bytes actually read. */
1760 size_t read_bytes = fread (buffer, 1, byte_cnt, r->file);
1762 if (read_bytes == byte_cnt)
/* Short read: distinguish an I/O error from running out of data. */
1766 if (ferror (r->file))
1768 msg (ME, _("%s: Reading system file: %s."),
1769 fh_get_file_name (r->fh), strerror (errno));
/* No stream error, but some bytes were read: the data stopped
   partway through the requested record. */
1772 else if (read_bytes != 0)
1774 msg (ME, _("%s: Partial record at end of system file."),
1775 fh_get_file_name (r->fh));
1782 /* Returns true if an I/O error has occurred on READER, false
1785 sfm_read_error (const struct sfm_reader *reader)
1790 /* Returns true if FILE is an SPSS system file,
1793 sfm_detect (FILE *file)
1795 struct sysfile_header hdr;
1797 if (fread (&hdr, sizeof hdr, 1, file) != 1)
1799 if (strncmp ("$FL2", hdr.rec_type, 4))