1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
31 #include "file-handle.h"
38 #include "value-labels.h"
41 #include "debug-print.h"
43 /* pfm's file_handle extension. */
46 FILE *file; /* Actual file. */
48 struct dictionary *dict; /* File's dictionary. */
49 int weight_index; /* 0-based index of weight variable, or -1. */
51 unsigned char *trans; /* 256-byte character set translation table. */
53 int nvars; /* Number of variables. */
54 int *vars; /* Variable widths, 0 for numeric. */
55 int case_size; /* Number of `value's per case. */
57 unsigned char buf[83]; /* Input buffer. */
58 unsigned char *bp; /* Buffer pointer. */
59 int cc; /* Current character. */
62 static struct fh_ext_class pfm_r_class;
65 corrupt_msg (struct file_handle *h, const char *format,...)
68 /* Displays a corruption error. */
70 corrupt_msg (struct file_handle *h, const char *format, ...)
72 struct pfm_fhuser_ext *ext = h->ext;
78 va_start (args, format);
79 vsnprintf (buf, 1024, format, args);
89 getl_location (&e.where.filename, &e.where.line_number);
90 filename = handle_get_filename (h);
91 e.title = title = local_alloc (strlen (filename) + 80);
92 sprintf (title, _("portable file %s corrupt at offset %ld: "),
93 filename, ftell (ext->file) - (82 - (long) (ext->bp - ext->buf)));
104 /* Closes a portable file after we're done with it. */
106 pfm_close (struct file_handle *h)
108 struct pfm_fhuser_ext *ext = h->ext;
110 if (EOF == fclose (ext->file))
111 msg (ME, _("%s: Closing portable file: %s."),
112 handle_get_filename (h), strerror (errno));
118 /* Displays the message X with corrupt_msg, then jumps to the lossage
128 /* Read an 80-character line into handle H's buffer. Return
131 fill_buf (struct file_handle *h)
133 struct pfm_fhuser_ext *ext = h->ext;
135 if (80 != fread (ext->buf, 1, 80, ext->file))
136 lose ((h, _("Unexpected end of file.")));
138 /* PORTME: line ends. */
142 c = getc (ext->file);
143 if (c != '\n' && c != '\r')
144 lose ((h, _("Bad line end.")));
146 c = getc (ext->file);
147 if (c != '\n' && c != '\r')
148 ungetc (c, ext->file);
155 for (i = 0; i < 80; i++)
156 ext->buf[i] = ext->trans[ext->buf[i]];
167 /* Read a single character into ext->cc.  Returns success. */
169 read_char (struct file_handle *h)
171 struct pfm_fhuser_ext *ext = h->ext;
173 if (ext->bp >= &ext->buf[80] && !fill_buf (h))
175 ext->cc = *ext->bp++;
179 /* Advance a single character; jumps to the `lossage' label if read_char fails. */
180 #define advance() if (!read_char (h)) goto lossage
182 /* Skip a single character if present, and return whether it was
185 skip_char (struct file_handle *h, int c)
187 struct pfm_fhuser_ext *ext = h->ext;
198 /* Skip a single character if present, and return whether it was
200 #define match(C) skip_char (h, C)
202 static int read_header (struct file_handle *h);
203 static int read_version_data (struct file_handle *h, struct pfm_read_info *inf);
204 static int read_variables (struct file_handle *h);
205 static int read_value_label (struct file_handle *h);
206 void dump_dictionary (struct dictionary *dict);
208 /* Reads the dictionary from file with handle H, and returns it in a
209 dictionary structure. This dictionary may be modified in order to
210 rename, reorder, and delete variables, etc. */
212 pfm_read_dictionary (struct file_handle *h, struct pfm_read_info *inf)
214 /* The file handle extension record. */
215 struct pfm_fhuser_ext *ext;
217 /* Check whether the file is already open. */
218 if (h->class == &pfm_r_class)
223 else if (h->class != NULL)
225 msg (ME, _("Cannot read file %s as portable file: already opened "
227 handle_get_name (h), h->class->name);
231 msg (VM (1), _("%s: Opening portable-file handle %s for reading."),
232 handle_get_filename (h), handle_get_name (h));
234 /* Open the physical disk file. */
235 ext = xmalloc (sizeof (struct pfm_fhuser_ext));
236 ext->file = fopen (handle_get_filename (h), "rb");
237 if (ext->file == NULL)
239 msg (ME, _("An error occurred while opening \"%s\" for reading "
240 "as a portable file: %s."),
241 handle_get_filename (h), strerror (errno));
247 /* Initialize the pfm_fhuser_ext structure. */
248 h->class = &pfm_r_class;
256 /* Read the header. */
257 if (!read_header (h))
260 /* Read version, date info, product identification. */
261 if (!read_version_data (h, inf))
264 /* Read variables. */
265 if (!read_variables (h))
269 while (match (77 /* D */))
270 if (!read_value_label (h))
273 if (!match (79 /* F */))
274 lose ((h, _("Data record expected.")));
276 msg (VM (2), _("Read portable-file dictionary successfully."));
281 /* Come here on unsuccessful completion. */
282 msg (VM (1), _("Error reading portable-file dictionary."));
285 if (ext && ext->dict)
286 dict_destroy (ext->dict);
293 /* Read a floating point value and return its value, or
294 second_lowest_value on error. */
296 read_float (struct file_handle *h)
298 struct pfm_fhuser_ext *ext = h->ext;
305 /* Skip leading spaces. */
306 while (match (126 /* space */))
309 if (match (137 /* * */))
311 advance (); /* Probably a dot (.) but doesn't appear to matter. */
314 else if (match (141 /* - */))
319 if (ext->cc >= 64 /* 0 */ && ext->cc <= 93 /* T */)
323 /* Make sure that multiplication by 30 will not overflow. */
324 if (num > DBL_MAX * (1. / 30.))
325 /* The value of the digit doesn't matter, since we have already
326 gotten as many digits as can be represented in a `double'.
327 This doesn't necessarily mean the result will overflow.
328 The exponent may reduce it to within range.
330 We just need to record that there was another
331 digit so that we can multiply by 30 later. */
334 num = (num * 30.0) + (ext->cc - 64);
336 /* Keep track of the number of digits after the decimal point.
337 If we just divided by 30 here, we would lose precision. */
341 else if (!got_dot && ext->cc == 127 /* . */)
342 /* Record that we have found the decimal point. */
345 /* Any other character terminates the number. */
352 lose ((h, "Number expected."));
354 if (ext->cc == 130 /* + */ || ext->cc == 141 /* - */)
356 /* Get the exponent. */
358 int neg_exp = ext->cc == 141 /* - */;
364 if (ext->cc < 64 /* 0 */ || ext->cc > 93 /* T */)
367 if (exp > LONG_MAX / 30)
369 exp = exp * 30 + (ext->cc - 64);
372 /* We don't check whether there were actually any digits, but we
379 if (!match (142 /* / */))
380 lose ((h, _("Missing numeric terminator.")));
382 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
385 num *= pow (30.0, (double) exponent);
386 else if (exponent > 0)
388 if (num > DBL_MAX * pow (30.0, (double) -exponent))
390 num *= pow (30.0, (double) exponent);
400 return -DBL_MAX / 10.;
405 return second_lowest_value;
408 /* Read an integer and return its value, or NOT_INT on failure. */
410 read_int (struct file_handle *h)
412 double f = read_float (h);
414 if (f == second_lowest_value)
416 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
417 lose ((h, _("Bad integer format.")));
424 /* Reads a string and returns its value in a static buffer, or NULL on
425 failure. The buffer can be deallocated by calling with a NULL
427 static unsigned char *
428 read_string (struct file_handle *h)
430 struct pfm_fhuser_ext *ext = h->ext;
440 else if (buf == NULL)
446 if (n < 0 || n > 255)
447 lose ((h, _("Bad string length %d."), n));
452 for (i = 0; i < n; i++)
466 /* Reads the 464-byte file header. */
468 read_header (struct file_handle *h)
470 struct pfm_fhuser_ext *ext = h->ext;
472 /* For now at least, just ignore the vanity splash strings. */
476 for (i = 0; i < 200; i++)
481 unsigned char src[256];
485 for (i = 0; i < 256; i++)
487 src[i] = (unsigned char) ext->cc;
491 for (i = 0; i < 256; i++)
494 /* 0 is used to mark untranslatable characters, so we have to mark
496 trans_temp[src[64]] = 64;
497 for (i = 0; i < 256; i++)
498 if (trans_temp[src[i]] == -1)
499 trans_temp[src[i]] = i;
501 ext->trans = xmalloc (256);
502 for (i = 0; i < 256; i++)
503 ext->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
505 /* Translate the input buffer. */
506 for (i = 0; i < 80; i++)
507 ext->buf[i] = ext->trans[ext->buf[i]];
508 ext->cc = ext->trans[ext->cc];
512 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
515 for (i = 0; i < 8; i++)
517 lose ((h, "Missing SPSSPORT signature."));
526 /* Reads the version and date info record, as well as product and
527 subproduct identification records if present. */
529 read_version_data (struct file_handle *h, struct pfm_read_info *inf)
531 struct pfm_fhuser_ext *ext = h->ext;
534 if (!match (74 /* A */))
535 lose ((h, "Unrecognized version code %d.", ext->cc));
539 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
540 char *date = read_string (h);
545 if (strlen (date) != 8)
546 lose ((h, _("Bad date string length %d."), strlen (date)));
547 for (i = 0; i < 8; i++)
549 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
550 lose ((h, _("Bad character in date.")));
552 inf->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
556 inf->creation_date[2] = inf->creation_date[5] = ' ';
557 inf->creation_date[10] = 0;
563 static const int map[] = {0, 1, 3, 4, 6, 7};
564 char *time = read_string (h);
569 if (strlen (time) != 6)
570 lose ((h, _("Bad time string length %d."), strlen (time)));
571 for (i = 0; i < 6; i++)
573 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
574 lose ((h, _("Bad character in time.")));
576 inf->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
580 inf->creation_time[2] = inf->creation_time[5] = ' ';
581 inf->creation_time[8] = 0;
586 if (match (65 /* 1 */))
590 product = read_string (h);
594 strncpy (inf->product, product, 61);
600 if (match (67 /* 3 */))
604 subproduct = read_string (h);
605 if (subproduct == NULL)
608 strncpy (inf->subproduct, subproduct, 61);
611 inf->subproduct[0] = 0;
619 convert_format (struct file_handle *h, int fmt[3], struct fmt_spec *v,
622 v->type = translate_fmt (fmt[0]);
624 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
628 /* FIXME? Should verify the resulting specifier more thoroughly. */
631 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
632 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
633 lose ((h, _("%s variable %s has %s format specifier %s."),
634 vv->type == ALPHA ? _("String") : _("Numeric"),
636 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
637 formats[v->type].name));
644 /* Translation table from SPSS character code to this computer's
645 native character code (which is probably ASCII). */
646 static const unsigned char spss2ascii[256] =
649 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
650 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
654 /* Translate string S into ASCII. */
659 *s = spss2ascii[(unsigned char) *s];
662 static int parse_value (struct file_handle *, union value *, struct variable *);
664 /* Read information on all the variables. */
666 read_variables (struct file_handle *h)
668 struct pfm_fhuser_ext *ext = h->ext;
669 char *weight_name = NULL;
672 if (!match (68 /* 4 */))
673 lose ((h, _("Expected variable count record.")));
675 ext->nvars = read_int (h);
676 if (ext->nvars <= 0 || ext->nvars == NOT_INT)
677 lose ((h, _("Invalid number of variables %d."), ext->nvars));
678 ext->vars = xmalloc (sizeof *ext->vars * ext->nvars);
680 /* Purpose of this value is unknown. It is typically 161. */
682 int x = read_int (h);
687 corrupt_msg (h, _("Unexpected flag value %d."), x);
690 ext->dict = dict_create ();
692 if (match (70 /* 6 */))
694 weight_name = read_string (h);
698 asciify (weight_name);
699 if (strlen (weight_name) > 8)
701 corrupt_msg (h, _("Weight variable name (%s) truncated."),
703 weight_name[8] = '\0';
707 for (i = 0; i < ext->nvars; i++)
715 if (!match (71 /* 7 */))
716 lose ((h, _("Expected variable record.")));
718 width = read_int (h);
719 if (width == NOT_INT)
722 lose ((h, _("Invalid variable width %d."), width));
723 ext->vars[i] = width;
725 name = read_string (h);
728 for (j = 0; j < 6; j++)
730 fmt[j] = read_int (h);
731 if (fmt[j] == NOT_INT)
735 /* Verify first character of variable name.
737 Weirdly enough, there is no # character in the SPSS portable
738 character set, so we can't check for it. */
739 if (strlen (name) > 8)
740 lose ((h, _("position %d: Variable name has %u characters."),
742 if ((name[0] < 74 /* A */ || name[0] > 125 /* Z */)
743 && name[0] != 152 /* @ */)
744 lose ((h, _("position %d: Variable name begins with invalid "
746 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
748 corrupt_msg (h, _("position %d: Variable name begins with "
749 "lowercase letter %c."),
750 i, name[0] - 100 + 'a');
751 name[0] -= 26 /* a - A */;
754 /* Verify remaining characters of variable name. */
755 for (j = 1; j < (int) strlen (name); j++)
759 if (c >= 100 /* a */ && c <= 125 /* z */)
761 corrupt_msg (h, _("position %d: Variable name character %d "
762 "is lowercase letter %c."),
763 i, j + 1, c - 100 + 'a');
764 name[j] -= 26 /* z - Z */;
766 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
767 || c == 127 /* . */ || c == 152 /* @ */
768 || c == 136 /* $ */ || c == 146 /* _ */)
771 lose ((h, _("position %d: character `\\%03o' is not "
772 "valid in a variable name."), i, c));
776 if (width < 0 || width > 255)
777 lose ((h, "Bad width %d for variable %s.", width, name));
779 v = dict_create_var (ext->dict, name, width);
782 lose ((h, _("Duplicate variable name %s."), name));
783 if (!convert_format (h, &fmt[0], &v->print, v))
785 if (!convert_format (h, &fmt[3], &v->write, v))
788 /* Range missing values. */
789 if (match (75 /* B */))
791 v->miss_type = MISSING_RANGE;
792 if (!parse_value (h, &v->missing[0], v)
793 || !parse_value (h, &v->missing[1], v))
796 else if (match (74 /* A */))
798 v->miss_type = MISSING_HIGH;
799 if (!parse_value (h, &v->missing[0], v))
802 else if (match (73 /* 9 */))
804 v->miss_type = MISSING_LOW;
805 if (!parse_value (h, &v->missing[0], v))
809 /* Single missing values. */
810 while (match (72 /* 8 */))
812 static const int map_next[MISSING_COUNT] =
814 MISSING_1, MISSING_2, MISSING_3, -1,
815 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
819 static const int map_ofs[MISSING_COUNT] =
821 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
824 v->miss_type = map_next[v->miss_type];
825 if (v->miss_type == -1)
826 lose ((h, _("Bad missing values for %s."), v->name));
828 assert (map_ofs[v->miss_type] != -1);
829 if (!parse_value (h, &v->missing[map_ofs[v->miss_type]], v))
833 if (match (76 /* C */))
835 char *label = read_string (h);
840 v->label = xstrdup (label);
845 if (weight_name != NULL)
847 struct variable *weight_var = dict_lookup_var (ext->dict, weight_name);
848 if (weight_var == NULL)
849 lose ((h, _("Weighting variable %s not present in dictionary."),
853 dict_set_weight (ext->dict, weight_var);
863 /* Parse a value for variable VV into value V. Returns success. */
865 parse_value (struct file_handle *h, union value *v, struct variable *vv)
867 if (vv->type == ALPHA)
869 char *mv = read_string (h);
875 strncpy (v->s, mv, 8);
876 for (j = 0; j < 8; j++)
878 v->s[j] = spss2ascii[v->s[j]];
880 /* Value labels are always padded with spaces. */
885 v->f = read_float (h);
886 if (v->f == second_lowest_value)
893 /* Parse a value label record and return success. */
895 read_value_label (struct file_handle *h)
897 struct pfm_fhuser_ext *ext = h->ext;
912 v = xmalloc (sizeof *v * nv);
913 for (i = 0; i < nv; i++)
915 char *name = read_string (h);
920 v[i] = dict_lookup_var (ext->dict, name);
922 lose ((h, _("Unknown variable %s while parsing value labels."), name));
924 if (v[0]->width != v[i]->width)
925 lose ((h, _("Cannot assign value labels to %s and %s, which "
926 "have different variable types or widths."),
927 v[0]->name, v[i]->name));
930 n_labels = read_int (h);
931 if (n_labels == NOT_INT)
934 for (i = 0; i < n_labels; i++)
941 if (!parse_value (h, &val, v[0]))
944 label = read_string (h);
949 /* Assign the value_label's to each variable. */
950 for (j = 0; j < nv; j++)
952 struct variable *var = v[j];
954 if (!val_labs_replace (var->val_labs, val, label))
957 if (var->type == NUMERIC)
958 lose ((h, _("Duplicate label for value %g for variable %s."),
961 lose ((h, _("Duplicate label for value `%.*s' for variable %s."),
962 var->width, val.s, var->name));
973 /* Reads one case from portable file H into PERM
974 according to the instructions given in associated dictionary DICT,
975 which must have the get.fv elements appropriately set. Returns
976 nonzero only if successful. */
978 pfm_read_case (struct file_handle *h, struct ccase *perm,
979 struct dictionary *dict)
981 struct pfm_fhuser_ext *ext = h->ext;
983 union value *temp, *tp;
986 /* Check for end of file. */
987 if (ext->cc == 99 /* Z */)
990 /* The first concern is to obtain a full case relative to the data
991 file. (Cases in the data file have no particular relationship to
992 cases in the active file.) */
993 tp = temp = local_alloc (sizeof *tp * ext->case_size);
994 for (tp = temp, i = 0; i < ext->nvars; i++)
995 if (ext->vars[i] == 0)
997 tp->f = read_float (h);
998 if (tp->f == second_lowest_value)
1004 char *s = read_string (h);
1006 goto unexpected_eof;
1009 st_bare_pad_copy (tp->s, s, ext->vars[i]);
1010 tp += DIV_RND_UP (ext->vars[i], MAX_SHORT_STRING);
1013 /* Translate a case in data file format to a case in active file
1015 for (i = 0; i < dict_get_var_cnt (dict); i++)
1017 struct variable *v = dict_get_var (dict, i);
1019 if (v->get.fv == -1)
1022 if (v->type == NUMERIC)
1023 case_data_rw (perm, v->fv)->f = temp[v->get.fv].f;
1025 memcpy (case_data_rw (perm, v->fv)->s, &temp[v->get.fv], v->width);
1032 lose ((h, _("End of file midway through case.")));
1039 static struct fh_ext_class pfm_r_class =
1042 N_("reading as a portable file"),