1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
4 Code for parsing floating-point numbers adapted from GNU C
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
33 #include "dictionary.h"
34 #include "file-handle.h"
41 #include "value-labels.h"
44 #include "debug-print.h"
46 /* Portable file reader. */
49 struct file_handle *fh; /* File handle. */
50 FILE *file; /* File stream. */
52 int weight_index; /* 0-based index of weight variable, or -1. */
54 unsigned char *trans; /* 256-byte character set translation table. */
56 int var_cnt; /* Number of variables. */
57 int *widths; /* Variable widths, 0 for numeric. */
58 int value_cnt; /* Number of `value's per case. */
60 unsigned char buf[83]; /* Input buffer. */
61 unsigned char *bp; /* Buffer pointer. */
62 int cc; /* Current character. */
66 corrupt_msg (struct pfm_reader *r, const char *format,...)
69 /* Displays a corruption error. */
71 corrupt_msg (struct pfm_reader *r, const char *format, ...)
78 va_start (args, format);
79 vsnprintf (buf, 1024, format, args);
89 getl_location (&e.where.filename, &e.where.line_number);
90 filename = handle_get_filename (r->fh);
91 e.title = title = local_alloc (strlen (filename) + 80);
92 sprintf (title, _("portable file %s corrupt at offset %ld: "),
93 filename, ftell (r->file) - (82 - (long) (r->bp - r->buf)));
104 static unsigned char * read_string (struct pfm_reader *r);
106 /* Closes a portable file after we're done with it. */
108 pfm_close_reader (struct pfm_reader *r)
116 fh_close (r->fh, "portable file", "rs");
117 if (fclose (r->file) == EOF)
118 msg (ME, _("%s: Closing portable file: %s."),
119 handle_get_filename (r->fh), strerror (errno));
125 /* Displays the message X with corrupt_msg, then jumps to the error
133 /* Read an 80-character line into handle H's buffer. Return
136 fill_buf (struct pfm_reader *r)
138 if (80 != fread (r->buf, 1, 80, r->file))
139 lose ((r, _("Unexpected end of file.")));
141 /* PORTME: line ends. */
146 if (c != '\n' && c != '\r')
147 lose ((r, _("Bad line end.")));
150 if (c != '\n' && c != '\r')
158 for (i = 0; i < 80; i++)
159 r->buf[i] = r->trans[r->buf[i]];
170 /* Read a single character into R's current character (cc). Return success. */
172 read_char (struct pfm_reader *r)
174 if (r->bp >= &r->buf[80] && !fill_buf (r))
180 /* Advance a single character. */
183 if (!read_char (r)) \
187 /* Skip a single character if present, and return whether it was
190 skip_char (struct pfm_reader *r, int c)
201 /* Skip a single character if present, and return whether it was
203 #define match(C) skip_char (r, C)
205 static int read_header (struct pfm_reader *);
206 static int read_version_data (struct pfm_reader *, struct pfm_read_info *);
207 static int read_variables (struct pfm_reader *, struct dictionary *);
208 static int read_value_label (struct pfm_reader *, struct dictionary *);
209 void dump_dictionary (struct dictionary *);
211 /* Reads the dictionary from the file with handle FH and stores it in
212 *DICT. This dictionary may be modified in order to
213 rename, reorder, and delete variables, etc. */
215 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
216 struct pfm_read_info *info)
218 struct pfm_reader *r = NULL;
220 *dict = dict_create ();
221 if (!fh_open (fh, "portable file", "rs"))
224 /* Create and initialize reader. */
225 r = xmalloc (sizeof *r);
227 r->file = fopen (handle_get_filename (r->fh), "rb");
228 r->weight_index = -1;
235 /* Check that file open succeeded, prime reading. */
238 msg (ME, _("An error occurred while opening \"%s\" for reading "
239 "as a portable file: %s."),
240 handle_get_filename (r->fh), strerror (errno));
248 /* Read header, version, date info, product id, variables. */
250 || !read_version_data (r, info)
251 || !read_variables (r, *dict))
254 /* Read value labels. */
255 while (match (77 /* D */))
256 if (!read_value_label (r, *dict))
259 /* Check that we've made it to the data. */
260 if (!match (79 /* F */))
261 lose ((r, _("Data record expected.")));
266 pfm_close_reader (r);
267 dict_destroy (*dict);
272 /* Read a floating point value and return its value, or
273 second_lowest_value on error. */
275 read_float (struct pfm_reader *r)
283 /* Skip leading spaces. */
284 while (match (126 /* space */))
287 if (match (137 /* * */))
289 advance (); /* Probably a dot (.) but doesn't appear to matter. */
292 else if (match (141 /* - */))
297 if (r->cc >= 64 /* 0 */ && r->cc <= 93 /* T */)
301 /* Make sure that multiplication by 30 will not overflow. */
302 if (num > DBL_MAX * (1. / 30.))
303 /* The value of the digit doesn't matter, since we have already
304 gotten as many digits as can be represented in a `double'.
305 This doesn't necessarily mean the result will overflow.
306 The exponent may reduce it to within range.
308 We just need to record that there was another
309 digit so that we can multiply by 30 later. */
312 num = (num * 30.0) + (r->cc - 64);
314 /* Keep track of the number of digits after the decimal point.
315 If we just divided by 30 here, we would lose precision. */
319 else if (!got_dot && r->cc == 127 /* . */)
320 /* Record that we have found the decimal point. */
323 /* Any other character terminates the number. */
330 lose ((r, "Number expected."));
332 if (r->cc == 130 /* + */ || r->cc == 141 /* - */)
334 /* Get the exponent. */
336 int neg_exp = r->cc == 141 /* - */;
342 if (r->cc < 64 /* 0 */ || r->cc > 93 /* T */)
345 if (exp > LONG_MAX / 30)
347 exp = exp * 30 + (r->cc - 64);
350 /* We don't check whether there were actually any digits, but we
357 if (!match (142 /* / */))
358 lose ((r, _("Missing numeric terminator.")));
360 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
363 num *= pow (30.0, (double) exponent);
364 else if (exponent > 0)
366 if (num > DBL_MAX * pow (30.0, (double) -exponent))
368 num *= pow (30.0, (double) exponent);
378 return -DBL_MAX / 10.;
383 return second_lowest_value;
386 /* Read an integer and return its value, or NOT_INT on failure. */
388 read_int (struct pfm_reader *r)
390 double f = read_float (r);
392 if (f == second_lowest_value)
394 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
395 lose ((r, _("Bad integer format.")));
402 /* Reads a string and returns its value in a static buffer, or NULL on
403 failure. The buffer can be deallocated by calling with a NULL
405 static unsigned char *
406 read_string (struct pfm_reader *r)
417 else if (buf == NULL)
423 if (n < 0 || n > 255)
424 lose ((r, _("Bad string length %d."), n));
429 for (i = 0; i < n; i++)
443 /* Reads the 464-byte file header. */
445 read_header (struct pfm_reader *r)
447 /* For now at least, just ignore the vanity splash strings. */
451 for (i = 0; i < 200; i++)
456 unsigned char src[256];
460 for (i = 0; i < 256; i++)
462 src[i] = (unsigned char) r->cc;
466 for (i = 0; i < 256; i++)
469 /* 0 is used to mark untranslatable characters, so we have to mark
471 trans_temp[src[64]] = 64;
472 for (i = 0; i < 256; i++)
473 if (trans_temp[src[i]] == -1)
474 trans_temp[src[i]] = i;
476 r->trans = xmalloc (256);
477 for (i = 0; i < 256; i++)
478 r->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
480 /* Translate the input buffer. */
481 for (i = 0; i < 80; i++)
482 r->buf[i] = r->trans[r->buf[i]];
483 r->cc = r->trans[r->cc];
487 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
490 for (i = 0; i < 8; i++)
492 lose ((r, "Missing SPSSPORT signature."));
501 /* Reads the version and date info record, as well as product and
502 subproduct identification records if present. */
504 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
507 if (!match (74 /* A */))
508 lose ((r, "Unrecognized version code %d.", r->cc));
512 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
513 char *date = read_string (r);
518 if (strlen (date) != 8)
519 lose ((r, _("Bad date string length %d."), strlen (date)));
520 for (i = 0; i < 8; i++)
522 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
523 lose ((r, _("Bad character in date.")));
525 info->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
529 info->creation_date[2] = info->creation_date[5] = ' ';
530 info->creation_date[10] = 0;
536 static const int map[] = {0, 1, 3, 4, 6, 7};
537 char *time = read_string (r);
542 if (strlen (time) != 6)
543 lose ((r, _("Bad time string length %d."), strlen (time)));
544 for (i = 0; i < 6; i++)
546 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
547 lose ((r, _("Bad character in time.")));
549 info->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
553 info->creation_time[2] = info->creation_time[5] = ' ';
554 info->creation_time[8] = 0;
559 if (match (65 /* 1 */))
563 product = read_string (r);
567 strncpy (info->product, product, 61);
570 info->product[0] = 0;
573 if (match (67 /* 3 */))
577 subproduct = read_string (r);
578 if (subproduct == NULL)
581 strncpy (info->subproduct, subproduct, 61);
584 info->subproduct[0] = 0;
592 convert_format (struct pfm_reader *r, int fmt[3], struct fmt_spec *v,
595 v->type = translate_fmt (fmt[0]);
597 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
601 /* FIXME? Should verify the resulting specifier more thoroughly. */
604 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
605 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
606 lose ((r, _("%s variable %s has %s format specifier %s."),
607 vv->type == ALPHA ? _("String") : _("Numeric"),
609 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
610 formats[v->type].name));
617 /* Translation table from SPSS character code to this computer's
618 native character code (which is probably ASCII). */
619 static const unsigned char spss2ascii[256] =
622 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
623 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
627 /* Translate string S into ASCII. */
632 *s = spss2ascii[(unsigned char) *s];
635 static int parse_value (struct pfm_reader *, union value *, struct variable *);
637 /* Read information on all the variables. */
639 read_variables (struct pfm_reader *r, struct dictionary *dict)
641 char *weight_name = NULL;
644 if (!match (68 /* 4 */))
645 lose ((r, _("Expected variable count record.")));
647 r->var_cnt = read_int (r);
648 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
649 lose ((r, _("Invalid number of variables %d."), r->var_cnt));
650 r->widths = xmalloc (sizeof *r->widths * r->var_cnt);
652 /* Purpose of this value is unknown. It is typically 161. */
654 int x = read_int (r);
659 corrupt_msg (r, _("Unexpected flag value %d."), x);
662 if (match (70 /* 6 */))
664 weight_name = read_string (r);
668 asciify (weight_name);
669 if (strlen (weight_name) > 8)
671 corrupt_msg (r, _("Weight variable name (%s) truncated."),
673 weight_name[8] = '\0';
677 for (i = 0; i < r->var_cnt; i++)
685 if (!match (71 /* 7 */))
686 lose ((r, _("Expected variable record.")));
688 width = read_int (r);
689 if (width == NOT_INT)
692 lose ((r, _("Invalid variable width %d."), width));
693 r->widths[i] = width;
695 name = read_string (r);
698 for (j = 0; j < 6; j++)
700 fmt[j] = read_int (r);
701 if (fmt[j] == NOT_INT)
705 /* Verify first character of variable name.
707 Weirdly enough, there is no # character in the SPSS portable
708 character set, so we can't check for it. */
709 if (strlen (name) > 8)
710 lose ((r, _("position %d: Variable name has %u characters."),
712 if ((name[0] < 74 /* A */ || name[0] > 125 /* z */)
713 && name[0] != 152 /* @ */)
714 lose ((r, _("position %d: Variable name begins with invalid "
716 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
718 corrupt_msg (r, _("position %d: Variable name begins with "
719 "lowercase letter %c."),
720 i, name[0] - 100 + 'a');
721 name[0] -= 26 /* a - A */;
724 /* Verify remaining characters of variable name. */
725 for (j = 1; j < (int) strlen (name); j++)
729 if (c >= 100 /* a */ && c <= 125 /* z */)
731 corrupt_msg (r, _("position %d: Variable name character %d "
732 "is lowercase letter %c."),
733 i, j + 1, c - 100 + 'a');
734 name[j] -= 26 /* z - Z */;
736 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
737 || c == 127 /* . */ || c == 152 /* @ */
738 || c == 136 /* $ */ || c == 146 /* _ */)
741 lose ((r, _("position %d: character `\\%03o' is not "
742 "valid in a variable name."), i, c));
746 if (width < 0 || width > 255)
747 lose ((r, "Bad width %d for variable %s.", width, name));
749 v = dict_create_var (dict, name, width);
751 lose ((r, _("Duplicate variable name %s."), name));
752 if (!convert_format (r, &fmt[0], &v->print, v))
754 if (!convert_format (r, &fmt[3], &v->write, v))
757 /* Range missing values. */
758 if (match (75 /* B */))
760 v->miss_type = MISSING_RANGE;
761 if (!parse_value (r, &v->missing[0], v)
762 || !parse_value (r, &v->missing[1], v))
765 else if (match (74 /* A */))
767 v->miss_type = MISSING_HIGH;
768 if (!parse_value (r, &v->missing[0], v))
771 else if (match (73 /* 9 */))
773 v->miss_type = MISSING_LOW;
774 if (!parse_value (r, &v->missing[0], v))
778 /* Single missing values. */
779 while (match (72 /* 8 */))
781 static const int map_next[MISSING_COUNT] =
783 MISSING_1, MISSING_2, MISSING_3, -1,
784 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
788 static const int map_ofs[MISSING_COUNT] =
790 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
793 v->miss_type = map_next[v->miss_type];
794 if (v->miss_type == -1)
795 lose ((r, _("Bad missing values for %s."), v->name));
797 assert (map_ofs[v->miss_type] != -1);
798 if (!parse_value (r, &v->missing[map_ofs[v->miss_type]], v))
802 if (match (76 /* C */))
804 char *label = read_string (r);
809 v->label = xstrdup (label);
814 if (weight_name != NULL)
816 struct variable *weight_var = dict_lookup_var (dict, weight_name);
817 if (weight_var == NULL)
818 lose ((r, _("Weighting variable %s not present in dictionary."),
822 dict_set_weight (dict, weight_var);
832 /* Parse a value for variable VV into value V. Returns success. */
834 parse_value (struct pfm_reader *r, union value *v, struct variable *vv)
836 if (vv->type == ALPHA)
838 char *mv = read_string (r);
844 strncpy (v->s, mv, 8);
845 for (j = 0; j < 8; j++)
847 v->s[j] = spss2ascii[v->s[j]];
849 /* Value labels are always padded with spaces. */
854 v->f = read_float (r);
855 if (v->f == second_lowest_value)
862 /* Parse a value label record and return success. */
864 read_value_label (struct pfm_reader *r, struct dictionary *dict)
879 v = xmalloc (sizeof *v * nv);
880 for (i = 0; i < nv; i++)
882 char *name = read_string (r);
887 v[i] = dict_lookup_var (dict, name);
889 lose ((r, _("Unknown variable %s while parsing value labels."), name));
891 if (v[0]->width != v[i]->width)
892 lose ((r, _("Cannot assign value labels to %s and %s, which "
893 "have different variable types or widths."),
894 v[0]->name, v[i]->name));
897 n_labels = read_int (r);
898 if (n_labels == NOT_INT)
901 for (i = 0; i < n_labels; i++)
908 if (!parse_value (r, &val, v[0]))
911 label = read_string (r);
916 /* Assign the value_label's to each variable. */
917 for (j = 0; j < nv; j++)
919 struct variable *var = v[j];
921 if (!val_labs_replace (var->val_labs, val, label))
924 if (var->type == NUMERIC)
925 lose ((r, _("Duplicate label for value %g for variable %s."),
928 lose ((r, _("Duplicate label for value `%.*s' for variable %s."),
929 var->width, val.s, var->name));
940 /* Reads one case from portable file R into C. Returns nonzero
941 only if successful. */
943 pfm_read_case (struct pfm_reader *r, struct ccase *c)
948 /* Check for end of file. */
949 if (r->cc == 99 /* Z */)
953 for (i = 0; i < r->var_cnt; i++)
955 int width = r->widths[i];
959 double f = read_float (r);
960 if (f == second_lowest_value)
963 case_data_rw (c, idx)->f = f;
968 char *s = read_string (r);
973 st_bare_pad_copy (case_data_rw (c, idx)->s, s, width);
974 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
981 lose ((r, _("End of file midway through case.")));