1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
4 Code for parsing floating-point numbers adapted from GNU C
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
33 #include "dictionary.h"
34 #include "file-handle.h"
41 #include "value-labels.h"
44 #include "debug-print.h"
46 /* Portable file reader. */
49 struct file_handle *fh; /* File handle. */
50 FILE *file; /* File stream. */
52 int weight_index; /* 0-based index of weight variable, or -1. */
54 unsigned char *trans; /* 256-byte character set translation table. */
56 int var_cnt; /* Number of variables. */
57 int *widths; /* Variable widths, 0 for numeric. */
58 int value_cnt; /* Number of `value's per case. */
60 unsigned char buf[83]; /* Input buffer. */
61 unsigned char *bp; /* Buffer pointer. */
62 int cc; /* Current character. */
66 corrupt_msg (struct pfm_reader *r, const char *format,...)
69 /* Displays a corruption error. */
71 corrupt_msg (struct pfm_reader *r, const char *format, ...)
79 getl_location (&e.where.filename, &e.where.line_number);
80 filename = handle_get_filename (r->fh);
81 e.title = title = local_alloc (strlen (filename) + 80);
82 sprintf (title, _("portable file %s corrupt at offset %ld: "),
83 filename, ftell (r->file) - (82 - (long) (r->bp - r->buf)));
85 va_start (args, format);
86 err_vmsg (&e, format, args);
94 static unsigned char * read_string (struct pfm_reader *r);
96 /* Closes a portable file after we're done with it. */
98 pfm_close_reader (struct pfm_reader *r)
106 fh_close (r->fh, "portable file", "rs");
107 if (fclose (r->file) == EOF)
108 msg (ME, _("%s: Closing portable file: %s."),
109 handle_get_filename (r->fh), strerror (errno));
115 /* Displays the message X with corrupt_msg, then jumps to the error
123 /* Read an 80-character line into reader R's buffer. Return
126 fill_buf (struct pfm_reader *r)
128 if (80 != fread (r->buf, 1, 80, r->file))
129 lose ((r, _("Unexpected end of file.")));
131 /* PORTME: line ends. */
136 if (c != '\n' && c != '\r')
137 lose ((r, _("Bad line end.")));
140 if (c != '\n' && c != '\r')
148 for (i = 0; i < 80; i++)
149 r->buf[i] = r->trans[r->buf[i]];
160 /* Read a single character into R's current character (cc). Returns success. */
162 read_char (struct pfm_reader *r)
164 if (r->bp >= &r->buf[80] && !fill_buf (r))
170 /* Advance a single character. */
173 if (!read_char (r)) \
177 /* Skip a single character if present, and return whether it was
180 skip_char (struct pfm_reader *r, int c)
191 /* Skip a single character if present, and return whether it was
193 #define match(C) skip_char (r, C)
195 static int read_header (struct pfm_reader *);
196 static int read_version_data (struct pfm_reader *, struct pfm_read_info *);
197 static int read_variables (struct pfm_reader *, struct dictionary *);
198 static int read_value_label (struct pfm_reader *, struct dictionary *);
199 void dump_dictionary (struct dictionary *);
201 /* Reads the dictionary from the portable file with handle FH and
202 stores it in *DICT. This dictionary may be modified in order to
203 rename, reorder, and delete variables, etc. */
205 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
206 struct pfm_read_info *info)
208 struct pfm_reader *r = NULL;
210 *dict = dict_create ();
211 if (!fh_open (fh, "portable file", "rs"))
214 /* Create and initialize reader. */
215 r = xmalloc (sizeof *r);
217 r->file = fopen (handle_get_filename (r->fh), "rb");
218 r->weight_index = -1;
225 /* Check that file open succeeded, prime reading. */
228 msg (ME, _("An error occurred while opening \"%s\" for reading "
229 "as a portable file: %s."),
230 handle_get_filename (r->fh), strerror (errno));
238 /* Read header, version, date info, product id, variables. */
240 || !read_version_data (r, info)
241 || !read_variables (r, *dict))
244 /* Read value labels. */
245 while (match (77 /* D */))
246 if (!read_value_label (r, *dict))
249 /* Check that we've made it to the data. */
250 if (!match (79 /* F */))
251 lose ((r, _("Data record expected.")));
256 pfm_close_reader (r);
257 dict_destroy (*dict);
262 /* Read a floating point value and return its value, or
263 second_lowest_value on error. */
265 read_float (struct pfm_reader *r)
273 /* Skip leading spaces. */
274 while (match (126 /* space */))
277 if (match (137 /* * */))
279 advance (); /* Probably a dot (.) but doesn't appear to matter. */
282 else if (match (141 /* - */))
287 if (r->cc >= 64 /* 0 */ && r->cc <= 93 /* T */)
291 /* Make sure that multiplication by 30 will not overflow. */
292 if (num > DBL_MAX * (1. / 30.))
293 /* The value of the digit doesn't matter, since we have already
294 gotten as many digits as can be represented in a `double'.
295 This doesn't necessarily mean the result will overflow.
296 The exponent may reduce it to within range.
298 We just need to record that there was another
299 digit so that we can multiply by 30 later. */
302 num = (num * 30.0) + (r->cc - 64);
304 /* Keep track of the number of digits after the decimal point.
305 If we just divided by 30 here, we would lose precision. */
309 else if (!got_dot && r->cc == 127 /* . */)
310 /* Record that we have found the decimal point. */
313 /* Any other character terminates the number. */
320 lose ((r, "Number expected."));
322 if (r->cc == 130 /* + */ || r->cc == 141 /* - */)
324 /* Get the exponent. */
326 int neg_exp = r->cc == 141 /* - */;
332 if (r->cc < 64 /* 0 */ || r->cc > 93 /* T */)
335 if (exp > LONG_MAX / 30)
337 exp = exp * 30 + (r->cc - 64);
340 /* We don't check whether there were actually any digits, but we
347 if (!match (142 /* / */))
348 lose ((r, _("Missing numeric terminator.")));
350 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
353 num *= pow (30.0, (double) exponent);
354 else if (exponent > 0)
356 if (num > DBL_MAX * pow (30.0, (double) -exponent))
358 num *= pow (30.0, (double) exponent);
368 return -DBL_MAX / 10.;
373 return second_lowest_value;
376 /* Read an integer and return its value, or NOT_INT on failure. */
378 read_int (struct pfm_reader *r)
380 double f = read_float (r);
382 if (f == second_lowest_value)
384 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
385 lose ((r, _("Bad integer format.")));
392 /* Reads a string and returns its value in a static buffer, or NULL on
393 failure. The buffer can be deallocated by calling with a NULL
395 static unsigned char *
396 read_string (struct pfm_reader *r)
407 else if (buf == NULL)
413 if (n < 0 || n > 255)
414 lose ((r, _("Bad string length %d."), n));
419 for (i = 0; i < n; i++)
433 /* Reads the 464-byte file header. */
435 read_header (struct pfm_reader *r)
437 /* For now at least, just ignore the vanity splash strings. */
441 for (i = 0; i < 200; i++)
446 unsigned char src[256];
450 for (i = 0; i < 256; i++)
452 src[i] = (unsigned char) r->cc;
456 for (i = 0; i < 256; i++)
459 /* 0 is used to mark untranslatable characters, so we have to mark
461 trans_temp[src[64]] = 64;
462 for (i = 0; i < 256; i++)
463 if (trans_temp[src[i]] == -1)
464 trans_temp[src[i]] = i;
466 r->trans = xmalloc (256);
467 for (i = 0; i < 256; i++)
468 r->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
470 /* Translate the input buffer. */
471 for (i = 0; i < 80; i++)
472 r->buf[i] = r->trans[r->buf[i]];
473 r->cc = r->trans[r->cc];
477 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
480 for (i = 0; i < 8; i++)
482 lose ((r, "Missing SPSSPORT signature."));
491 /* Reads the version and date info record, as well as product and
492 subproduct identification records if present. */
494 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
497 if (!match (74 /* A */))
498 lose ((r, "Unrecognized version code %d.", r->cc));
502 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
503 char *date = read_string (r);
508 if (strlen (date) != 8)
509 lose ((r, _("Bad date string length %d."), strlen (date)));
510 for (i = 0; i < 8; i++)
512 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
513 lose ((r, _("Bad character in date.")));
515 info->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
519 info->creation_date[2] = info->creation_date[5] = ' ';
520 info->creation_date[10] = 0;
526 static const int map[] = {0, 1, 3, 4, 6, 7};
527 char *time = read_string (r);
532 if (strlen (time) != 6)
533 lose ((r, _("Bad time string length %d."), strlen (time)));
534 for (i = 0; i < 6; i++)
536 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
537 lose ((r, _("Bad character in time.")));
539 info->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
543 info->creation_time[2] = info->creation_time[5] = ' ';
544 info->creation_time[8] = 0;
549 if (match (65 /* 1 */))
553 product = read_string (r);
557 strncpy (info->product, product, 61);
560 info->product[0] = 0;
563 if (match (67 /* 3 */))
567 subproduct = read_string (r);
568 if (subproduct == NULL)
571 strncpy (info->subproduct, subproduct, 61);
574 info->subproduct[0] = 0;
582 convert_format (struct pfm_reader *r, int fmt[3], struct fmt_spec *v,
585 v->type = translate_fmt (fmt[0]);
587 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
591 /* FIXME? Should verify the resulting specifier more thoroughly. */
594 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
595 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
596 lose ((r, _("%s variable %s has %s format specifier %s."),
597 vv->type == ALPHA ? _("String") : _("Numeric"),
599 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
600 formats[v->type].name));
607 /* Translation table from SPSS character code to this computer's
608 native character code (which is probably ASCII). */
609 static const unsigned char spss2ascii[256] =
612 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
613 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
617 /* Translate string S into ASCII. */
622 *s = spss2ascii[(unsigned char) *s];
625 static int parse_value (struct pfm_reader *, union value *, struct variable *);
627 /* Read information on all the variables. */
629 read_variables (struct pfm_reader *r, struct dictionary *dict)
631 char *weight_name = NULL;
634 if (!match (68 /* 4 */))
635 lose ((r, _("Expected variable count record.")));
637 r->var_cnt = read_int (r);
638 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
639 lose ((r, _("Invalid number of variables %d."), r->var_cnt));
640 r->widths = xmalloc (sizeof *r->widths * r->var_cnt);
642 /* Purpose of this value is unknown. It is typically 161. */
644 int x = read_int (r);
649 corrupt_msg (r, _("Unexpected flag value %d."), x);
652 if (match (70 /* 6 */))
654 weight_name = read_string (r);
658 asciify (weight_name);
659 if (strlen (weight_name) > 8)
661 corrupt_msg (r, _("Weight variable name (%s) truncated."),
663 weight_name[8] = '\0';
667 for (i = 0; i < r->var_cnt; i++)
675 if (!match (71 /* 7 */))
676 lose ((r, _("Expected variable record.")));
678 width = read_int (r);
679 if (width == NOT_INT)
682 lose ((r, _("Invalid variable width %d."), width));
683 r->widths[i] = width;
685 name = read_string (r);
688 for (j = 0; j < 6; j++)
690 fmt[j] = read_int (r);
691 if (fmt[j] == NOT_INT)
695 /* Verify first character of variable name.
697 Weirdly enough, there is no # character in the SPSS portable
698 character set, so we can't check for it. */
699 if (strlen (name) > 8)
700 lose ((r, _("position %d: Variable name has %u characters."),
702 if ((name[0] < 74 /* A */ || name[0] > 125 /* Z */)
703 && name[0] != 152 /* @ */)
704 lose ((r, _("position %d: Variable name begins with invalid "
706 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
708 corrupt_msg (r, _("position %d: Variable name begins with "
709 "lowercase letter %c."),
710 i, name[0] - 100 + 'a');
711 name[0] -= 26 /* a - A */;
714 /* Verify remaining characters of variable name. */
715 for (j = 1; j < (int) strlen (name); j++)
719 if (c >= 100 /* a */ && c <= 125 /* z */)
721 corrupt_msg (r, _("position %d: Variable name character %d "
722 "is lowercase letter %c."),
723 i, j + 1, c - 100 + 'a');
724 name[j] -= 26 /* z - Z */;
726 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
727 || c == 127 /* . */ || c == 152 /* @ */
728 || c == 136 /* $ */ || c == 146 /* _ */)
731 lose ((r, _("position %d: character `\\%03o' is not "
732 "valid in a variable name."), i, c));
736 if (width < 0 || width > 255)
737 lose ((r, "Bad width %d for variable %s.", width, name));
739 v = dict_create_var (dict, name, width);
741 lose ((r, _("Duplicate variable name %s."), name));
742 if (!convert_format (r, &fmt[0], &v->print, v))
744 if (!convert_format (r, &fmt[3], &v->write, v))
747 /* Range missing values. */
748 if (match (75 /* B */))
750 v->miss_type = MISSING_RANGE;
751 if (!parse_value (r, &v->missing[0], v)
752 || !parse_value (r, &v->missing[1], v))
755 else if (match (74 /* A */))
757 v->miss_type = MISSING_HIGH;
758 if (!parse_value (r, &v->missing[0], v))
761 else if (match (73 /* 9 */))
763 v->miss_type = MISSING_LOW;
764 if (!parse_value (r, &v->missing[0], v))
768 /* Single missing values. */
769 while (match (72 /* 8 */))
771 static const int map_next[MISSING_COUNT] =
773 MISSING_1, MISSING_2, MISSING_3, -1,
774 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
778 static const int map_ofs[MISSING_COUNT] =
780 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
783 v->miss_type = map_next[v->miss_type];
784 if (v->miss_type == -1)
785 lose ((r, _("Bad missing values for %s."), v->name));
787 assert (map_ofs[v->miss_type] != -1);
788 if (!parse_value (r, &v->missing[map_ofs[v->miss_type]], v))
792 if (match (76 /* C */))
794 char *label = read_string (r);
799 v->label = xstrdup (label);
804 if (weight_name != NULL)
806 struct variable *weight_var = dict_lookup_var (dict, weight_name);
807 if (weight_var == NULL)
808 lose ((r, _("Weighting variable %s not present in dictionary."),
812 dict_set_weight (dict, weight_var);
822 /* Parse a value for variable VV into value V. Returns success. */
824 parse_value (struct pfm_reader *r, union value *v, struct variable *vv)
826 if (vv->type == ALPHA)
828 char *mv = read_string (r);
834 strncpy (v->s, mv, 8);
835 for (j = 0; j < 8; j++)
837 v->s[j] = spss2ascii[v->s[j]];
839 /* Value labels are always padded with spaces. */
844 v->f = read_float (r);
845 if (v->f == second_lowest_value)
852 /* Parse a value label record and return success. */
854 read_value_label (struct pfm_reader *r, struct dictionary *dict)
869 v = xmalloc (sizeof *v * nv);
870 for (i = 0; i < nv; i++)
872 char *name = read_string (r);
877 v[i] = dict_lookup_var (dict, name);
879 lose ((r, _("Unknown variable %s while parsing value labels."), name));
881 if (v[0]->width != v[i]->width)
882 lose ((r, _("Cannot assign value labels to %s and %s, which "
883 "have different variable types or widths."),
884 v[0]->name, v[i]->name));
887 n_labels = read_int (r);
888 if (n_labels == NOT_INT)
891 for (i = 0; i < n_labels; i++)
898 if (!parse_value (r, &val, v[0]))
901 label = read_string (r);
906 /* Assign the value_label's to each variable. */
907 for (j = 0; j < nv; j++)
909 struct variable *var = v[j];
911 if (!val_labs_replace (var->val_labs, val, label))
914 if (var->type == NUMERIC)
915 lose ((r, _("Duplicate label for value %g for variable %s."),
918 lose ((r, _("Duplicate label for value `%.*s' for variable %s."),
919 var->width, val.s, var->name));
930 /* Reads one case from portable file R into C. Returns nonzero
931 only if successful. */
933 pfm_read_case (struct pfm_reader *r, struct ccase *c)
938 /* Check for end of file. */
939 if (r->cc == 99 /* Z */)
943 for (i = 0; i < r->var_cnt; i++)
945 int width = r->widths[i];
949 double f = read_float (r);
950 if (f == second_lowest_value)
953 case_data_rw (c, idx)->f = f;
958 char *s = read_string (r);
963 st_bare_pad_copy (case_data_rw (c, idx)->s, s, width);
964 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
971 lose ((r, _("End of file midway through case.")));