1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
4 Code for parsing floating-point numbers adapted from GNU C
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
33 #include "dictionary.h"
34 #include "file-handle.h"
41 #include "value-labels.h"
44 #include "debug-print.h"
46 /* Portable file reader: the state shared by the reading routines in
   this file (file handle and stream, character translation table,
   variable layout, and the current input line). */
49 struct file_handle *fh; /* File handle. */
50 FILE *file; /* File stream. */
52 int weight_index; /* 0-based index of weight variable, or -1. */
54 unsigned char *trans; /* 256-byte character set translation table. */
56 int var_cnt; /* Number of variables. */
57 int *widths; /* Variable widths, 0 for numeric. */
58 int value_cnt; /* Number of `value's per case. */
/* fill_buf() reads 80 bytes into BUF and maps them through TRANS;
   read_char() asks for a new line once BP reaches &buf[80]. */
60 unsigned char buf[83]; /* Input buffer. */
61 unsigned char *bp; /* Buffer pointer. */
62 int cc; /* Current character. */
66 corrupt_msg (struct pfm_reader *r, const char *format,...)
69 /* Displays a corruption error for reader R.  FORMAT and the
   arguments after it are a printf-style message.  The error title
   names the portable file and an offset within it. */
71 corrupt_msg (struct pfm_reader *r, const char *format, ...)
/* Format the caller's message; vsnprintf is told to produce at most
   1024 bytes, so an over-long message is truncated. */
78 va_start (args, format);
79 vsnprintf (buf, 1024, format, args);
89 getl_location (&e.where.filename, &e.where.line_number);
90 filename = handle_get_filename (r->fh);
/* The title buffer is the file name plus 80 bytes of slack for the
   fixed text and the offset digits.
   NOTE(review): sprintf does not bound the write, so the slack must
   also cover the translated title text -- confirm. */
91 e.title = title = local_alloc (strlen (filename) + 80);
/* Offset = stream position minus the part of the buffered line that
   has not been consumed yet (r->bp - r->buf is the consumed part).
   The 82 is presumably 80 data bytes plus a 2-byte line end -- TODO
   confirm. */
92 sprintf (title, _("portable file %s corrupt at offset %ld: "),
93 filename, ftell (r->file) - (82 - (long) (r->bp - r->buf)));
104 /* Closes a portable file after we're done with it: releases the
   file handle of reader R and closes its stream, reporting a failed
   close. */
106 pfm_close_reader (struct pfm_reader *r)
/* Same type and mode strings that pfm_open_reader passes to
   fh_open. */
112 fh_close (r->fh, "portable file", "rs");
/* A failed fclose is reported with the file name and the errno
   text. */
113 if (fclose (r->file) == EOF)
114 msg (ME, _("%s: Closing portable file: %s."),
115 handle_get_filename (r->fh), strerror (errno));
120 /* Displays the message X with corrupt_msg, then jumps to the error
128 /* Read an 80-character line into handle H's buffer. Return
131 fill_buf (struct pfm_reader *r)
/* A line is exactly 80 data bytes; a shorter read is reported as an
   unexpected end of file. */
133 if (80 != fread (r->buf, 1, 80, r->file))
134 lose ((r, _("Unexpected end of file.")));
136 /* PORTME: line ends. */
/* C (assigned on a line not visible here, presumably the byte read
   after the data) must be a line terminator. */
141 if (c != '\n' && c != '\r')
142 lose ((r, _("Bad line end.")));
/* A second terminator check; presumably for the other half of a
   two-byte line end -- TODO confirm. */
145 if (c != '\n' && c != '\r')
/* Map each of the 80 data bytes through the translation table. */
153 for (i = 0; i < 80; i++)
154 r->buf[i] = r->trans[r->buf[i]];
165 /* Reads a single character from reader R into its current
   character (named cur_char in an older version of this comment;
   presumably r->cc).  Returns success. */
167 read_char (struct pfm_reader *r)
/* fill_buf is only called (right-hand side of &&) once BP has
   reached the end of the 80 data bytes of the current line. */
169 if (r->bp >= &r->buf[80] && !fill_buf (r))
175 /* Advance a single character. */
178 if (!read_char (r)) \
182 /* Skip a single character if present, and return whether it was
185 skip_char (struct pfm_reader *r, int c)
196 /* Skip a single character if present, and return whether it was
198 #define match(C) skip_char (r, C) /* Shorthand for skip_char; the expansion uses a variable named `r' from the calling scope. */
200 static int read_header (struct pfm_reader *);
201 static int read_version_data (struct pfm_reader *, struct pfm_read_info *);
202 static int read_variables (struct pfm_reader *, struct dictionary *);
203 static int read_value_label (struct pfm_reader *, struct dictionary *);
204 void dump_dictionary (struct dictionary *);
206 /* Reads the dictionary from the file with handle FH and stores it in
207 *DICT as a newly created dictionary structure. This dictionary may
208 be modified in order to rename, reorder, and delete variables, etc. */
210 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
211 struct pfm_read_info *info)
/* INFO is handed to read_version_data.  The return statements are
   not visible here; R is presumably what is returned -- TODO
   confirm. */
213 struct pfm_reader *r = NULL;
/* The dictionary is created before anything is read; the cleanup at
   the end of this function destroys it again. */
215 *dict = dict_create ();
216 if (!fh_open (fh, "portable file", "rs"))
219 /* Create and initialize reader. */
220 r = xmalloc (sizeof *r);
222 r->file = fopen (handle_get_filename (r->fh), "rb");
223 r->weight_index = -1;
230 /* Check that file open succeeded, prime reading. */
233 msg (ME, _("An error occurred while opening \"%s\" for reading "
234 "as a portable file: %s."),
235 handle_get_filename (r->fh), strerror (errno));
243 /* Read header, version, date info, product id, variables. */
245 || !read_version_data (r, info)
246 || !read_variables (r, *dict))
249 /* Read value labels. */
/* Each value label record is introduced by code 77 (D). */
250 while (match (77 /* D */))
251 if (!read_value_label (r, *dict))
254 /* Check that we've made it to the data. */
255 if (!match (79 /* F */))
256 lose ((r, _("Data record expected.")));
/* Cleanup (presumably the error path; its label is not visible
   here): close the reader and discard the dictionary. */
261 pfm_close_reader (r);
262 dict_destroy (*dict);
267 /* Read a floating point value and return its value, or
268 second_lowest_value on error. */
270 read_float (struct pfm_reader *r)
/* Numbers are written in base 30: codes 64..93 are the digits 0..T,
   code 127 (.) is the radix point, code 130 (+) or 141 (-) starts a
   base-30 exponent, and code 142 (/) terminates the number. */
278 /* Skip leading spaces. */
279 while (match (126 /* space */))
/* A leading code 137 (*) is handled specially; what is returned for
   it is not visible here. */
282 if (match (137 /* * */))
284 advance (); /* Probably a dot (.) but doesn't appear to matter. */
287 else if (match (141 /* - */))
292 if (r->cc >= 64 /* 0 */ && r->cc <= 93 /* T */)
296 /* Make sure that multiplication by 30 will not overflow. */
297 if (num > DBL_MAX * (1. / 30.))
298 /* The value of the digit doesn't matter, since we have already
299 gotten as many digits as can be represented in a `double'.
300 This doesn't necessarily mean the result will overflow.
301 The exponent may reduce it to within range.
303 We just need to record that there was another
304 digit so that we can multiply by 30 later. */
307 num = (num * 30.0) + (r->cc - 64);
309 /* Keep track of the number of digits after the decimal point.
310 If we just divided by 30 here, we would lose precision. */
314 else if (!got_dot && r->cc == 127 /* . */)
315 /* Record that we have found the decimal point. */
318 /* Any other character terminates the number. */
325 lose ((r, "Number expected."));
327 if (r->cc == 130 /* + */ || r->cc == 141 /* - */)
329 /* Get the exponent. */
331 int neg_exp = r->cc == 141 /* - */;
337 if (r->cc < 64 /* 0 */ || r->cc > 93 /* T */)
/* Guard the accumulation below: exp * 30 must stay within
   LONG_MAX. */
340 if (exp > LONG_MAX / 30)
342 exp = exp * 30 + (r->cc - 64);
345 /* We don't check whether there were actually any digits, but we
352 if (!match (142 /* / */))
353 lose ((r, _("Missing numeric terminator.")));
355 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
358 num *= pow (30.0, (double) exponent);
359 else if (exponent > 0)
/* Equivalent to asking whether num * 30^exponent would exceed
   DBL_MAX, without performing the overflowing multiplication. */
361 if (num > DBL_MAX * pow (30.0, (double) -exponent))
363 num *= pow (30.0, (double) exponent);
373 return -DBL_MAX / 10.;
378 return second_lowest_value;
381 /* Read an integer and return its value, or NOT_INT on failure.
   The number is read with read_float and then checked to be
   integral and within the range of an int. */
383 read_int (struct pfm_reader *r)
385 double f = read_float (r);
/* second_lowest_value is read_float's error indication. */
387 if (f == second_lowest_value)
/* Reject fractional values and anything outside the open interval
   (INT_MIN, INT_MAX); the two limits themselves are rejected too. */
389 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
390 lose ((r, _("Bad integer format.")));
397 /* Reads a string and returns its value in a static buffer, or NULL on
398 failure. The buffer can be deallocated by calling with a NULL
400 static unsigned char *
401 read_string (struct pfm_reader *r)
412 else if (buf == NULL)
/* N is the string length (read on a line not visible here); only
   lengths 0 through 255 are accepted. */
418 if (n < 0 || n > 255)
419 lose ((r, _("Bad string length %d."), n));
/* One iteration per character of the string. */
424 for (i = 0; i < n; i++)
438 /* Reads the 464-byte file header: 200 characters of splash strings,
   a 256-character translation table, and an 8-character
   signature. */
440 read_header (struct pfm_reader *r)
442 /* For now at least, just ignore the vanity splash strings. */
446 for (i = 0; i < 200; i++)
/* SRC[i] receives the byte that this file uses for portable
   character code i. */
451 unsigned char src[256];
455 for (i = 0; i < 256; i++)
457 src[i] = (unsigned char) r->cc;
/* Loop body not visible here; entries of trans_temp equal to -1 are
   treated as unassigned below. */
461 for (i = 0; i < 256; i++)
464 /* 0 is used to mark untranslatable characters, so we have to mark
466 trans_temp[src[64]] = 64; /* Code 64 is the digit 0; it is claimed before the general pass. */
/* Invert SRC: the first code that uses a given file byte keeps
   it. */
467 for (i = 0; i < 256; i++)
468 if (trans_temp[src[i]] == -1)
469 trans_temp[src[i]] = i;
/* Build the final table; file bytes that no code uses map to 0. */
471 r->trans = xmalloc (256);
472 for (i = 0; i < 256; i++)
473 r->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
475 /* Translate the input buffer. */
476 for (i = 0; i < 80; i++)
477 r->buf[i] = r->trans[r->buf[i]];
478 r->cc = r->trans[r->cc];
/* The letters S P S S P O R T as portable character codes (A is
   74). */
482 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
485 for (i = 0; i < 8; i++)
487 lose ((r, "Missing SPSSPORT signature."));
496 /* Reads the version and date info record, as well as product and
497 subproduct identification records if present.  The creation date,
   creation time, product and subproduct strings are stored in
   INFO. */
499 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
/* The only accepted version code is 74 (A). */
502 if (!match (74 /* A */))
503 lose ((r, "Unrecognized version code %d.", r->cc));
/* Date: 8 digits.  MAP sends digits 0-3 to positions 6-9, digits 4-5
   to positions 3-4 and digits 6-7 to positions 0-1; positions 2 and 5
   become spaces.  Presumably YYYYMMDD in the file becoming
   "DD MM YYYY" -- TODO confirm. */
507 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
508 char *date = read_string (r);
513 if (strlen (date) != 8)
/* strlen returns size_t but the format says %d; cast so the argument
   matches the conversion (read_string limits lengths to 255). */
514 lose ((r, _("Bad date string length %d."), (int) strlen (date)));
515 for (i = 0; i < 8; i++)
517 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
518 lose ((r, _("Bad character in date.")));
/* Convert the portable digit code to the native digit character. */
520 info->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
524 info->creation_date[2] = info->creation_date[5] = ' ';
525 info->creation_date[10] = 0;
/* Time: 6 digits placed at positions 0-1, 3-4 and 6-7 with spaces at
   2 and 5.  Presumably HHMMSS becoming "HH MM SS" -- TODO confirm. */
531 static const int map[] = {0, 1, 3, 4, 6, 7};
532 char *time = read_string (r);
537 if (strlen (time) != 6)
/* Same size_t versus %d mismatch as for the date above. */
538 lose ((r, _("Bad time string length %d."), (int) strlen (time)));
539 for (i = 0; i < 6; i++)
541 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
542 lose ((r, _("Bad character in time.")));
544 info->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
548 info->creation_time[2] = info->creation_time[5] = ' ';
549 info->creation_time[8] = 0;
/* Optional product record, introduced by code 65 (1). */
554 if (match (65 /* 1 */))
558 product = read_string (r);
/* NOTE(review): strncpy leaves the destination unterminated when the
   source has 61 or more characters; confirm that a terminator is
   stored on a line not visible here. */
562 strncpy (info->product, product, 61);
565 info->product[0] = 0;
/* Optional subproduct record, introduced by code 67 (3). */
568 if (match (67 /* 3 */))
572 subproduct = read_string (r);
573 if (subproduct == NULL)
576 strncpy (info->subproduct, subproduct, 61);
579 info->subproduct[0] = 0;
587 convert_format (struct pfm_reader *r, int fmt[3], struct fmt_spec *v,
/* FMT[0] is the format type code read from the file; translate_fmt
   converts it to the value stored in V->type. */
590 v->type = translate_fmt (fmt[0]);
592 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
596 /* FIXME? Should verify the resulting specifier more thoroughly. */
599 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
/* The variable's type and the format's category must agree: the XOR
   is true exactly when one is string and the other is not. */
600 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
601 lose ((r, _("%s variable %s has %s format specifier %s."),
602 vv->type == ALPHA ? _("String") : _("Numeric"),
604 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
605 formats[v->type].name));
612 /* Translation table from SPSS character code to this computer's
613 native character code (which is probably ASCII). */
614 static const unsigned char spss2ascii[256] =
/* The code annotations used throughout this file (64 is 0, 74 is A,
   100 is a, 126 is space, 127 is .) match the next row if it starts
   at index 64, and the row after it then starts at index 128. */
617 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
618 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
622 /* Translate string S into ASCII, in place, by looking each
   character up in spss2ascii. */
627 *s = spss2ascii[(unsigned char) *s]; /* The cast keeps bytes of 128 and above from becoming negative indexes where char is signed. */
630 static int parse_value (struct pfm_reader *, union value *, struct variable *);
632 /* Read information on all the variables: the variable count, an
   optional weight variable name, and then one record per variable
   (width, name, print and write formats, missing values, label).
   Variables are created in DICT. */
634 read_variables (struct pfm_reader *r, struct dictionary *dict)
636 char *weight_name = NULL;
/* The variable count record is introduced by code 68 (4). */
639 if (!match (68 /* 4 */))
640 lose ((r, _("Expected variable count record.")));
642 r->var_cnt = read_int (r);
643 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
644 lose ((r, _("Invalid number of variables %d."), r->var_cnt));
/* One width per variable; pfm_read_case consults these later. */
645 r->widths = xmalloc (sizeof *r->widths * r->var_cnt);
647 /* Purpose of this value is unknown. It is typically 161. */
649 int x = read_int (r);
654 corrupt_msg (r, _("Unexpected flag value %d."), x);
/* Optional weight variable name, introduced by code 70 (6).
   NOTE(review): read_string is documented as returning a static
   buffer, and it is called again for every variable below; confirm
   that the name is copied before it is used at the end of this
   function. */
657 if (match (70 /* 6 */))
659 weight_name = read_string (r);
663 asciify (weight_name);
664 if (strlen (weight_name) > 8)
666 corrupt_msg (r, _("Weight variable name (%s) truncated."),
668 weight_name[8] = '\0';
/* One variable record per iteration. */
672 for (i = 0; i < r->var_cnt; i++)
680 if (!match (71 /* 7 */))
681 lose ((r, _("Expected variable record.")));
683 width = read_int (r);
684 if (width == NOT_INT)
687 lose ((r, _("Invalid variable width %d."), width));
688 r->widths[i] = width;
690 name = read_string (r);
/* Six format integers: fmt[0..2] are passed to convert_format for
   the print format and fmt[3..5] for the write format. */
693 for (j = 0; j < 6; j++)
695 fmt[j] = read_int (r);
696 if (fmt[j] == NOT_INT)
700 /* Verify first character of variable name.
702 Weirdly enough, there is no # character in the SPSS portable
703 character set, so we can't check for it. */
704 if (strlen (name) > 8)
705 lose ((r, _("position %d: Variable name has %u characters."),
707 if ((name[0] < 74 /* A */ || name[0] > 125 /* z */)
708 && name[0] != 152 /* @ */)
709 lose ((r, _("position %d: Variable name begins with invalid "
/* A lowercase first letter is reported and then converted to
   uppercase (the lowercase codes sit 26 above the uppercase ones). */
711 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
713 corrupt_msg (r, _("position %d: Variable name begins with "
714 "lowercase letter %c."),
715 i, name[0] - 100 + 'a');
716 name[0] -= 26 /* a - A */;
719 /* Verify remaining characters of variable name. */
720 for (j = 1; j < (int) strlen (name); j++)
724 if (c >= 100 /* a */ && c <= 125 /* z */)
726 corrupt_msg (r, _("position %d: Variable name character %d "
727 "is lowercase letter %c."),
728 i, j + 1, c - 100 + 'a');
729 name[j] -= 26 /* z - Z */;
/* Codes 64..99 are the digits and the uppercase letters. */
731 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
732 || c == 127 /* . */ || c == 152 /* @ */
733 || c == 136 /* $ */ || c == 146 /* _ */)
736 lose ((r, _("position %d: character `\\%03o' is not "
737 "valid in a variable name."), i, c));
741 if (width < 0 || width > 255)
742 lose ((r, "Bad width %d for variable %s.", width, name));
744 v = dict_create_var (dict, name, width);
746 lose ((r, _("Duplicate variable name %s."), name));
747 if (!convert_format (r, &fmt[0], &v->print, v))
749 if (!convert_format (r, &fmt[3], &v->write, v))
752 /* Range missing values. */
/* Code 75 (B) is followed by two values, codes 74 (A) and 73 (9) by
   one value each. */
753 if (match (75 /* B */))
755 v->miss_type = MISSING_RANGE;
756 if (!parse_value (r, &v->missing[0], v)
757 || !parse_value (r, &v->missing[1], v))
760 else if (match (74 /* A */))
762 v->miss_type = MISSING_HIGH;
763 if (!parse_value (r, &v->missing[0], v))
766 else if (match (73 /* 9 */))
768 v->miss_type = MISSING_LOW;
769 if (!parse_value (r, &v->missing[0], v))
773 /* Single missing values. */
774 while (match (72 /* 8 */))
/* map_next gives the missing type that results from adding one more
   single value to the current type (-1 if none is allowed); map_ofs
   gives the slot of v->missing[] that receives the new value. */
776 static const int map_next[MISSING_COUNT] =
778 MISSING_1, MISSING_2, MISSING_3, -1,
779 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
783 static const int map_ofs[MISSING_COUNT] =
785 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
788 v->miss_type = map_next[v->miss_type];
789 if (v->miss_type == -1)
790 lose ((r, _("Bad missing values for %s."), v->name));
792 assert (map_ofs[v->miss_type] != -1);
793 if (!parse_value (r, &v->missing[map_ofs[v->miss_type]], v))
/* Optional variable label, introduced by code 76 (C); the variable
   keeps its own copy. */
797 if (match (76 /* C */))
799 char *label = read_string (r);
804 v->label = xstrdup (label);
/* If a weight variable was named, it must be in the dictionary. */
809 if (weight_name != NULL)
811 struct variable *weight_var = dict_lookup_var (dict, weight_name);
812 if (weight_var == NULL)
813 lose ((r, _("Weighting variable %s not present in dictionary."),
817 dict_set_weight (dict, weight_var);
827 /* Parse a value for variable VV into value V. Returns success.
   String variables read a string into V->s; other variables read a
   number into V->f. */
829 parse_value (struct pfm_reader *r, union value *v, struct variable *vv)
831 if (vv->type == ALPHA)
833 char *mv = read_string (r);
/* At most 8 bytes are copied and then examined one by one below. */
839 strncpy (v->s, mv, 8);
840 for (j = 0; j < 8; j++)
/* NOTE(review): unlike asciify, the index has no (unsigned char)
   cast; this is only safe if V->s has an unsigned element type --
   confirm. */
842 v->s[j] = spss2ascii[v->s[j]];
844 /* Value labels are always padded with spaces. */
849 v->f = read_float (r);
/* second_lowest_value is read_float's error indication. */
850 if (v->f == second_lowest_value)
857 /* Parse a value label record and return success.  The record
   names NV variables, which are looked up in DICT, followed by a
   count of labels and that many (value, label) pairs; each pair is
   applied to every named variable. */
859 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* V holds the NV variables that this record applies to. */
874 v = xmalloc (sizeof *v * nv);
875 for (i = 0; i < nv; i++)
877 char *name = read_string (r);
882 v[i] = dict_lookup_var (dict, name);
884 lose ((r, _("Unknown variable %s while parsing value labels."), name));
/* Every variable must have the same width as the first one, because
   values are parsed once, against v[0]. */
886 if (v[0]->width != v[i]->width)
887 lose ((r, _("Cannot assign value labels to %s and %s, which "
888 "have different variable types or widths."),
889 v[0]->name, v[i]->name));
892 n_labels = read_int (r);
893 if (n_labels == NOT_INT)
896 for (i = 0; i < n_labels; i++)
903 if (!parse_value (r, &val, v[0]))
906 label = read_string (r);
911 /* Assign the value_label's to each variable. */
912 for (j = 0; j < nv; j++)
914 struct variable *var = v[j];
916 if (!val_labs_replace (var->val_labs, val, label))
/* The duplicate-label message shows the value as a number or as a
   string, depending on the variable's type. */
919 if (var->type == NUMERIC)
920 lose ((r, _("Duplicate label for value %g for variable %s."),
923 lose ((r, _("Duplicate label for value `%.*s' for variable %s."),
924 var->width, val.s, var->name));
935 /* Reads one case from portable file R into C. Returns nonzero
936 only if successful.  One value is read per variable, using the
   widths recorded in R->widths. */
938 pfm_read_case (struct pfm_reader *r, struct ccase *c)
943 /* Check for end of file. */
944 if (r->cc == 99 /* Z */)
948 for (i = 0; i < r->var_cnt; i++)
/* Per the reader structure, a width of 0 means a numeric
   variable. */
950 int width = r->widths[i];
954 double f = read_float (r);
/* second_lowest_value is read_float's error indication. */
955 if (f == second_lowest_value)
958 case_data_rw (c, idx)->f = f;
963 char *s = read_string (r);
/* Copy the string into the case and advance IDX by the number of
   MAX_SHORT_STRING-sized values that WIDTH occupies, rounded up. */
968 st_bare_pad_copy (case_data_rw (c, idx)->s, s, width);
969 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
976 lose ((r, _("End of file midway through case.")));