1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
4 Code for parsing floating-point numbers adapted from GNU C
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
33 #include "dictionary.h"
34 #include "file-handle.h"
41 #include "value-labels.h"
44 #include "debug-print.h"
46 /* Portable file reader. */
49 struct file_handle *fh; /* File handle. */
50 FILE *file; /* File stream. */
52 int weight_index; /* 0-based index of weight variable, or -1. */
54 unsigned char *trans; /* 256-byte character set translation table. */
56 int var_cnt; /* Number of variables. */
57 int *widths; /* Variable widths, 0 for numeric. */
58 int value_cnt; /* Number of `value's per case. */
60 unsigned char buf[83]; /* Input buffer. */
61 unsigned char *bp; /* Buffer pointer. */
62 int cc; /* Current character. */
66 corrupt_msg (struct pfm_reader *r, const char *format,...)
69 /* Displays a corruption error. */
71 corrupt_msg (struct pfm_reader *r, const char *format, ...)
78 va_start (args, format);
79 vsnprintf (buf, 1024, format, args);
89 getl_location (&e.where.filename, &e.where.line_number);
90 filename = handle_get_filename (r->fh);
91 e.title = title = local_alloc (strlen (filename) + 80);
92 sprintf (title, _("portable file %s corrupt at offset %ld: "),
93 filename, ftell (r->file) - (82 - (long) (r->bp - r->buf)));
104 /* Closes a portable file after we're done with it. */
106 pfm_close_reader (struct pfm_reader *r)
112 fh_close (r->fh, "portable file", "rs");
113 if (fclose (r->file) == EOF)
114 msg (ME, _("%s: Closing portable file: %s."),
115 handle_get_filename (r->fh), strerror (errno));
121 /* Displays the message X with corrupt_msg, then jumps to the error
129 /* Read an 80-character line into reader R's buffer. Return
132 fill_buf (struct pfm_reader *r)
134 if (80 != fread (r->buf, 1, 80, r->file))
135 lose ((r, _("Unexpected end of file.")));
137 /* PORTME: line ends. */
142 if (c != '\n' && c != '\r')
143 lose ((r, _("Bad line end.")));
146 if (c != '\n' && c != '\r')
154 for (i = 0; i < 80; i++)
155 r->buf[i] = r->trans[r->buf[i]];
166 /* Read a single character into R's current character (presumably
    the cc member of struct pfm_reader; no cur_char is visible in this
    excerpt). Once the buffer pointer has reached the end of the
    80-byte line, the buffer is refilled with fill_buf first, and
    failure to refill makes the read fail. Return success. */
168 read_char (struct pfm_reader *r)
170 if (r->bp >= &r->buf[80] && !fill_buf (r))
176 /* Advance a single character. */
179 if (!read_char (r)) \
183 /* Skip a single character if present, and return whether it was
186 skip_char (struct pfm_reader *r, int c)
197 /* Skip a single character if present, and return whether it was
199 #define match(C) skip_char (r, C)
201 static int read_header (struct pfm_reader *);
202 static int read_version_data (struct pfm_reader *, struct pfm_read_info *);
203 static int read_variables (struct pfm_reader *, struct dictionary *);
204 static int read_value_label (struct pfm_reader *, struct dictionary *);
205 void dump_dictionary (struct dictionary *);
207 /* Reads the dictionary from the portable file with handle FH, and
208 stores it in *DICT, a dictionary structure created here with
209 dict_create. This dictionary may be modified in order to
    rename, reorder, and delete variables, etc. INFO is handed to
    read_version_data. The header, version data and variables are
    read first, then any value label records, and finally the start
    of the data record is checked for.
    NOTE(review): the calls that close the reader and destroy *DICT
    appear to belong to the error path, whose label is not visible in
    this excerpt; confirm against the full file. */
211 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
212 struct pfm_read_info *info)
214 struct pfm_reader *r = NULL;
216 *dict = dict_create ();
217 if (!fh_open (fh, "portable file", "rs"))
220 /* Create and initialize reader. */
221 r = xmalloc (sizeof *r);
223 r->file = fopen (handle_get_filename (r->fh), "rb");
224 r->weight_index = -1;
231 /* Check that file open succeeded, prime reading. */
234 msg (ME, _("An error occurred while opening \"%s\" for reading "
235 "as a portable file: %s."),
236 handle_get_filename (r->fh), strerror (errno));
244 /* Read header, version, date info, product id, variables. */
246 || !read_version_data (r, info)
247 || !read_variables (r, *dict))
250 /* Read value labels. Each value label record starts with code 77. */
251 while (match (77 /* D */))
252 if (!read_value_label (r, *dict))
255 /* Check that we've made it to the data, which starts with code 79. */
256 if (!match (79 /* F */))
257 lose ((r, _("Data record expected.")));
262 pfm_close_reader (r);
263 dict_destroy (*dict);
268 /* Read a floating point value and return its value, or
269 second_lowest_value on error.
    Numbers are written in base 30 using portable character codes:
    the digits are codes 64 (0) through 93 (T), code 127 is the
    radix point, an optional exponent is introduced by code 130 (+)
    or 141 (-) and is itself written in base 30, and code 142 (/)
    terminates the number. Leading spaces (code 126) are skipped. */
271 read_float (struct pfm_reader *r)
279 /* Skip leading spaces. */
280 while (match (126 /* space */))
283 if (match (137 /* * */))
285 advance (); /* Probably a dot (.) but doesn't appear to matter. */
288 else if (match (141 /* - */))
293 if (r->cc >= 64 /* 0 */ && r->cc <= 93 /* T */)
297 /* Make sure that multiplication by 30 will not overflow. */
298 if (num > DBL_MAX * (1. / 30.))
299 /* The value of the digit doesn't matter, since we have already
300 gotten as many digits as can be represented in a `double'.
301 This doesn't necessarily mean the result will overflow.
302 The exponent may reduce it to within range.
304 We just need to record that there was another
305 digit so that we can multiply by 30 later. */
308 num = (num * 30.0) + (r->cc - 64);
310 /* Keep track of the number of digits after the decimal point.
311 If we just divided by 30 here, we would lose precision. */
315 else if (!got_dot && r->cc == 127 /* . */)
316 /* Record that we have found the decimal point. */
319 /* Any other character terminates the number. */
326 lose ((r, "Number expected."));
328 if (r->cc == 130 /* + */ || r->cc == 141 /* - */)
330 /* Get the exponent. */
332 int neg_exp = r->cc == 141 /* - */;
338 if (r->cc < 64 /* 0 */ || r->cc > 93 /* T */)
341 if (exp > LONG_MAX / 30)
343 exp = exp * 30 + (r->cc - 64);
346 /* We don't check whether there were actually any digits, but we
353 if (!match (142 /* / */))
354 lose ((r, _("Missing numeric terminator.")));
356 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
359 num *= pow (30.0, (double) exponent);
360 else if (exponent > 0)
362 if (num > DBL_MAX * pow (30.0, (double) -exponent))
364 num *= pow (30.0, (double) exponent);
374 return -DBL_MAX / 10.;
379 return second_lowest_value;
382 /* Read an integer and return its value, or NOT_INT on failure. */
384 read_int (struct pfm_reader *r)
386 double f = read_float (r);
388 if (f == second_lowest_value)
390 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
391 lose ((r, _("Bad integer format.")));
398 /* Reads a string and returns its value in a static buffer, or NULL on
399 failure. The buffer can be deallocated by calling with a NULL
401 static unsigned char *
402 read_string (struct pfm_reader *r)
413 else if (buf == NULL)
419 if (n < 0 || n > 255)
420 lose ((r, _("Bad string length %d."), n));
425 for (i = 0; i < n; i++)
439 /* Reads the 464-byte file header. */
441 read_header (struct pfm_reader *r)
443 /* For now at least, just ignore the vanity splash strings. */
447 for (i = 0; i < 200; i++)
452 unsigned char src[256];
456 for (i = 0; i < 256; i++)
458 src[i] = (unsigned char) r->cc;
462 for (i = 0; i < 256; i++)
465 /* 0 is used to mark untranslatable characters, so we have to mark
467 trans_temp[src[64]] = 64;
468 for (i = 0; i < 256; i++)
469 if (trans_temp[src[i]] == -1)
470 trans_temp[src[i]] = i;
472 r->trans = xmalloc (256);
473 for (i = 0; i < 256; i++)
474 r->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
476 /* Translate the input buffer. */
477 for (i = 0; i < 80; i++)
478 r->buf[i] = r->trans[r->buf[i]];
479 r->cc = r->trans[r->cc];
483 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
486 for (i = 0; i < 8; i++)
488 lose ((r, "Missing SPSSPORT signature."));
497 /* Reads the version and date info record, as well as product and
498 subproduct identification records if present. */
500 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
503 if (!match (74 /* A */))
504 lose ((r, "Unrecognized version code %d.", r->cc));
508 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
509 char *date = read_string (r);
514 if (strlen (date) != 8)
515 lose ((r, _("Bad date string length %d."), strlen (date)));
516 for (i = 0; i < 8; i++)
518 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
519 lose ((r, _("Bad character in date.")));
521 info->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
525 info->creation_date[2] = info->creation_date[5] = ' ';
526 info->creation_date[10] = 0;
532 static const int map[] = {0, 1, 3, 4, 6, 7};
533 char *time = read_string (r);
538 if (strlen (time) != 6)
539 lose ((r, _("Bad time string length %d."), strlen (time)));
540 for (i = 0; i < 6; i++)
542 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
543 lose ((r, _("Bad character in time.")));
545 info->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
549 info->creation_time[2] = info->creation_time[5] = ' ';
550 info->creation_time[8] = 0;
555 if (match (65 /* 1 */))
559 product = read_string (r);
563 strncpy (info->product, product, 61);
566 info->product[0] = 0;
569 if (match (67 /* 3 */))
573 subproduct = read_string (r);
574 if (subproduct == NULL)
577 strncpy (info->subproduct, subproduct, 61);
580 info->subproduct[0] = 0;
588 convert_format (struct pfm_reader *r, int fmt[3], struct fmt_spec *v,
591 v->type = translate_fmt (fmt[0]);
593 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
597 /* FIXME? Should verify the resulting specifier more thoroughly. */
600 lose ((r, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
601 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
602 lose ((r, _("%s variable %s has %s format specifier %s."),
603 vv->type == ALPHA ? _("String") : _("Numeric"),
605 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
606 formats[v->type].name));
613 /* Translation table from SPSS character code to this computer's
614 native character code (which is probably ASCII). */
615 static const unsigned char spss2ascii[256] =
618 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
619 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
623 /* Translate string S into ASCII. */
628 *s = spss2ascii[(unsigned char) *s];
631 static int parse_value (struct pfm_reader *, union value *, struct variable *);
633 /* Read information on all the variables.
    The input consists of a variable count record (code 68), a flag
    value of unknown purpose, an optional weight variable name
    (code 70), and then one variable record (code 71) per variable.
    Each variable record holds the width, the name, and six integers
    that form the print and write formats, optionally followed by
    missing values (codes 75, 74, 73 and 72) and a variable label
    (code 76). Widths are recorded in R and the variables are
    created in DICT. */
635 read_variables (struct pfm_reader *r, struct dictionary *dict)
637 char *weight_name = NULL;
640 if (!match (68 /* 4 */))
641 lose ((r, _("Expected variable count record.")));
643 r->var_cnt = read_int (r);
644 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
645 lose ((r, _("Invalid number of variables %d."), r->var_cnt));
646 r->widths = xmalloc (sizeof *r->widths * r->var_cnt);
648 /* Purpose of this value is unknown. It is typically 161. */
650 int x = read_int (r);
655 corrupt_msg (r, _("Unexpected flag value %d."), x);
658 if (match (70 /* 6 */))
660 weight_name = read_string (r);
664 asciify (weight_name);
665 if (strlen (weight_name) > 8)
667 corrupt_msg (r, _("Weight variable name (%s) truncated."),
669 weight_name[8] = '\0';
673 for (i = 0; i < r->var_cnt; i++)
681 if (!match (71 /* 7 */))
682 lose ((r, _("Expected variable record.")));
684 width = read_int (r);
685 if (width == NOT_INT)
688 lose ((r, _("Invalid variable width %d."), width));
689 r->widths[i] = width;
691 name = read_string (r);
694 for (j = 0; j < 6; j++)
696 fmt[j] = read_int (r);
697 if (fmt[j] == NOT_INT)
701 /* Verify length and first character of variable name.
703 Weirdly enough, there is no # character in the SPSS portable
704 character set, so we can't check for it. */
705 if (strlen (name) > 8)
706 lose ((r, _("position %d: Variable name has %u characters."),
708 if ((name[0] < 74 /* A */ || name[0] > 125 /* z */)
709 && name[0] != 152 /* @ */)
710 lose ((r, _("position %d: Variable name begins with invalid "
712 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
714 corrupt_msg (r, _("position %d: Variable name begins with "
715 "lowercase letter %c."),
716 i, name[0] - 100 + 'a');
717 name[0] -= 26 /* a - A */;
720 /* Verify remaining characters of variable name. */
721 for (j = 1; j < (int) strlen (name); j++)
725 if (c >= 100 /* a */ && c <= 125 /* z */)
727 corrupt_msg (r, _("position %d: Variable name character %d "
728 "is lowercase letter %c."),
729 i, j + 1, c - 100 + 'a');
730 name[j] -= 26 /* z - Z */;
732 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
733 || c == 127 /* . */ || c == 152 /* @ */
734 || c == 136 /* $ */ || c == 146 /* _ */)
737 lose ((r, _("position %d: character `\\%03o' is not "
738 "valid in a variable name."), i, c));
742 if (width < 0 || width > 255)
743 lose ((r, "Bad width %d for variable %s.", width, name));
745 v = dict_create_var (dict, name, width);
747 lose ((r, _("Duplicate variable name %s."), name));
748 if (!convert_format (r, &fmt[0], &v->print, v))
750 if (!convert_format (r, &fmt[3], &v->write, v))
753 /* Range missing values. */
754 if (match (75 /* B */))
756 v->miss_type = MISSING_RANGE;
757 if (!parse_value (r, &v->missing[0], v)
758 || !parse_value (r, &v->missing[1], v))
761 else if (match (74 /* A */))
763 v->miss_type = MISSING_HIGH;
764 if (!parse_value (r, &v->missing[0], v))
767 else if (match (73 /* 9 */))
769 v->miss_type = MISSING_LOW;
770 if (!parse_value (r, &v->missing[0], v))
774 /* Single missing values. */
775 while (match (72 /* 8 */))
777 static const int map_next[MISSING_COUNT] =
779 MISSING_1, MISSING_2, MISSING_3, -1,
780 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
784 static const int map_ofs[MISSING_COUNT] =
786 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
789 v->miss_type = map_next[v->miss_type];
790 if (v->miss_type == -1)
791 lose ((r, _("Bad missing values for %s."), v->name));
793 assert (map_ofs[v->miss_type] != -1);
794 if (!parse_value (r, &v->missing[map_ofs[v->miss_type]], v))
798 if (match (76 /* C */))
800 char *label = read_string (r);
805 v->label = xstrdup (label);
810 if (weight_name != NULL)
812 struct variable *weight_var = dict_lookup_var (dict, weight_name);
813 if (weight_var == NULL)
814 lose ((r, _("Weighting variable %s not present in dictionary."),
818 dict_set_weight (dict, weight_var);
828 /* Parse a value for variable VV into value V. Returns success. */
830 parse_value (struct pfm_reader *r, union value *v, struct variable *vv)
832 if (vv->type == ALPHA)
834 char *mv = read_string (r);
840 strncpy (v->s, mv, 8);
841 for (j = 0; j < 8; j++)
843 v->s[j] = spss2ascii[v->s[j]];
845 /* Value labels are always padded with spaces. */
850 v->f = read_float (r);
851 if (v->f == second_lowest_value)
858 /* Parse a value label record and return success. */
860 read_value_label (struct pfm_reader *r, struct dictionary *dict)
875 v = xmalloc (sizeof *v * nv);
876 for (i = 0; i < nv; i++)
878 char *name = read_string (r);
883 v[i] = dict_lookup_var (dict, name);
885 lose ((r, _("Unknown variable %s while parsing value labels."), name));
887 if (v[0]->width != v[i]->width)
888 lose ((r, _("Cannot assign value labels to %s and %s, which "
889 "have different variable types or widths."),
890 v[0]->name, v[i]->name));
893 n_labels = read_int (r);
894 if (n_labels == NOT_INT)
897 for (i = 0; i < n_labels; i++)
904 if (!parse_value (r, &val, v[0]))
907 label = read_string (r);
912 /* Assign the value_label's to each variable. */
913 for (j = 0; j < nv; j++)
915 struct variable *var = v[j];
917 if (!val_labs_replace (var->val_labs, val, label))
920 if (var->type == NUMERIC)
921 lose ((r, _("Duplicate label for value %g for variable %s."),
924 lose ((r, _("Duplicate label for value `%.*s' for variable %s."),
925 var->width, val.s, var->name));
936 /* Reads one case from portable file R into C. Returns nonzero
937 only if successful. */
939 pfm_read_case (struct pfm_reader *r, struct ccase *c)
944 /* Check for end of file. */
945 if (r->cc == 99 /* Z */)
949 for (i = 0; i < r->var_cnt; i++)
951 int width = r->widths[i];
955 double f = read_float (r);
956 if (f == second_lowest_value)
959 case_data_rw (c, idx)->f = f;
964 char *s = read_string (r);
969 st_bare_pad_copy (case_data_rw (c, idx)->s, s, width);
970 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
977 lose ((r, _("End of file midway through case.")));