1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* AIX requires this to be the first thing in the file. */
23 #define alloca __builtin_alloca
31 #ifndef alloca /* predefined by HP cc +Olibcalls */
47 #include "file-handle.h"
56 #include "debug-print.h"
58 /* pfm's file_handle extension. */
61 FILE *file; /* Actual file. */
63 struct dictionary *dict; /* File's dictionary. */
64 int weight_index; /* 0-based index of weight variable, or -1. */
66 unsigned char *trans; /* 256-byte character set translation table. */
68 int nvars; /* Number of variables. */
69 int *vars; /* Variable widths, 0 for numeric. */
70 int case_size; /* Number of `value's per case. */
72 unsigned char buf[83]; /* Input buffer. */
73 unsigned char *bp; /* Buffer pointer. */
74 int cc; /* Current character. */
77 static struct fh_ext_class pfm_r_class;
80 corrupt_msg (struct file_handle *h, const char *format,...)
81 __attribute__ ((format (printf, 2, 3)));
83 /* Displays a corruption error. */
85 corrupt_msg (struct file_handle *h, const char *format, ...)
87 struct pfm_fhuser_ext *ext = h->ext;
93 va_start (args, format);
94 vsnprintf (buf, 1024, format, args);
103 getl_location (&e.where.filename, &e.where.line_number);
104 e.title = title = local_alloc (strlen (h->fn) + 80);
105 sprintf (title, _("portable file %s corrupt at offset %ld: "),
106 h->fn, ftell (ext->file) - (82 - (long) (ext->bp - ext->buf)));
117 /* Closes a portable file after we're done with it. */
119 pfm_close (struct file_handle * h)
121 struct pfm_fhuser_ext *ext = h->ext;
123 if (EOF == fclose (ext->file))
124 msg (ME, _("%s: Closing portable file: %s."), h->fn, strerror (errno));
130 /* Displays the message X with corrupt_msg, then jumps to the lossage
140 /* Read an 80-character line into handle H's buffer. Return
143 fill_buf (struct file_handle *h)
145 struct pfm_fhuser_ext *ext = h->ext;
147 if (80 != fread (ext->buf, 1, 80, ext->file))
148 lose ((h, _("Unexpected end of file.")));
150 /* PORTME: line ends. */
154 c = getc (ext->file);
155 if (c != '\n' && c != '\r')
156 lose ((h, _("Bad line end.")));
158 c = getc (ext->file);
159 if (c != '\n' && c != '\r')
160 ungetc (c, ext->file);
167 for (i = 0; i < 80; i++)
168 ext->buf[i] = ext->trans[ext->buf[i]];
179 /* Read a single character into ext->cc. Return success. */
181 read_char (struct file_handle *h)
183 struct pfm_fhuser_ext *ext = h->ext;
185 if (ext->bp >= &ext->buf[80] && !fill_buf (h))
187 ext->cc = *ext->bp++;
191 /* Advance a single character. */
192 #define advance() if (!read_char (h)) goto lossage
194 /* Skip a single character if present, and return whether it was
197 skip_char (struct file_handle *h, int c)
199 struct pfm_fhuser_ext *ext = h->ext;
210 /* Skip a single character if present, and return whether it was
212 #define match(C) skip_char (h, C)
214 static int read_header (struct file_handle *h);
215 static int read_version_data (struct file_handle *h, struct pfm_read_info *inf);
216 static int read_variables (struct file_handle *h);
217 static int read_value_label (struct file_handle *h);
218 void dump_dictionary (struct dictionary *dict);
220 /* Reads the dictionary from file with handle H, and returns it in a
221 dictionary structure. This dictionary may be modified in order to
222 rename, reorder, and delete variables, etc. */
224 pfm_read_dictionary (struct file_handle *h, struct pfm_read_info *inf)
226 /* The file handle extension record. */
227 struct pfm_fhuser_ext *ext;
229 /* Check whether the file is already open. */
230 if (h->class == &pfm_r_class)
235 else if (h->class != NULL)
237 msg (ME, _("Cannot read file %s as portable file: already opened "
239 fh_handle_name (h), h->class->name);
243 msg (VM (1), _("%s: Opening portable-file handle %s for reading."),
244 fh_handle_filename (h), fh_handle_name (h));
246 /* Open the physical disk file. */
247 ext = xmalloc (sizeof (struct pfm_fhuser_ext));
248 ext->file = fopen (h->norm_fn, "rb");
249 if (ext->file == NULL)
251 msg (ME, _("An error occurred while opening \"%s\" for reading "
252 "as a portable file: %s."), h->fn, strerror (errno));
258 /* Initialize the pfm_fhuser_ext structure. */
259 h->class = &pfm_r_class;
267 /* Read the header. */
268 if (!read_header (h))
271 /* Read version, date info, product identification. */
272 if (!read_version_data (h, inf))
275 /* Read variables. */
276 if (!read_variables (h))
280 while (match (77 /* D */))
281 if (!read_value_label (h))
284 if (!match (79 /* F */))
285 lose ((h, _("Data record expected.")));
287 msg (VM (2), _("Read portable-file dictionary successfully."));
290 dump_dictionary (ext->dict);
295 /* Come here on unsuccessful completion. */
296 msg (VM (1), _("Error reading portable-file dictionary."));
299 if (ext && ext->dict)
300 free_dictionary (ext->dict);
307 /* Read a floating point value and return its value, or
308 second_lowest_value on error. */
310 read_float (struct file_handle *h)
312 struct pfm_fhuser_ext *ext = h->ext;
319 /* Skip leading spaces. */
320 while (match (126 /* space */))
323 if (match (137 /* * */))
325 advance (); /* Probably a dot (.) but doesn't appear to matter. */
328 else if (match (141 /* - */))
333 if (ext->cc >= 64 /* 0 */ && ext->cc <= 93 /* T */)
337 /* Make sure that multiplication by 30 will not overflow. */
338 if (num > DBL_MAX * (1. / 30.))
339 /* The value of the digit doesn't matter, since we have already
340 gotten as many digits as can be represented in a `double'.
341 This doesn't necessarily mean the result will overflow.
342 The exponent may reduce it to within range.
344 We just need to record that there was another
345 digit so that we can multiply by 30 later. */
348 num = (num * 30.0) + (ext->cc - 64);
350 /* Keep track of the number of digits after the decimal point.
351 If we just divided by 30 here, we would lose precision. */
355 else if (!got_dot && ext->cc == 127 /* . */)
356 /* Record that we have found the decimal point. */
359 /* Any other character terminates the number. */
366 lose ((h, "Number expected."));
368 if (ext->cc == 130 /* + */ || ext->cc == 141 /* - */)
370 /* Get the exponent. */
372 int neg_exp = ext->cc == 141 /* - */;
378 if (ext->cc < 64 /* 0 */ || ext->cc > 93 /* T */)
381 if (exp > LONG_MAX / 30)
383 exp = exp * 30 + (ext->cc - 64);
386 /* We don't check whether there were actually any digits, but we
393 if (!match (142 /* / */))
394 lose ((h, _("Missing numeric terminator.")));
396 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
399 num *= pow (30.0, (double) exponent);
400 else if (exponent > 0)
402 if (num > DBL_MAX * pow (30.0, (double) -exponent))
404 num *= pow (30.0, (double) exponent);
414 return -DBL_MAX / 10.;
419 return second_lowest_value;
422 /* Read an integer and return its value, or NOT_INT on failure. */
424 read_int (struct file_handle *h)
426 double f = read_float (h);
428 if (f == second_lowest_value)
430 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
431 lose ((h, _("Bad integer format.")));
438 /* Reads a string and returns its value in a static buffer, or NULL on
439 failure. The buffer can be deallocated by calling with a NULL
441 static unsigned char *
442 read_string (struct file_handle *h)
444 struct pfm_fhuser_ext *ext = h->ext;
454 else if (buf == NULL)
460 if (n < 0 || n > 255)
461 lose ((h, _("Bad string length %d."), n));
466 for (i = 0; i < n; i++)
480 /* Reads the 464-byte file header. */
482 read_header (struct file_handle *h)
484 struct pfm_fhuser_ext *ext = h->ext;
486 /* For now at least, just ignore the vanity splash strings. */
490 for (i = 0; i < 200; i++)
495 unsigned char src[256];
499 for (i = 0; i < 256; i++)
501 src[i] = (unsigned char) ext->cc;
505 for (i = 0; i < 256; i++)
508 /* 0 is used to mark untranslatable characters, so we have to mark
510 trans_temp[src[64]] = 64;
511 for (i = 0; i < 256; i++)
512 if (trans_temp[src[i]] == -1)
513 trans_temp[src[i]] = i;
515 ext->trans = xmalloc (256);
516 for (i = 0; i < 256; i++)
517 ext->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
519 /* Translate the input buffer. */
520 for (i = 0; i < 80; i++)
521 ext->buf[i] = ext->trans[ext->buf[i]];
522 ext->cc = ext->trans[ext->cc];
526 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
529 for (i = 0; i < 8; i++)
531 lose ((h, "Missing SPSSPORT signature."));
540 /* Reads the version and date info record, as well as product and
541 subproduct identification records if present. */
543 read_version_data (struct file_handle *h, struct pfm_read_info *inf)
545 struct pfm_fhuser_ext *ext = h->ext;
548 if (!match (74 /* A */))
549 lose ((h, "Unrecognized version code %d.", ext->cc));
553 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
554 char *date = read_string (h);
559 if (strlen (date) != 8)
560 lose ((h, _("Bad date string length %d."), strlen (date)));
561 for (i = 0; i < 8; i++)
563 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
564 lose ((h, _("Bad character in date.")));
566 inf->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
570 inf->creation_date[2] = inf->creation_date[5] = ' ';
571 inf->creation_date[10] = 0;
577 static const int map[] = {0, 1, 3, 4, 6, 7};
578 char *time = read_string (h);
583 if (strlen (time) != 6)
584 lose ((h, _("Bad time string length %d."), strlen (time)));
585 for (i = 0; i < 6; i++)
587 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
588 lose ((h, _("Bad character in time.")));
590 inf->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
594 inf->creation_time[2] = inf->creation_time[5] = ' ';
595 inf->creation_time[8] = 0;
600 if (match (65 /* 1 */))
604 product = read_string (h);
608 strncpy (inf->product, product, 61);
614 if (match (67 /* 3 */))
618 subproduct = read_string (h);
619 if (subproduct == NULL)
622 strncpy (inf->subproduct, subproduct, 61);
625 inf->subproduct[0] = 0;
633 convert_format (struct file_handle *h, int fmt[3], struct fmt_spec *v,
637 || (size_t) fmt[0] >= sizeof translate_fmt / sizeof *translate_fmt)
638 lose ((h, _("%s: Bad format specifier byte %d."), vv->name, fmt[0]));
640 v->type = translate_fmt[fmt[0]];
644 /* FIXME? Should verify the resulting specifier more thoroughly. */
647 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
648 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
649 lose ((h, _("%s variable %s has %s format specifier %s."),
650 vv->type == ALPHA ? _("String") : _("Numeric"),
652 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
653 formats[v->type].name));
660 /* Translation table from SPSS character code to this computer's
661 native character code (which is probably ASCII). */
662 static const unsigned char spss2ascii[256] =
665 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
666 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
670 /* Translate string S into ASCII. */
675 *s = spss2ascii[(unsigned char) *s];
678 static int parse_value (struct file_handle *, union value *, struct variable *);
680 /* Read information on all the variables. */
682 read_variables (struct file_handle *h)
684 struct pfm_fhuser_ext *ext = h->ext;
687 if (!match (68 /* 4 */))
688 lose ((h, _("Expected variable count record.")));
690 ext->nvars = read_int (h);
691 if (ext->nvars <= 0 || ext->nvars == NOT_INT)
692 lose ((h, _("Invalid number of variables %d."), ext->nvars));
693 ext->vars = xmalloc (sizeof *ext->vars * ext->nvars);
695 /* Purpose of this value is unknown. It is typically 161. */
697 int x = read_int (h);
702 corrupt_msg (h, _("Unexpected flag value %d."), x);
705 ext->dict = new_dictionary (0);
707 if (match (70 /* 6 */))
709 char *name = read_string (h);
713 strcpy (ext->dict->weight_var, name);
714 asciify (ext->dict->weight_var);
717 for (i = 0; i < ext->nvars; i++)
725 if (!match (71 /* 7 */))
726 lose ((h, _("Expected variable record.")));
728 width = read_int (h);
729 if (width == NOT_INT)
732 lose ((h, _("Invalid variable width %d."), width));
733 ext->vars[i] = width;
735 name = read_string (h);
738 for (j = 0; j < 6; j++)
740 fmt[j] = read_int (h);
741 if (fmt[j] == NOT_INT)
745 /* Verify first character of variable name.
747 Weirdly enough, there is no # character in the SPSS portable
748 character set, so we can't check for it. */
749 if (strlen (name) > 8)
750 lose ((h, _("position %d: Variable name has %u characters."),
752 if ((name[0] < 74 /* A */ || name[0] > 125 /* z */)
753 && name[0] != 152 /* @ */)
754 lose ((h, _("position %d: Variable name begins with invalid "
756 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
758 corrupt_msg (h, _("position %d: Variable name begins with "
759 "lowercase letter %c."),
760 i, name[0] - 100 + 'a');
761 name[0] -= 26 /* a - A */;
764 /* Verify remaining characters of variable name. */
765 for (j = 1; j < (int) strlen (name); j++)
769 if (c >= 100 /* a */ && c <= 125 /* z */)
771 corrupt_msg (h, _("position %d: Variable name character %d "
772 "is lowercase letter %c."),
773 i, j + 1, c - 100 + 'a');
774 name[j] -= 26 /* z - Z */;
776 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
777 || c == 127 /* . */ || c == 152 /* @ */
778 || c == 136 /* $ */ || c == 146 /* _ */)
781 lose ((h, _("position %d: character `\\%03o' is not "
782 "valid in a variable name."), i, c));
786 if (width < 0 || width > 255)
787 lose ((h, "Bad width %d for variable %s.", width, name));
789 v = create_variable (ext->dict, name, width ? ALPHA : NUMERIC, width);
792 lose ((h, _("Duplicate variable name %s."), name));
793 if (!convert_format (h, &fmt[0], &v->print, v))
795 if (!convert_format (h, &fmt[3], &v->write, v))
798 /* Range missing values. */
799 if (match (75 /* B */))
801 v->miss_type = MISSING_RANGE;
802 if (!parse_value (h, &v->missing[0], v)
803 || !parse_value (h, &v->missing[1], v))
806 else if (match (74 /* A */))
808 v->miss_type = MISSING_HIGH;
809 if (!parse_value (h, &v->missing[0], v))
812 else if (match (73 /* 9 */))
814 v->miss_type = MISSING_LOW;
815 if (!parse_value (h, &v->missing[0], v))
819 /* Single missing values. */
820 while (match (72 /* 8 */))
822 static const int map_next[MISSING_COUNT] =
824 MISSING_1, MISSING_2, MISSING_3, -1,
825 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
829 static const int map_ofs[MISSING_COUNT] =
831 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
834 v->miss_type = map_next[v->miss_type];
835 if (v->miss_type == -1)
836 lose ((h, _("Bad missing values for %s."), v->name));
838 assert (map_ofs[v->miss_type] != -1);
839 if (!parse_value (h, &v->missing[map_ofs[v->miss_type]], v))
843 if (match (76 /* C */))
845 char *label = read_string (h);
850 v->label = xstrdup (label);
854 ext->case_size = ext->dict->nval;
856 if (ext->dict->weight_var[0] != 0
857 && !find_dict_variable (ext->dict, ext->dict->weight_var))
858 lose ((h, _("Weighting variable %s not present in dictionary."),
859 ext->dict->weight_var));
867 /* Parse a value for variable VV into value V. Returns success. */
869 parse_value (struct file_handle *h, union value *v, struct variable *vv)
871 if (vv->type == ALPHA)
873 char *mv = read_string (h);
879 strncpy (v->s, mv, 8);
880 for (j = 0; j < 8; j++)
882 v->s[j] = spss2ascii[v->s[j]];
884 /* Value labels are always padded with spaces. */
889 v->f = read_float (h);
890 if (v->f == second_lowest_value)
897 /* Parse a value label record and return success. */
899 read_value_label (struct file_handle *h)
901 struct pfm_fhuser_ext *ext = h->ext;
916 v = xmalloc (sizeof *v * nv);
917 for (i = 0; i < nv; i++)
919 char *name = read_string (h);
924 v[i] = find_dict_variable (ext->dict, name);
926 lose ((h, _("Unknown variable %s while parsing value labels."), name));
928 if (v[0]->width != v[i]->width)
929 lose ((h, _("Cannot assign value labels to %s and %s, which "
930 "have different variable types or widths."),
931 v[0]->name, v[i]->name));
934 n_labels = read_int (h);
935 if (n_labels == NOT_INT)
938 for (i = 0; i < n_labels; i++)
942 struct value_label *vl;
946 if (!parse_value (h, &val, v[0]))
949 label = read_string (h);
954 /* Create a label. */
955 vl = xmalloc (sizeof *vl);
957 vl->s = xstrdup (label);
960 /* Assign the value_label's to each variable. */
961 for (j = 0; j < nv; j++)
963 struct variable *var = v[j];
964 struct value_label *old;
966 /* Create AVL tree if necessary. */
968 var->val_lab = avl_create (NULL, val_lab_cmp,
969 (void *) (var->width));
971 old = avl_replace (var->val_lab, vl);
975 if (var->type == NUMERIC)
976 lose ((h, _("Duplicate label for value %g for variable %s."),
977 vl->v.f, var->name));
979 lose ((h, _("Duplicate label for value `%.*s' for variable %s."),
980 var->width, vl->v.s, var->name));
982 free_value_label (old);
993 /* Reads one case from portable file H into the value array PERM
994 according to the instructions given in associated dictionary DICT,
995 which must have the get.fv elements appropriately set. Returns
996 nonzero only if successful. */
998 pfm_read_case (struct file_handle *h, union value *perm, struct dictionary *dict)
1000 struct pfm_fhuser_ext *ext = h->ext;
1002 union value *temp, *tp;
1005 /* Check for end of file. */
1006 if (ext->cc == 99 /* Z */)
1009 /* The first concern is to obtain a full case relative to the data
1010 file. (Cases in the data file have no particular relationship to
1011 cases in the active file.) */
1012 tp = temp = local_alloc (sizeof *tp * ext->case_size);
1013 for (tp = temp, i = 0; i < ext->nvars; i++)
1014 if (ext->vars[i] == 0)
1016 tp->f = read_float (h);
1017 if (tp->f == second_lowest_value)
1018 goto unexpected_eof;
1023 char *s = read_string (h);
1025 goto unexpected_eof;
1028 st_bare_pad_copy (tp->s, s, ext->vars[i]);
1029 tp += DIV_RND_UP (ext->vars[i], MAX_SHORT_STRING);
1032 /* Translate a case in data file format to a case in active file
1034 for (i = 0; i < dict->nvar; i++)
1036 struct variable *v = dict->var[i];
1038 if (v->get.fv == -1)
1041 if (v->type == NUMERIC)
1042 perm[v->fv].f = temp[v->get.fv].f;
1044 memcpy (&perm[v->fv].s, &temp[v->get.fv], v->width);
1051 lose ((h, _("End of file midway through case.")));
1058 static struct fh_ext_class pfm_r_class =
1061 N_("reading as a portable file"),