1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* AIX requires this to be the first thing in the file. */
23 #define alloca __builtin_alloca
31 #ifndef alloca /* predefined by HP cc +Olibcalls */
47 #include "file-handle.h"
57 /*#define DEBUGGING 1*/
58 #include "debug-print.h"
60 /* pfm's file_handle extension. */
63 FILE *file; /* Actual file. */
65 struct dictionary *dict; /* File's dictionary. */
66 int weight_index; /* 0-based index of weight variable, or -1. */
68 unsigned char *trans; /* 256-byte character set translation table. */
70 int nvars; /* Number of variables. */
71 int *vars; /* Variable widths, 0 for numeric. */
72 int case_size; /* Number of `value's per case. */
74 unsigned char buf[83]; /* Input buffer. */
75 unsigned char *bp; /* Buffer pointer. */
76 int cc; /* Current character. */
79 static struct fh_ext_class pfm_r_class;
82 corrupt_msg (struct file_handle *h, const char *format,...)
83 __attribute__ ((format (printf, 2, 3)));
85 /* Displays a corruption error. */
87 corrupt_msg (struct file_handle *h, const char *format, ...)
89 struct pfm_fhuser_ext *ext = h->ext;
95 va_start (args, format);
96 vsnprintf (buf, 1024, format, args);
105 getl_location (&e.where.filename, &e.where.line_number);
106 e.title = title = local_alloc (strlen (h->fn) + 80);
107 sprintf (title, _("portable file %s corrupt at offset %ld: "),
108 h->fn, ftell (ext->file) - (82 - (long) (ext->bp - ext->buf)));
119 /* Closes the portable file attached to handle H after we're done with it.  If fclose() fails, the failure is reported through msg() together with the strerror() text for errno. */
121 pfm_close (struct file_handle * h)
123 struct pfm_fhuser_ext *ext = h->ext; /* Reader state stored on the handle. */
125 if (EOF == fclose (ext->file))
126 msg (ME, _("%s: Closing portable file: %s."), h->fn, strerror (errno));
132 /* Displays the message X with corrupt_msg, then jumps to the lossage
142 /* Read an 80-character line into handle H's buffer. Return
145 fill_buf (struct file_handle *h)
147 struct pfm_fhuser_ext *ext = h->ext;
149 if (80 != fread (ext->buf, 1, 80, ext->file))
150 lose ((h, _("Unexpected end of file.")));
152 /* PORTME: line ends. */
156 c = getc (ext->file);
157 if (c != '\n' && c != '\r')
158 lose ((h, _("Bad line end.")));
160 c = getc (ext->file);
161 if (c != '\n' && c != '\r')
162 ungetc (c, ext->file);
169 for (i = 0; i < 80; i++)
170 ext->buf[i] = ext->trans[ext->buf[i]];
181 /* Read a single character from handle H's buffer into ext->cc, refilling the buffer with fill_buf() first when it has been used up.  Return success. */
183 read_char (struct file_handle *h)
185 struct pfm_fhuser_ext *ext = h->ext;
187 if (ext->bp >= &ext->buf[80] && !fill_buf (h)) /* All 80 buffered characters consumed: fetch the next line; the refill may fail. */
189 ext->cc = *ext->bp++; /* Publish the current character and step the buffer pointer past it. */
193 /* Advance a single character. */
194 #define advance() if (!read_char (h)) goto lossage
196 /* Skip a single character if present, and return whether it was
199 skip_char (struct file_handle *h, int c)
201 struct pfm_fhuser_ext *ext = h->ext;
212 /* Skip a single character if present, and return whether it was
214 #define match(C) skip_char (h, C)
216 static int read_header (struct file_handle *h);
217 static int read_version_data (struct file_handle *h, struct pfm_read_info *inf);
218 static int read_variables (struct file_handle *h);
219 static int read_value_label (struct file_handle *h);
220 void dump_dictionary (struct dictionary *dict);
222 /* Reads the dictionary from file with handle H, and returns it in a
223 dictionary structure. This dictionary may be modified in order to
224 rename, reorder, and delete variables, etc. INF is passed on to read_version_data(). */
226 pfm_read_dictionary (struct file_handle *h, struct pfm_read_info *inf)
228 /* The file handle extension record. */
229 struct pfm_fhuser_ext *ext;
231 /* Check whether the file is already open. */
232 if (h->class == &pfm_r_class) /* Already opened by this portable-file reader. */
237 else if (h->class != NULL) /* Opened by some other handle class: refuse. */
239 msg (ME, _("Cannot read file %s as portable file: already opened "
241 fh_handle_name (h), h->class->name);
245 msg (VM (1), _("%s: Opening portable-file handle %s for reading."),
246 fh_handle_filename (h), fh_handle_name (h));
248 /* Open the physical disk file. */
249 ext = xmalloc (sizeof (struct pfm_fhuser_ext));
250 ext->file = fopen (h->norm_fn, "rb");
251 if (ext->file == NULL)
253 msg (ME, _("An error occurred while opening \"%s\" for reading "
254 "as a portable file: %s."), h->fn, strerror (errno));
260 /* Initialize the pfm_fhuser_ext structure. */
261 h->class = &pfm_r_class; /* Mark the handle as belonging to the portable-file reader. */
269 /* Read the header. */
270 if (!read_header (h))
273 /* Read version, date info, product identification. */
274 if (!read_version_data (h, inf))
277 /* Read variables. */
278 if (!read_variables (h))
282 while (match (77 /* D */)) /* Each D record is handed to read_value_label(). */
283 if (!read_value_label (h))
286 if (!match (79 /* F */)) /* The dictionary must be followed by an F (data) record. */
287 lose ((h, _("Data record expected.")));
289 msg (VM (2), _("Read portable-file dictionary successfully."));
292 dump_dictionary (ext->dict);
297 /* Come here on unsuccessful completion. */
298 msg (VM (1), _("Error reading portable-file dictionary."));
301 if (ext && ext->dict) /* Discard a dictionary that was only partially built. */
302 free_dictionary (ext->dict);
309 /* Read a floating point value and return its value, or
310 second_lowest_value on error. The number is written in base 30: digit codes 64 through 93 (0 through T) carry the values 0 through 29. */
312 read_float (struct file_handle *h)
314 struct pfm_fhuser_ext *ext = h->ext;
321 /* Skip leading spaces. */
322 while (match (126 /* space */))
325 if (match (137 /* * */))
327 advance (); /* Probably a dot (.) but doesn't appear to matter. */
330 else if (match (141 /* - */))
335 if (ext->cc >= 64 /* 0 */ && ext->cc <= 93 /* T */)
339 /* Make sure that multiplication by 30 will not overflow. */
340 if (num > DBL_MAX * (1. / 30.))
341 /* The value of the digit doesn't matter, since we have already
342 gotten as many digits as can be represented in a `double'.
343 This doesn't necessarily mean the result will overflow.
344 The exponent may reduce it to within range.
346 We just need to record that there was another
347 digit so that we can multiply by 30 later. */
350 num = (num * 30.0) + (ext->cc - 64);
352 /* Keep track of the number of digits after the decimal point.
353 If we just divided by 30 here, we would lose precision. */
357 else if (!got_dot && ext->cc == 127 /* . */)
358 /* Record that we have found the decimal point. */
361 /* Any other character terminates the number. */
368 lose ((h, "Number expected."));
370 if (ext->cc == 130 /* + */ || ext->cc == 141 /* - */)
372 /* Get the exponent. */
374 int neg_exp = ext->cc == 141 /* - */;
380 if (ext->cc < 64 /* 0 */ || ext->cc > 93 /* T */)
383 if (exp > LONG_MAX / 30)
385 exp = exp * 30 + (ext->cc - 64);
388 /* We don't check whether there were actually any digits, but we
395 if (!match (142 /* / */))
396 lose ((h, _("Missing numeric terminator.")));
398 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
401 num *= pow (30.0, (double) exponent);
402 else if (exponent > 0)
404 if (num > DBL_MAX * pow (30.0, (double) -exponent)) /* Same test as num * 30^exponent > DBL_MAX, written so the comparison itself cannot overflow. */
406 num *= pow (30.0, (double) exponent);
416 return -DBL_MAX / 10.;
421 return second_lowest_value;
424 /* Read an integer and return its value, or NOT_INT on failure.  The value is parsed with read_float() and then checked for integrality and range. */
426 read_int (struct file_handle *h)
428 double f = read_float (h);
430 if (f == second_lowest_value) /* read_float() signals failure with this sentinel. */
432 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN) /* Reject fractional values and anything not strictly between INT_MIN and INT_MAX. */
433 lose ((h, _("Bad integer format.")));
440 /* Reads a string and returns its value in a static buffer, or NULL on
441 failure. The buffer can be deallocated by calling with a NULL
443 static unsigned char *
444 read_string (struct file_handle *h)
446 struct pfm_fhuser_ext *ext = h->ext;
456 else if (buf == NULL)
462 if (n < 0 || n > 255)
463 lose ((h, _("Bad string length %d."), n));
468 for (i = 0; i < n; i++)
482 /* Reads the 464-byte file header. */
484 read_header (struct file_handle *h)
486 struct pfm_fhuser_ext *ext = h->ext;
488 /* For now at least, just ignore the vanity splash strings. */
492 for (i = 0; i < 200; i++)
497 unsigned char src[256];
501 for (i = 0; i < 256; i++)
503 src[i] = (unsigned char) ext->cc;
507 for (i = 0; i < 256; i++)
510 /* 0 is used to mark untranslatable characters, so we have to mark
512 trans_temp[src[64]] = 64;
513 for (i = 0; i < 256; i++)
514 if (trans_temp[src[i]] == -1)
515 trans_temp[src[i]] = i;
517 ext->trans = xmalloc (256);
518 for (i = 0; i < 256; i++)
519 ext->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
521 /* Translate the input buffer. */
522 for (i = 0; i < 80; i++)
523 ext->buf[i] = ext->trans[ext->buf[i]];
524 ext->cc = ext->trans[ext->cc];
528 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
531 for (i = 0; i < 8; i++)
533 lose ((h, "Missing SPSSPORT signature."));
542 /* Reads the version and date info record, as well as product and
543 subproduct identification records if present. The date and time digits are converted to native digits and stored, with space separators, in INF->creation_date and INF->creation_time. */
545 read_version_data (struct file_handle *h, struct pfm_read_info *inf)
547 struct pfm_fhuser_ext *ext = h->ext;
550 if (!match (74 /* A */))
551 lose ((h, "Unrecognized version code %d.", ext->cc));
555 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1}; /* map[i] is the creation_date slot for input digit i: the first four digits go to slots 6-9, the next two to 3-4, the last two to 0-1. */
556 char *date = read_string (h); /* NOTE(review): read_string() is declared as returning unsigned char *. */
561 if (strlen (date) != 8)
562 lose ((h, _("Bad date string length %d."), strlen (date))); /* NOTE(review): strlen() yields size_t but the format specifier is %d. */
563 for (i = 0; i < 8; i++)
565 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
566 lose ((h, _("Bad character in date.")));
568 inf->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
572 inf->creation_date[2] = inf->creation_date[5] = ' '; /* Slots 2 and 5 are left free by map[] and hold the separators. */
573 inf->creation_date[10] = 0;
579 static const int map[] = {0, 1, 3, 4, 6, 7}; /* Time digits keep their order; slots 2 and 5 are skipped for the separators. */
580 char *time = read_string (h);
585 if (strlen (time) != 6)
586 lose ((h, _("Bad time string length %d."), strlen (time))); /* NOTE(review): same size_t versus %d mismatch as for the date. */
587 for (i = 0; i < 6; i++)
589 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
590 lose ((h, _("Bad character in time.")));
592 inf->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
596 inf->creation_time[2] = inf->creation_time[5] = ' ';
597 inf->creation_time[8] = 0;
602 if (match (65 /* 1 */)) /* Optional product identification record. */
606 product = read_string (h);
610 strncpy (inf->product, product, 61); /* NOTE(review): strncpy leaves the destination unterminated when the source has 61 or more characters; read_string() accepts lengths up to 255. Confirm that a terminator is stored elsewhere. */
616 if (match (67 /* 3 */)) /* Optional subproduct identification record. */
620 subproduct = read_string (h);
621 if (subproduct == NULL)
624 strncpy (inf->subproduct, subproduct, 61); /* NOTE(review): same termination concern as for inf->product. */
627 inf->subproduct[0] = 0;
635 convert_format (struct file_handle *h, int fmt[3], struct fmt_spec *v,
639 || (size_t) fmt[0] >= sizeof translate_fmt / sizeof *translate_fmt)
640 lose ((h, _("%s: Bad format specifier byte %d."), vv->name, fmt[0]));
642 v->type = translate_fmt[fmt[0]];
646 /* FIXME? Should verify the resulting specifier more thoroughly. */
649 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
650 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
651 lose ((h, _("%s variable %s has %s format specifier %s."),
652 vv->type == ALPHA ? _("String") : _("Numeric"),
654 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
655 formats[v->type].name));
662 /* Translation table from SPSS character code to this computer's
663 native character code (which is probably ASCII). */
664 static const unsigned char spss2ascii[256] =
667 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
668 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
672 /* Translate string S into ASCII. */
677 *s = spss2ascii[(unsigned char) *s];
680 static int parse_value (struct file_handle *, union value *, struct variable *);
682 /* Read information on all the variables. */
684 read_variables (struct file_handle *h)
686 struct pfm_fhuser_ext *ext = h->ext;
689 if (!match (68 /* 4 */))
690 lose ((h, _("Expected variable count record.")));
692 ext->nvars = read_int (h);
693 if (ext->nvars <= 0 || ext->nvars == NOT_INT)
694 lose ((h, _("Invalid number of variables %d."), ext->nvars));
695 ext->vars = xmalloc (sizeof *ext->vars * ext->nvars);
697 /* Purpose of this value is unknown. It is typically 161. */
699 int x = read_int (h);
704 corrupt_msg (h, _("Unexpected flag value %d."), x);
707 ext->dict = new_dictionary (0);
709 if (match (70 /* 6 */))
711 char *name = read_string (h);
715 strcpy (ext->dict->weight_var, name);
716 asciify (ext->dict->weight_var);
719 for (i = 0; i < ext->nvars; i++)
727 if (!match (71 /* 7 */))
728 lose ((h, _("Expected variable record.")));
730 width = read_int (h);
731 if (width == NOT_INT)
734 lose ((h, _("Invalid variable width %d."), width));
735 ext->vars[i] = width;
737 name = read_string (h);
740 for (j = 0; j < 6; j++)
742 fmt[j] = read_int (h);
743 if (fmt[j] == NOT_INT)
747 /* Verify first character of variable name.
749 Weirdly enough, there is no # character in the SPSS portable
750 character set, so we can't check for it. */
751 if (strlen (name) > 8)
752 lose ((h, _("position %d: Variable name has %u characters."),
754 if ((name[0] < 74 /* A */ || name[0] > 125 /* Z */)
755 && name[0] != 152 /* @ */)
756 lose ((h, _("position %d: Variable name begins with invalid "
758 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
760 corrupt_msg (h, _("position %d: Variable name begins with "
761 "lowercase letter %c."),
762 i, name[0] - 100 + 'a');
763 name[0] -= 26 /* a - A */;
766 /* Verify remaining characters of variable name. */
767 for (j = 1; j < (int) strlen (name); j++)
771 if (c >= 100 /* a */ && c <= 125 /* z */)
773 corrupt_msg (h, _("position %d: Variable name character %d "
774 "is lowercase letter %c."),
775 i, j + 1, c - 100 + 'a');
776 name[j] -= 26 /* z - Z */;
778 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
779 || c == 127 /* . */ || c == 152 /* @ */
780 || c == 136 /* $ */ || c == 146 /* _ */)
783 lose ((h, _("position %d: character `\\%03o' is not "
784 "valid in a variable name."), i, c));
788 if (width < 0 || width > 255)
789 lose ((h, "Bad width %d for variable %s.", width, name));
791 v = create_variable (ext->dict, name, width ? ALPHA : NUMERIC, width);
794 lose ((h, _("Duplicate variable name %s."), name));
795 if (!convert_format (h, &fmt[0], &v->print, v))
797 if (!convert_format (h, &fmt[3], &v->write, v))
800 /* Range missing values. */
801 if (match (75 /* B */))
803 v->miss_type = MISSING_RANGE;
804 if (!parse_value (h, &v->missing[0], v)
805 || !parse_value (h, &v->missing[1], v))
808 else if (match (74 /* A */))
810 v->miss_type = MISSING_HIGH;
811 if (!parse_value (h, &v->missing[0], v))
814 else if (match (73 /* 9 */))
816 v->miss_type = MISSING_LOW;
817 if (!parse_value (h, &v->missing[0], v))
821 /* Single missing values. */
822 while (match (72 /* 8 */))
824 static const int map_next[MISSING_COUNT] =
826 MISSING_1, MISSING_2, MISSING_3, -1,
827 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
831 static const int map_ofs[MISSING_COUNT] =
833 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
836 v->miss_type = map_next[v->miss_type];
837 if (v->miss_type == -1)
838 lose ((h, _("Bad missing values for %s."), v->name));
840 assert (map_ofs[v->miss_type] != -1);
841 if (!parse_value (h, &v->missing[map_ofs[v->miss_type]], v))
845 if (match (76 /* C */))
847 char *label = read_string (h);
852 v->label = xstrdup (label);
856 ext->case_size = ext->dict->nval;
858 if (ext->dict->weight_var[0] != 0
859 && !find_dict_variable (ext->dict, ext->dict->weight_var))
860 lose ((h, _("Weighting variable %s not present in dictionary."),
861 ext->dict->weight_var));
869 /* Parse a value for variable VV into value V. Returns success.  String variables are read with read_string() and translated through spss2ascii; all other variables are read with read_float(). */
871 parse_value (struct file_handle *h, union value *v, struct variable *vv)
873 if (vv->type == ALPHA)
875 char *mv = read_string (h);
881 strncpy (v->s, mv, 8); /* Keeps at most 8 bytes; strncpy zero-fills when MV is shorter. */
882 for (j = 0; j < 8; j++)
884 v->s[j] = spss2ascii[v->s[j]]; /* NOTE(review): asciify() casts to unsigned char before indexing spss2ascii and this line does not; if v->s has a signed element type, codes of 128 and above would index out of bounds. Confirm the type of v->s. */
886 /* Value labels are always padded with spaces. */
891 v->f = read_float (h);
892 if (v->f == second_lowest_value) /* read_float() signals failure with this sentinel. */
899 /* Parse a value label record and return success. */
901 read_value_label (struct file_handle *h)
903 struct pfm_fhuser_ext *ext = h->ext;
918 v = xmalloc (sizeof *v * nv);
919 for (i = 0; i < nv; i++)
921 char *name = read_string (h);
926 v[i] = find_dict_variable (ext->dict, name);
928 lose ((h, _("Unknown variable %s while parsing value labels."), name));
930 if (v[0]->width != v[i]->width)
931 lose ((h, _("Cannot assign value labels to %s and %s, which "
932 "have different variable types or widths."),
933 v[0]->name, v[i]->name));
936 n_labels = read_int (h);
937 if (n_labels == NOT_INT)
940 for (i = 0; i < n_labels; i++)
944 struct value_label *vl;
948 if (!parse_value (h, &val, v[0]))
951 label = read_string (h);
956 /* Create a label. */
957 vl = xmalloc (sizeof *vl);
959 vl->s = xstrdup (label);
962 /* Assign the value_label's to each variable. */
963 for (j = 0; j < nv; j++)
965 struct variable *var = v[j];
966 struct value_label *old;
968 /* Create AVL tree if necessary. */
970 var->val_lab = avl_create (NULL, val_lab_cmp,
971 (void *) (var->width));
973 old = avl_replace (var->val_lab, vl);
977 if (var->type == NUMERIC)
978 lose ((h, _("Duplicate label for value %g for variable %s."),
979 vl->v.f, var->name));
981 lose ((h, _("Duplicate label for value `%.*s' for variable %s."),
982 var->width, vl->v.s, var->name));
984 free_value_label (old);
995 /* Reads one case from portable file H into the value array PERM
996 according to the instructions given in associated dictionary DICT,
997 which must have the get.fv elements appropriately set. Returns
998 nonzero only if successful. */
1000 pfm_read_case (struct file_handle *h, union value *perm, struct dictionary *dict)
1002 struct pfm_fhuser_ext *ext = h->ext;
1004 union value *temp, *tp;
1007 /* Check for end of file. */
1008 if (ext->cc == 99 /* Z */)
1011 /* The first concern is to obtain a full case relative to the data
1012 file. (Cases in the data file have no particular relationship to
1013 cases in the active file.) */
1014 tp = temp = local_alloc (sizeof *tp * ext->case_size); /* Scratch array holding one case in file order. */
1015 for (tp = temp, i = 0; i < ext->nvars; i++)
1016 if (ext->vars[i] == 0) /* A recorded width of 0 marks a numeric variable. */
1018 tp->f = read_float (h);
1019 if (tp->f == second_lowest_value)
1020 goto unexpected_eof;
1025 char *s = read_string (h);
1027 goto unexpected_eof;
1030 st_bare_pad_copy (tp->s, s, ext->vars[i]);
1031 tp += DIV_RND_UP (ext->vars[i], MAX_SHORT_STRING); /* Step past the string: its width divided by MAX_SHORT_STRING, rounded up, in value slots. */
1034 /* Translate a case in data file format to a case in active file
1036 for (i = 0; i < dict->nvar; i++)
1038 struct variable *v = dict->var[i];
1040 if (v->get.fv == -1)
1043 if (v->type == NUMERIC)
1044 perm[v->fv].f = temp[v->get.fv].f;
1046 memcpy (&perm[v->fv].s, &temp[v->get.fv], v->width);
1053 lose ((h, _("End of file midway through case.")));
1060 static struct fh_ext_class pfm_r_class =
1063 N_("reading as a portable file"),