1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
30 #include "file-handle.h"
37 #include "value-labels.h"
40 #include "debug-print.h"
42 /* pfm's file_handle extension. */
45 FILE *file; /* Actual file. */
47 struct dictionary *dict; /* File's dictionary. */
48 int weight_index; /* 0-based index of weight variable, or -1. */
50 unsigned char *trans; /* 256-byte character set translation table. */
52 int nvars; /* Number of variables. */
53 int *vars; /* Variable widths, 0 for numeric. */
54 int case_size; /* Number of `value's per case. */
56 unsigned char buf[83]; /* Input buffer. */
57 unsigned char *bp; /* Buffer pointer. */
58 int cc; /* Current character. */
61 static struct fh_ext_class pfm_r_class;
64 corrupt_msg (struct file_handle *h, const char *format,...)
67 /* Displays a corruption error. */
69 corrupt_msg (struct file_handle *h, const char *format, ...)
71 struct pfm_fhuser_ext *ext = h->ext;
77 va_start (args, format);
78 vsnprintf (buf, 1024, format, args);
88 getl_location (&e.where.filename, &e.where.line_number);
89 filename = handle_get_filename (h);
90 e.title = title = local_alloc (strlen (filename) + 80);
91 sprintf (title, _("portable file %s corrupt at offset %ld: "),
92 filename, ftell (ext->file) - (82 - (long) (ext->bp - ext->buf)));
103 /* Closes a portable file after we're done with it. */
105 pfm_close (struct file_handle *h)
107 struct pfm_fhuser_ext *ext = h->ext;
109 if (EOF == fclose (ext->file))
110 msg (ME, _("%s: Closing portable file: %s."),
111 handle_get_filename (h), strerror (errno));
117 /* Displays the message X with corrupt_msg, then jumps to the lossage
127 /* Read an 80-character line into handle H's buffer. Return
130 fill_buf (struct file_handle *h)
132 struct pfm_fhuser_ext *ext = h->ext;
134 if (80 != fread (ext->buf, 1, 80, ext->file))
135 lose ((h, _("Unexpected end of file.")));
137 /* PORTME: line ends. */
141 c = getc (ext->file);
142 if (c != '\n' && c != '\r')
143 lose ((h, _("Bad line end.")));
145 c = getc (ext->file);
146 if (c != '\n' && c != '\r')
147 ungetc (c, ext->file);
154 for (i = 0; i < 80; i++)
155 ext->buf[i] = ext->trans[ext->buf[i]];
166 /* Read a single character into ext->cc. Return success. */
168 read_char (struct file_handle *h)
170 struct pfm_fhuser_ext *ext = h->ext;
172 if (ext->bp >= &ext->buf[80] && !fill_buf (h))
174 ext->cc = *ext->bp++;
178 /* Advance a single character by calling read_char() on `h'; on
   failure, jumps to the `lossage' label.  The expansion therefore
   needs a file handle named `h' and a label named `lossage' in the
   enclosing function.
   NOTE(review): this expands to a bare `if' statement, not a
   do { ... } while (0) block, so an `else' written directly after
   `advance ();' would bind to the macro's `if'. */
179 #define advance() if (!read_char (h)) goto lossage
181 /* Skip a single character if present, and return whether it was
184 skip_char (struct file_handle *h, int c)
186 struct pfm_fhuser_ext *ext = h->ext;
197 /* Skip a single character if present, and return whether it was
199 #define match(C) skip_char (h, C)
201 static int read_header (struct file_handle *h);
202 static int read_version_data (struct file_handle *h, struct pfm_read_info *inf);
203 static int read_variables (struct file_handle *h);
204 static int read_value_label (struct file_handle *h);
205 void dump_dictionary (struct dictionary *dict);
207 /* Reads the dictionary from file with handle H, and returns it in a
208 dictionary structure. This dictionary may be modified in order to
209 rename, reorder, and delete variables, etc. */
211 pfm_read_dictionary (struct file_handle *h, struct pfm_read_info *inf)
213 /* The file handle extension record. */
214 struct pfm_fhuser_ext *ext;
216 /* Check whether the file is already open. */
217 if (h->class == &pfm_r_class)
222 else if (h->class != NULL)
224 msg (ME, _("Cannot read file %s as portable file: already opened "
226 handle_get_name (h), h->class->name);
230 msg (VM (1), _("%s: Opening portable-file handle %s for reading."),
231 handle_get_filename (h), handle_get_name (h));
233 /* Open the physical disk file. */
234 ext = xmalloc (sizeof (struct pfm_fhuser_ext));
235 ext->file = fopen (handle_get_filename (h), "rb");
236 if (ext->file == NULL)
238 msg (ME, _("An error occurred while opening \"%s\" for reading "
239 "as a portable file: %s."),
240 handle_get_filename (h), strerror (errno));
246 /* Initialize the pfm_fhuser_ext structure. */
247 h->class = &pfm_r_class;
255 /* Read the header. */
256 if (!read_header (h))
259 /* Read version, date info, product identification. */
260 if (!read_version_data (h, inf))
263 /* Read variables. */
264 if (!read_variables (h))
268 while (match (77 /* D */))
269 if (!read_value_label (h))
272 if (!match (79 /* F */))
273 lose ((h, _("Data record expected.")));
275 msg (VM (2), _("Read portable-file dictionary successfully."));
278 dump_dictionary (ext->dict);
283 /* Come here on unsuccessful completion. */
284 msg (VM (1), _("Error reading portable-file dictionary."));
287 if (ext && ext->dict)
288 dict_destroy (ext->dict);
295 /* Read a floating point value and return its value, or
296 second_lowest_value on error. */
298 read_float (struct file_handle *h)
300 struct pfm_fhuser_ext *ext = h->ext;
307 /* Skip leading spaces. */
308 while (match (126 /* space */))
311 if (match (137 /* * */))
313 advance (); /* Probably a dot (.) but doesn't appear to matter. */
316 else if (match (141 /* - */))
321 if (ext->cc >= 64 /* 0 */ && ext->cc <= 93 /* T */)
325 /* Make sure that multiplication by 30 will not overflow. */
326 if (num > DBL_MAX * (1. / 30.))
327 /* The value of the digit doesn't matter, since we have already
328 gotten as many digits as can be represented in a `double'.
329 This doesn't necessarily mean the result will overflow.
330 The exponent may reduce it to within range.
332 We just need to record that there was another
333 digit so that we can multiply by 30 later. */
336 num = (num * 30.0) + (ext->cc - 64);
338 /* Keep track of the number of digits after the decimal point.
339 If we just divided by 30 here, we would lose precision. */
343 else if (!got_dot && ext->cc == 127 /* . */)
344 /* Record that we have found the decimal point. */
347 /* Any other character terminates the number. */
354 lose ((h, "Number expected."));
356 if (ext->cc == 130 /* + */ || ext->cc == 141 /* - */)
358 /* Get the exponent. */
360 int neg_exp = ext->cc == 141 /* - */;
366 if (ext->cc < 64 /* 0 */ || ext->cc > 93 /* T */)
369 if (exp > LONG_MAX / 30)
371 exp = exp * 30 + (ext->cc - 64);
374 /* We don't check whether there were actually any digits, but we
381 if (!match (142 /* / */))
382 lose ((h, _("Missing numeric terminator.")));
384 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
387 num *= pow (30.0, (double) exponent);
388 else if (exponent > 0)
390 if (num > DBL_MAX * pow (30.0, (double) -exponent))
392 num *= pow (30.0, (double) exponent);
402 return -DBL_MAX / 10.;
407 return second_lowest_value;
410 /* Read an integer and return its value, or NOT_INT on failure. */
412 read_int (struct file_handle *h)
414 double f = read_float (h);
416 if (f == second_lowest_value)
418 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
419 lose ((h, _("Bad integer format.")));
426 /* Reads a string and returns its value in a static buffer, or NULL on
427 failure. The buffer can be deallocated by calling with a NULL
429 static unsigned char *
430 read_string (struct file_handle *h)
432 struct pfm_fhuser_ext *ext = h->ext;
442 else if (buf == NULL)
448 if (n < 0 || n > 255)
449 lose ((h, _("Bad string length %d."), n));
454 for (i = 0; i < n; i++)
468 /* Reads the 464-byte file header. */
470 read_header (struct file_handle *h)
472 struct pfm_fhuser_ext *ext = h->ext;
474 /* For now at least, just ignore the vanity splash strings. */
478 for (i = 0; i < 200; i++)
483 unsigned char src[256];
487 for (i = 0; i < 256; i++)
489 src[i] = (unsigned char) ext->cc;
493 for (i = 0; i < 256; i++)
496 /* 0 is used to mark untranslatable characters, so we have to mark
498 trans_temp[src[64]] = 64;
499 for (i = 0; i < 256; i++)
500 if (trans_temp[src[i]] == -1)
501 trans_temp[src[i]] = i;
503 ext->trans = xmalloc (256);
504 for (i = 0; i < 256; i++)
505 ext->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
507 /* Translate the input buffer. */
508 for (i = 0; i < 80; i++)
509 ext->buf[i] = ext->trans[ext->buf[i]];
510 ext->cc = ext->trans[ext->cc];
514 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
517 for (i = 0; i < 8; i++)
519 lose ((h, "Missing SPSSPORT signature."));
528 /* Reads the version and date info record, as well as product and
529 subproduct identification records if present. */
531 read_version_data (struct file_handle *h, struct pfm_read_info *inf)
533 struct pfm_fhuser_ext *ext = h->ext;
536 if (!match (74 /* A */))
537 lose ((h, "Unrecognized version code %d.", ext->cc));
541 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
542 char *date = read_string (h);
547 if (strlen (date) != 8)
548 lose ((h, _("Bad date string length %d."), strlen (date)));
549 for (i = 0; i < 8; i++)
551 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
552 lose ((h, _("Bad character in date.")));
554 inf->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
558 inf->creation_date[2] = inf->creation_date[5] = ' ';
559 inf->creation_date[10] = 0;
565 static const int map[] = {0, 1, 3, 4, 6, 7};
566 char *time = read_string (h);
571 if (strlen (time) != 6)
572 lose ((h, _("Bad time string length %d."), strlen (time)));
573 for (i = 0; i < 6; i++)
575 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
576 lose ((h, _("Bad character in time.")));
578 inf->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
582 inf->creation_time[2] = inf->creation_time[5] = ' ';
583 inf->creation_time[8] = 0;
588 if (match (65 /* 1 */))
592 product = read_string (h);
596 strncpy (inf->product, product, 61);
602 if (match (67 /* 3 */))
606 subproduct = read_string (h);
607 if (subproduct == NULL)
610 strncpy (inf->subproduct, subproduct, 61);
613 inf->subproduct[0] = 0;
621 convert_format (struct file_handle *h, int fmt[3], struct fmt_spec *v,
624 v->type = translate_fmt (fmt[0]);
626 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
630 /* FIXME? Should verify the resulting specifier more thoroughly. */
633 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
634 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
635 lose ((h, _("%s variable %s has %s format specifier %s."),
636 vv->type == ALPHA ? _("String") : _("Numeric"),
638 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
639 formats[v->type].name));
646 /* Translation table from SPSS character code to this computer's
647 native character code (which is probably ASCII). */
648 static const unsigned char spss2ascii[256] =
651 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
652 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
656 /* Translate string S into ASCII. */
661 *s = spss2ascii[(unsigned char) *s];
664 static int parse_value (struct file_handle *, union value *, struct variable *);
666 /* Read information on all the variables. */
668 read_variables (struct file_handle *h)
670 struct pfm_fhuser_ext *ext = h->ext;
671 char *weight_name = NULL;
674 if (!match (68 /* 4 */))
675 lose ((h, _("Expected variable count record.")));
677 ext->nvars = read_int (h);
678 if (ext->nvars <= 0 || ext->nvars == NOT_INT)
679 lose ((h, _("Invalid number of variables %d."), ext->nvars));
680 ext->vars = xmalloc (sizeof *ext->vars * ext->nvars);
682 /* Purpose of this value is unknown. It is typically 161. */
684 int x = read_int (h);
689 corrupt_msg (h, _("Unexpected flag value %d."), x);
692 ext->dict = dict_create ();
694 if (match (70 /* 6 */))
696 weight_name = read_string (h);
700 asciify (weight_name);
701 if (strlen (weight_name) > 8)
703 corrupt_msg (h, _("Weight variable name (%s) truncated."),
705 weight_name[8] = '\0';
709 for (i = 0; i < ext->nvars; i++)
717 if (!match (71 /* 7 */))
718 lose ((h, _("Expected variable record.")));
720 width = read_int (h);
721 if (width == NOT_INT)
724 lose ((h, _("Invalid variable width %d."), width));
725 ext->vars[i] = width;
727 name = read_string (h);
730 for (j = 0; j < 6; j++)
732 fmt[j] = read_int (h);
733 if (fmt[j] == NOT_INT)
737 /* Verify first character of variable name.
739 Weirdly enough, there is no # character in the SPSS portable
740 character set, so we can't check for it. */
741 if (strlen (name) > 8)
742 lose ((h, _("position %d: Variable name has %u characters."),
744 if ((name[0] < 74 /* A */ || name[0] > 125 /* Z */)
745 && name[0] != 152 /* @ */)
746 lose ((h, _("position %d: Variable name begins with invalid "
748 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
750 corrupt_msg (h, _("position %d: Variable name begins with "
751 "lowercase letter %c."),
752 i, name[0] - 100 + 'a');
753 name[0] -= 26 /* a - A */;
756 /* Verify remaining characters of variable name. */
757 for (j = 1; j < (int) strlen (name); j++)
761 if (c >= 100 /* a */ && c <= 125 /* z */)
763 corrupt_msg (h, _("position %d: Variable name character %d "
764 "is lowercase letter %c."),
765 i, j + 1, c - 100 + 'a');
766 name[j] -= 26 /* z - Z */;
768 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
769 || c == 127 /* . */ || c == 152 /* @ */
770 || c == 136 /* $ */ || c == 146 /* _ */)
773 lose ((h, _("position %d: character `\\%03o' is not "
774 "valid in a variable name."), i, c));
778 if (width < 0 || width > 255)
779 lose ((h, "Bad width %d for variable %s.", width, name));
781 v = dict_create_var (ext->dict, name, width);
784 lose ((h, _("Duplicate variable name %s."), name));
785 if (!convert_format (h, &fmt[0], &v->print, v))
787 if (!convert_format (h, &fmt[3], &v->write, v))
790 /* Range missing values. */
791 if (match (75 /* B */))
793 v->miss_type = MISSING_RANGE;
794 if (!parse_value (h, &v->missing[0], v)
795 || !parse_value (h, &v->missing[1], v))
798 else if (match (74 /* A */))
800 v->miss_type = MISSING_HIGH;
801 if (!parse_value (h, &v->missing[0], v))
804 else if (match (73 /* 9 */))
806 v->miss_type = MISSING_LOW;
807 if (!parse_value (h, &v->missing[0], v))
811 /* Single missing values. */
812 while (match (72 /* 8 */))
814 static const int map_next[MISSING_COUNT] =
816 MISSING_1, MISSING_2, MISSING_3, -1,
817 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
821 static const int map_ofs[MISSING_COUNT] =
823 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
826 v->miss_type = map_next[v->miss_type];
827 if (v->miss_type == -1)
828 lose ((h, _("Bad missing values for %s."), v->name));
830 assert (map_ofs[v->miss_type] != -1);
831 if (!parse_value (h, &v->missing[map_ofs[v->miss_type]], v))
835 if (match (76 /* C */))
837 char *label = read_string (h);
842 v->label = xstrdup (label);
847 if (weight_name != NULL)
849 struct variable *weight_var = dict_lookup_var (ext->dict, weight_name);
850 if (weight_var == NULL)
851 lose ((h, _("Weighting variable %s not present in dictionary."),
855 dict_set_weight (ext->dict, weight_var);
865 /* Parse a value for variable VV into value V. Returns success. */
867 parse_value (struct file_handle *h, union value *v, struct variable *vv)
869 if (vv->type == ALPHA)
871 char *mv = read_string (h);
877 strncpy (v->s, mv, 8);
878 for (j = 0; j < 8; j++)
880 v->s[j] = spss2ascii[v->s[j]];
882 /* Value labels are always padded with spaces. */
887 v->f = read_float (h);
888 if (v->f == second_lowest_value)
895 /* Parse a value label record and return success. */
897 read_value_label (struct file_handle *h)
899 struct pfm_fhuser_ext *ext = h->ext;
914 v = xmalloc (sizeof *v * nv);
915 for (i = 0; i < nv; i++)
917 char *name = read_string (h);
922 v[i] = dict_lookup_var (ext->dict, name);
924 lose ((h, _("Unknown variable %s while parsing value labels."), name));
926 if (v[0]->width != v[i]->width)
927 lose ((h, _("Cannot assign value labels to %s and %s, which "
928 "have different variable types or widths."),
929 v[0]->name, v[i]->name));
932 n_labels = read_int (h);
933 if (n_labels == NOT_INT)
936 for (i = 0; i < n_labels; i++)
943 if (!parse_value (h, &val, v[0]))
946 label = read_string (h);
951 /* Assign the value_label's to each variable. */
952 for (j = 0; j < nv; j++)
954 struct variable *var = v[j];
956 if (!val_labs_replace (var->val_labs, val, label))
959 if (var->type == NUMERIC)
960 lose ((h, _("Duplicate label for value %g for variable %s."),
963 lose ((h, _("Duplicate label for value `%.*s' for variable %s."),
964 var->width, val.s, var->name));
975 /* Reads one case from portable file H into the value array PERM
976 according to the instructions given in associated dictionary DICT,
977 which must have the get.fv elements appropriately set. Returns
978 nonzero only if successful. */
980 pfm_read_case (struct file_handle *h, union value *perm, struct dictionary *dict)
982 struct pfm_fhuser_ext *ext = h->ext;
984 union value *temp, *tp;
987 /* Check for end of file. */
988 if (ext->cc == 99 /* Z */)
991 /* The first concern is to obtain a full case relative to the data
992 file. (Cases in the data file have no particular relationship to
993 cases in the active file.) */
994 tp = temp = local_alloc (sizeof *tp * ext->case_size);
995 for (tp = temp, i = 0; i < ext->nvars; i++)
996 if (ext->vars[i] == 0)
998 tp->f = read_float (h);
999 if (tp->f == second_lowest_value)
1000 goto unexpected_eof;
1005 char *s = read_string (h);
1007 goto unexpected_eof;
1010 st_bare_pad_copy (tp->s, s, ext->vars[i]);
1011 tp += DIV_RND_UP (ext->vars[i], MAX_SHORT_STRING);
1014 /* Translate a case in data file format to a case in active file
1016 for (i = 0; i < dict_get_var_cnt (dict); i++)
1018 struct variable *v = dict_get_var (dict, i);
1020 if (v->get.fv == -1)
1023 if (v->type == NUMERIC)
1024 perm[v->fv].f = temp[v->get.fv].f;
1026 memcpy (&perm[v->fv].s, &temp[v->get.fv], v->width);
1033 lose ((h, _("End of file midway through case.")));
1040 static struct fh_ext_class pfm_r_class =
1043 N_("reading as a portable file"),