1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 /* AIX requires this to be the first thing in the file. */
23 #define alloca __builtin_alloca
31 #ifndef alloca /* predefined by HP cc +Olibcalls */
47 #include "file-handle.h"
54 #include "value-labels.h"
57 #include "debug-print.h"
59 /* pfm's file_handle extension. */
62 FILE *file; /* Actual file. */
64 struct dictionary *dict; /* File's dictionary. */
65 int weight_index; /* 0-based index of weight variable, or -1. */
67 unsigned char *trans; /* 256-byte character set translation table. */
69 int nvars; /* Number of variables. */
70 int *vars; /* Variable widths, 0 for numeric. */
71 int case_size; /* Number of `value's per case. */
73 unsigned char buf[83]; /* Input buffer. */
74 unsigned char *bp; /* Buffer pointer. */
75 int cc; /* Current character. */
78 static struct fh_ext_class pfm_r_class;
81 corrupt_msg (struct file_handle *h, const char *format,...)
82 __attribute__ ((format (printf, 2, 3)));
84 /* Displays a corruption error. */
86 corrupt_msg (struct file_handle *h, const char *format, ...)
88 struct pfm_fhuser_ext *ext = h->ext;
94 va_start (args, format);
95 vsnprintf (buf, 1024, format, args);
104 getl_location (&e.where.filename, &e.where.line_number);
105 e.title = title = local_alloc (strlen (h->fn) + 80);
106 sprintf (title, _("portable file %s corrupt at offset %ld: "),
107 h->fn, ftell (ext->file) - (82 - (long) (ext->bp - ext->buf)));
118 /* Closes a portable file after we're done with it. */
120 pfm_close (struct file_handle * h)
122 struct pfm_fhuser_ext *ext = h->ext;
124 if (EOF == fclose (ext->file))
125 msg (ME, _("%s: Closing portable file: %s."), h->fn, strerror (errno));
131 /* Displays the message X with corrupt_msg, then jumps to the lossage
141 /* Read an 80-character line into handle H's buffer. Return
144 fill_buf (struct file_handle *h)
146 struct pfm_fhuser_ext *ext = h->ext;
148 if (80 != fread (ext->buf, 1, 80, ext->file))
149 lose ((h, _("Unexpected end of file.")));
151 /* PORTME: line ends. */
155 c = getc (ext->file);
156 if (c != '\n' && c != '\r')
157 lose ((h, _("Bad line end.")));
159 c = getc (ext->file);
160 if (c != '\n' && c != '\r')
161 ungetc (c, ext->file);
168 for (i = 0; i < 80; i++)
169 ext->buf[i] = ext->trans[ext->buf[i]];
180 /* Read a single character into ext->cc. Return success. */
182 read_char (struct file_handle *h)
184 struct pfm_fhuser_ext *ext = h->ext;
186 if (ext->bp >= &ext->buf[80] && !fill_buf (h))
188 ext->cc = *ext->bp++;
192 /* Advance a single character. */
193 #define advance() if (!read_char (h)) goto lossage
195 /* Skip a single character if present, and return whether it was
198 skip_char (struct file_handle *h, int c)
200 struct pfm_fhuser_ext *ext = h->ext;
211 /* Skip a single character if present, and return whether it was
213 #define match(C) skip_char (h, C)
215 static int read_header (struct file_handle *h);
216 static int read_version_data (struct file_handle *h, struct pfm_read_info *inf);
217 static int read_variables (struct file_handle *h);
218 static int read_value_label (struct file_handle *h);
219 void dump_dictionary (struct dictionary *dict);
221 /* Reads the dictionary from file with handle H, and returns it in a
222 dictionary structure. This dictionary may be modified in order to
223 rename, reorder, and delete variables, etc. */
225 pfm_read_dictionary (struct file_handle *h, struct pfm_read_info *inf)
227 /* The file handle extension record. */
228 struct pfm_fhuser_ext *ext;
230 /* Check whether the file is already open. */
231 if (h->class == &pfm_r_class)
236 else if (h->class != NULL)
238 msg (ME, _("Cannot read file %s as portable file: already opened "
240 fh_handle_name (h), h->class->name);
244 msg (VM (1), _("%s: Opening portable-file handle %s for reading."),
245 fh_handle_filename (h), fh_handle_name (h));
247 /* Open the physical disk file. */
248 ext = xmalloc (sizeof (struct pfm_fhuser_ext));
249 ext->file = fopen (h->norm_fn, "rb");
250 if (ext->file == NULL)
252 msg (ME, _("An error occurred while opening \"%s\" for reading "
253 "as a portable file: %s."), h->fn, strerror (errno));
259 /* Initialize the pfm_fhuser_ext structure. */
260 h->class = &pfm_r_class;
268 /* Read the header. */
269 if (!read_header (h))
272 /* Read version, date info, product identification. */
273 if (!read_version_data (h, inf))
276 /* Read variables. */
277 if (!read_variables (h))
281 while (match (77 /* D */))
282 if (!read_value_label (h))
285 if (!match (79 /* F */))
286 lose ((h, _("Data record expected.")));
288 msg (VM (2), _("Read portable-file dictionary successfully."));
291 dump_dictionary (ext->dict);
296 /* Come here on unsuccessful completion. */
297 msg (VM (1), _("Error reading portable-file dictionary."));
300 if (ext && ext->dict)
301 dict_destroy (ext->dict);
308 /* Read a floating point value and return its value, or
309 second_lowest_value on error. */
311 read_float (struct file_handle *h)
313 struct pfm_fhuser_ext *ext = h->ext;
320 /* Skip leading spaces. */
321 while (match (126 /* space */))
324 if (match (137 /* * */))
326 advance (); /* Probably a dot (.) but doesn't appear to matter. */
329 else if (match (141 /* - */))
334 if (ext->cc >= 64 /* 0 */ && ext->cc <= 93 /* T */)
338 /* Make sure that multiplication by 30 will not overflow. */
339 if (num > DBL_MAX * (1. / 30.))
340 /* The value of the digit doesn't matter, since we have already
341 gotten as many digits as can be represented in a `double'.
342 This doesn't necessarily mean the result will overflow.
343 The exponent may reduce it to within range.
345 We just need to record that there was another
346 digit so that we can multiply by 30 later. */
349 num = (num * 30.0) + (ext->cc - 64);
351 /* Keep track of the number of digits after the decimal point.
352 If we just divided by 30 here, we would lose precision. */
356 else if (!got_dot && ext->cc == 127 /* . */)
357 /* Record that we have found the decimal point. */
360 /* Any other character terminates the number. */
367 lose ((h, "Number expected."));
369 if (ext->cc == 130 /* + */ || ext->cc == 141 /* - */)
371 /* Get the exponent. */
373 int neg_exp = ext->cc == 141 /* - */;
379 if (ext->cc < 64 /* 0 */ || ext->cc > 93 /* T */)
382 if (exp > LONG_MAX / 30)
384 exp = exp * 30 + (ext->cc - 64);
387 /* We don't check whether there were actually any digits, but we
394 if (!match (142 /* / */))
395 lose ((h, _("Missing numeric terminator.")));
397 /* Multiply NUM by 30 to the EXPONENT power, checking for overflow. */
400 num *= pow (30.0, (double) exponent);
401 else if (exponent > 0)
403 if (num > DBL_MAX * pow (30.0, (double) -exponent))
405 num *= pow (30.0, (double) exponent);
415 return -DBL_MAX / 10.;
420 return second_lowest_value;
423 /* Read an integer and return its value, or NOT_INT on failure. */
425 read_int (struct file_handle *h)
427 double f = read_float (h);
429 if (f == second_lowest_value)
431 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
432 lose ((h, _("Bad integer format.")));
439 /* Reads a string and returns its value in a static buffer, or NULL on
440 failure. The buffer can be deallocated by calling with a NULL
442 static unsigned char *
443 read_string (struct file_handle *h)
445 struct pfm_fhuser_ext *ext = h->ext;
455 else if (buf == NULL)
461 if (n < 0 || n > 255)
462 lose ((h, _("Bad string length %d."), n));
467 for (i = 0; i < n; i++)
481 /* Reads the 464-byte file header. */
483 read_header (struct file_handle *h)
485 struct pfm_fhuser_ext *ext = h->ext;
487 /* For now at least, just ignore the vanity splash strings. */
491 for (i = 0; i < 200; i++)
496 unsigned char src[256];
500 for (i = 0; i < 256; i++)
502 src[i] = (unsigned char) ext->cc;
506 for (i = 0; i < 256; i++)
509 /* 0 is used to mark untranslatable characters, so we have to mark
511 trans_temp[src[64]] = 64;
512 for (i = 0; i < 256; i++)
513 if (trans_temp[src[i]] == -1)
514 trans_temp[src[i]] = i;
516 ext->trans = xmalloc (256);
517 for (i = 0; i < 256; i++)
518 ext->trans[i] = trans_temp[i] == -1 ? 0 : trans_temp[i];
520 /* Translate the input buffer. */
521 for (i = 0; i < 80; i++)
522 ext->buf[i] = ext->trans[ext->buf[i]];
523 ext->cc = ext->trans[ext->cc];
527 unsigned char sig[8] = {92, 89, 92, 92, 89, 88, 91, 93};
530 for (i = 0; i < 8; i++)
532 lose ((h, "Missing SPSSPORT signature."));
541 /* Reads the version and date info record, as well as product and
542 subproduct identification records if present. */
544 read_version_data (struct file_handle *h, struct pfm_read_info *inf)
546 struct pfm_fhuser_ext *ext = h->ext;
549 if (!match (74 /* A */))
550 lose ((h, "Unrecognized version code %d.", ext->cc));
554 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
555 char *date = read_string (h);
560 if (strlen (date) != 8)
561 lose ((h, _("Bad date string length %d."), strlen (date)));
562 for (i = 0; i < 8; i++)
564 if (date[i] < 64 /* 0 */ || date[i] > 73 /* 9 */)
565 lose ((h, _("Bad character in date.")));
567 inf->creation_date[map[i]] = date[i] - 64 /* 0 */ + '0';
571 inf->creation_date[2] = inf->creation_date[5] = ' ';
572 inf->creation_date[10] = 0;
578 static const int map[] = {0, 1, 3, 4, 6, 7};
579 char *time = read_string (h);
584 if (strlen (time) != 6)
585 lose ((h, _("Bad time string length %d."), strlen (time)));
586 for (i = 0; i < 6; i++)
588 if (time[i] < 64 /* 0 */ || time[i] > 73 /* 9 */)
589 lose ((h, _("Bad character in time.")));
591 inf->creation_time[map[i]] = time[i] - 64 /* 0 */ + '0';
595 inf->creation_time[2] = inf->creation_time[5] = ' ';
596 inf->creation_time[8] = 0;
601 if (match (65 /* 1 */))
605 product = read_string (h);
609 strncpy (inf->product, product, 61);
615 if (match (67 /* 3 */))
619 subproduct = read_string (h);
620 if (subproduct == NULL)
623 strncpy (inf->subproduct, subproduct, 61);
626 inf->subproduct[0] = 0;
634 convert_format (struct file_handle *h, int fmt[3], struct fmt_spec *v,
638 || (size_t) fmt[0] >= sizeof translate_fmt / sizeof *translate_fmt)
639 lose ((h, _("%s: Bad format specifier byte %d."), vv->name, fmt[0]));
641 v->type = translate_fmt[fmt[0]];
645 /* FIXME? Should verify the resulting specifier more thoroughly. */
648 lose ((h, _("%s: Bad format specifier byte (%d)."), vv->name, fmt[0]));
649 if ((vv->type == ALPHA) ^ ((formats[v->type].cat & FCAT_STRING) != 0))
650 lose ((h, _("%s variable %s has %s format specifier %s."),
651 vv->type == ALPHA ? _("String") : _("Numeric"),
653 formats[v->type].cat & FCAT_STRING ? _("string") : _("numeric"),
654 formats[v->type].name));
661 /* Translation table from SPSS character code to this computer's
662 native character code (which is probably ASCII). */
663 static const unsigned char spss2ascii[256] =
666 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
667 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
671 /* Translate string S into ASCII. */
676 *s = spss2ascii[(unsigned char) *s];
679 static int parse_value (struct file_handle *, union value *, struct variable *);
681 /* Read information on all the variables. */
683 read_variables (struct file_handle *h)
685 struct pfm_fhuser_ext *ext = h->ext;
686 char *weight_name = NULL;
689 if (!match (68 /* 4 */))
690 lose ((h, _("Expected variable count record.")));
692 ext->nvars = read_int (h);
693 if (ext->nvars <= 0 || ext->nvars == NOT_INT)
694 lose ((h, _("Invalid number of variables %d."), ext->nvars));
695 ext->vars = xmalloc (sizeof *ext->vars * ext->nvars);
697 /* Purpose of this value is unknown. It is typically 161. */
699 int x = read_int (h);
704 corrupt_msg (h, _("Unexpected flag value %d."), x);
707 ext->dict = dict_create ();
709 if (match (70 /* 6 */))
711 weight_name = read_string (h);
715 asciify (weight_name);
716 if (strlen (weight_name) > 8)
718 corrupt_msg (h, _("Weight variable name (%s) truncated."),
720 weight_name[8] = '\0';
724 for (i = 0; i < ext->nvars; i++)
732 if (!match (71 /* 7 */))
733 lose ((h, _("Expected variable record.")));
735 width = read_int (h);
736 if (width == NOT_INT)
739 lose ((h, _("Invalid variable width %d."), width));
740 ext->vars[i] = width;
742 name = read_string (h);
745 for (j = 0; j < 6; j++)
747 fmt[j] = read_int (h);
748 if (fmt[j] == NOT_INT)
752 /* Verify first character of variable name.
754 Weirdly enough, there is no # character in the SPSS portable
755 character set, so we can't check for it. */
756 if (strlen (name) > 8)
757 lose ((h, _("position %d: Variable name has %u characters."),
759 if ((name[0] < 74 /* A */ || name[0] > 125 /* Z */)
760 && name[0] != 152 /* @ */)
761 lose ((h, _("position %d: Variable name begins with invalid "
763 if (name[0] >= 100 /* a */ && name[0] <= 125 /* z */)
765 corrupt_msg (h, _("position %d: Variable name begins with "
766 "lowercase letter %c."),
767 i, name[0] - 100 + 'a');
768 name[0] -= 26 /* a - A */;
771 /* Verify remaining characters of variable name. */
772 for (j = 1; j < (int) strlen (name); j++)
776 if (c >= 100 /* a */ && c <= 125 /* z */)
778 corrupt_msg (h, _("position %d: Variable name character %d "
779 "is lowercase letter %c."),
780 i, j + 1, c - 100 + 'a');
781 name[j] -= 26 /* z - Z */;
783 else if ((c >= 64 /* 0 */ && c <= 99 /* Z */)
784 || c == 127 /* . */ || c == 152 /* @ */
785 || c == 136 /* $ */ || c == 146 /* _ */)
788 lose ((h, _("position %d: character `\\%03o' is not "
789 "valid in a variable name."), i, c));
793 if (width < 0 || width > 255)
794 lose ((h, "Bad width %d for variable %s.", width, name));
796 v = dict_create_var (ext->dict, name, width);
799 lose ((h, _("Duplicate variable name %s."), name));
800 if (!convert_format (h, &fmt[0], &v->print, v))
802 if (!convert_format (h, &fmt[3], &v->write, v))
805 /* Range missing values. */
806 if (match (75 /* B */))
808 v->miss_type = MISSING_RANGE;
809 if (!parse_value (h, &v->missing[0], v)
810 || !parse_value (h, &v->missing[1], v))
813 else if (match (74 /* A */))
815 v->miss_type = MISSING_HIGH;
816 if (!parse_value (h, &v->missing[0], v))
819 else if (match (73 /* 9 */))
821 v->miss_type = MISSING_LOW;
822 if (!parse_value (h, &v->missing[0], v))
826 /* Single missing values. */
827 while (match (72 /* 8 */))
829 static const int map_next[MISSING_COUNT] =
831 MISSING_1, MISSING_2, MISSING_3, -1,
832 MISSING_RANGE_1, MISSING_LOW_1, MISSING_HIGH_1,
836 static const int map_ofs[MISSING_COUNT] =
838 -1, 0, 1, 2, -1, -1, -1, 2, 1, 1,
841 v->miss_type = map_next[v->miss_type];
842 if (v->miss_type == -1)
843 lose ((h, _("Bad missing values for %s."), v->name));
845 assert (map_ofs[v->miss_type] != -1);
846 if (!parse_value (h, &v->missing[map_ofs[v->miss_type]], v))
850 if (match (76 /* C */))
852 char *label = read_string (h);
857 v->label = xstrdup (label);
862 if (weight_name != NULL)
864 struct variable *weight_var = dict_lookup_var (ext->dict, weight_name);
865 if (weight_var == NULL)
866 lose ((h, _("Weighting variable %s not present in dictionary."),
870 dict_set_weight (ext->dict, weight_var);
880 /* Parse a value for variable VV into value V. Returns success. */
882 parse_value (struct file_handle *h, union value *v, struct variable *vv)
884 if (vv->type == ALPHA)
886 char *mv = read_string (h);
892 strncpy (v->s, mv, 8);
893 for (j = 0; j < 8; j++)
895 v->s[j] = spss2ascii[v->s[j]];
897 /* Value labels are always padded with spaces. */
902 v->f = read_float (h);
903 if (v->f == second_lowest_value)
910 /* Parse a value label record and return success. */
912 read_value_label (struct file_handle *h)
914 struct pfm_fhuser_ext *ext = h->ext;
929 v = xmalloc (sizeof *v * nv);
930 for (i = 0; i < nv; i++)
932 char *name = read_string (h);
937 v[i] = dict_lookup_var (ext->dict, name);
939 lose ((h, _("Unknown variable %s while parsing value labels."), name));
941 if (v[0]->width != v[i]->width)
942 lose ((h, _("Cannot assign value labels to %s and %s, which "
943 "have different variable types or widths."),
944 v[0]->name, v[i]->name));
947 n_labels = read_int (h);
948 if (n_labels == NOT_INT)
951 for (i = 0; i < n_labels; i++)
958 if (!parse_value (h, &val, v[0]))
961 label = read_string (h);
966 /* Assign the value_label's to each variable. */
967 for (j = 0; j < nv; j++)
969 struct variable *var = v[j];
971 if (!val_labs_replace (var->val_labs, val, label))
974 if (var->type == NUMERIC)
975 lose ((h, _("Duplicate label for value %g for variable %s."),
978 lose ((h, _("Duplicate label for value `%.*s' for variable %s."),
979 var->width, val.s, var->name));
990 /* Reads one case from portable file H into the value array PERM
991 according to the instructions given in associated dictionary DICT,
992 which must have the get.fv elements appropriately set. Returns
993 nonzero only if successful. */
995 pfm_read_case (struct file_handle *h, union value *perm, struct dictionary *dict)
997 struct pfm_fhuser_ext *ext = h->ext;
999 union value *temp, *tp;
1002 /* Check for end of file. */
1003 if (ext->cc == 99 /* Z */)
1006 /* The first concern is to obtain a full case relative to the data
1007 file. (Cases in the data file have no particular relationship to
1008 cases in the active file.) */
1009 tp = temp = local_alloc (sizeof *tp * ext->case_size);
1010 for (tp = temp, i = 0; i < ext->nvars; i++)
1011 if (ext->vars[i] == 0)
1013 tp->f = read_float (h);
1014 if (tp->f == second_lowest_value)
1015 goto unexpected_eof;
1020 char *s = read_string (h);
1022 goto unexpected_eof;
1025 st_bare_pad_copy (tp->s, s, ext->vars[i]);
1026 tp += DIV_RND_UP (ext->vars[i], MAX_SHORT_STRING);
1029 /* Translate a case in data file format to a case in active file
1031 for (i = 0; i < dict_get_var_cnt (dict); i++)
1033 struct variable *v = dict_get_var (dict, i);
1035 if (v->get.fv == -1)
1038 if (v->type == NUMERIC)
1039 perm[v->fv].f = temp[v->get.fv].f;
1041 memcpy (&perm[v->fv].s, &temp[v->get.fv], v->width);
1048 lose ((h, _("End of file midway through case.")));
1055 static struct fh_ext_class pfm_r_class =
1058 N_("reading as a portable file"),