1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
4 Code for parsing floating-point numbers adapted from GNU C library.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
23 #include "por-file-reader.h"
24 #include <libpspp/message.h>
32 #include <libpspp/alloc.h>
35 #include <libpspp/compiler.h>
36 #include "dictionary.h"
37 #include "file-handle-def.h"
39 #include <libpspp/hash.h>
40 #include <libpspp/magic.h>
41 #include <libpspp/misc.h>
42 #include <libpspp/pool.h>
43 #include <libpspp/str.h>
44 #include "value-labels.h"
48 #define _(msgid) gettext (msgid)
50 /* portable_to_local[PORTABLE] translates the given portable
51 character into the local character set.

   pfm_detect() consults entries 64 through 255 when it rebuilds the
   file's translation table, and read_header() skips the first 64
   entries of the table stored in the file. */
52 static const char portable_to_local[256] =
55 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
56 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
60 /* Portable file reader.

   The reader is allocated with pool_alloc() in pfm_open_reader() and
   released by destroying its pool in pfm_close_reader(). */
63 struct pool *pool; /* All the portable file state. */
65 jmp_buf bail_out; /* longjmp() target for error handling; error() jumps here. */
67 struct file_handle *fh; /* File handle. */
68 FILE *file; /* File stream, opened with pool_fopen() in "rb" mode. */
69 char cc; /* Current character. */
70 char *trans; /* 256-byte character set translation table; advance() tests it for NULL. */
71 int var_cnt; /* Number of variables. */
72 int weight_index; /* 0-based index of weight variable, or -1 (the initial value). */
73 int *widths; /* Variable widths, 0 for numeric; var_cnt entries. */
74 int value_cnt; /* Number of `value's per case. */
75 bool ok; /* Set false on I/O error. */
/* Forward declaration of error(), defined immediately below. */
79 error (struct pfm_reader *r, const char *msg,...)
83 /* Displays MSG as an error message and aborts reading the
84 portable file via longjmp().

   MSG is a printf-style format string that is expanded with the
   trailing arguments.  The text is prefixed with the file name and
   the current offset in the file, as reported by ftell().  Control
   never returns to the caller: it resumes at the most recent
   setjmp() on R->bail_out.
   NOTE(review): the statement that hands the message to the
   message system is not visible here. */
86 error (struct pfm_reader *r, const char *msg, ...)
92 ds_init_empty (&text);
/* Common prefix: which file, and where in it the problem was found. */
93 ds_put_format (&text, _("portable file %s corrupt at offset %ld: "),
94 fh_get_file_name (r->fh), ftell (r->file));
/* Append the caller's formatted message. */
96 ds_put_vformat (&text, msg, args);
/* General error with no source-file location attached. */
99 m.category = MSG_GENERAL;
100 m.severity = MSG_ERROR;
101 m.where.file_name = NULL;
102 m.where.line_number = 0;
103 m.text = ds_cstr (&text);
/* Unwind to whichever entry point last called setjmp(). */
109 longjmp (r->bail_out, 1);
112 /* Closes portable file reader R, after we're done with it.

   Destroys R's pool.  pfm_open_reader() allocates R itself and its
   file stream from a pool, so presumably R must not be used after
   this call -- confirm that R->pool is that same pool. */
114 pfm_close_reader (struct pfm_reader *r)
117 pool_destroy (r->pool);
120 /* Reads the next character from R's file, skipping carriage
   returns and line feeds, and makes it the current character
   (presumably R->cc; the reader struct has no `cur_char' member).

   Reports "unexpected end of file" through error(), which does not
   return.  A translation table is consulted when one is installed
   (R->trans != NULL); the statements that store and translate the
   character are not visible here. */
122 advance (struct pfm_reader *r)
/* Line terminators are not part of the data; skip over them. */
126 while ((c = getc (r->file)) == '\r' || c == '\n')
129 error (r, _("unexpected end of file"));
131 if (r->trans != NULL)
136 /* Skip a single character if present, and return whether it was skipped. */
139 match (struct pfm_reader *r, int c)
150 static void read_header (struct pfm_reader *);
151 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
152 static void read_variables (struct pfm_reader *, struct dictionary *);
153 static void read_value_label (struct pfm_reader *, struct dictionary *);
154 void dump_dictionary (struct dictionary *);
156 /* Reads the dictionary from the file with handle FH and stores it
157 in *DICT. This dictionary may be modified in order to
158 rename, reorder, and delete variables, etc.

   INFO is passed on to read_version_data().  Parsing errors are
   reported by error(), which longjmp()s back to the setjmp() on
   R->bail_out below.  The code at the end of the function closes the
   reader and destroys *DICT.
   NOTE(review): the return statements and the labels around the
   cleanup code are not visible here -- confirm what callers receive
   on success and on failure. */
160 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
161 struct pfm_read_info *info)
/* Presumably volatile so the values stay well defined after a
   longjmp() back into this function -- confirm. */
163 struct pool *volatile pool = NULL;
164 struct pfm_reader *volatile r = NULL;
166 *dict = dict_create ();
167 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
170 /* Create and initialize reader. */
171 pool = pool_create ();
172 r = pool_alloc (pool, sizeof *r);
/* Everything below may call error(), which jumps back to this point. */
174 if (setjmp (r->bail_out))
/* The stream is opened through the pool (pool_fopen). */
177 r->file = pool_fopen (r->pool, fh_get_file_name (r->fh), "rb");
/* No weight variable until one is found. */
178 r->weight_index = -1;
185 /* Check that file open succeeded, prime reading. */
188 msg (ME, _("An error occurred while opening \"%s\" for reading "
189 "as a portable file: %s."),
190 fh_get_file_name (r->fh), strerror (errno));
194 /* Read header, version, date info, product id, variables. */
196 read_version_data (r, info);
197 read_variables (r, *dict);
199 /* Read value labels. */
/* Each value label record is introduced by a 'D' tag. */
200 while (match (r, 'D'))
201 read_value_label (r, *dict);
203 /* Check that we've made it to the data. */
205 error (r, _("Data record expected."));
/* Cleanup: release the reader and the dictionary created above. */
210 pfm_close_reader (r);
211 dict_destroy (*dict);
/* Returns the value of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are '0'...'9' (values 0-9) followed by 'A'...'T'
   (values 10-29).  A null byte is rejected explicitly: strchr()
   treats the string's terminator as part of the string, so without
   the check a null byte would be reported as a digit with the
   out-of-range value 30.  The translation tables in this file are
   zero-filled before use, so presumably an untranslatable input
   character can reach this function as a null byte. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
226 /* Read a floating point value and return its value.

   The number is written in base 30: optional leading spaces, an
   optional `-', digits as accepted by base_30_value() with at most
   one `.', an optional exponent introduced by `+' or `-' whose
   digits are also base 30, and a terminating `/'.  A leading `*'
   marks the system-missing value (the value returned for it is not
   visible here).  Malformed input is reported through error(), which
   does not return. */
228 read_float (struct pfm_reader *r)
232 bool got_dot = false; /* Seen a decimal point? */
233 bool got_digit = false; /* Seen any digits? */
234 bool negative = false; /* Number is negative? */
236 /* Skip leading spaces. */
237 while (match (r, ' '))
240 /* `*' indicates system-missing. */
243 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
247 negative = match (r, '-');
250 int digit = base_30_value (r->cc);
255 /* Make sure that multiplication by 30 will not overflow. */
256 if (num > DBL_MAX * (1. / 30.))
257 /* The value of the digit doesn't matter, since we have already
258 gotten as many digits as can be represented in a `double'.
259 This doesn't necessarily mean the result will overflow.
260 The exponent may reduce it to within range.
262 We just need to record that there was another
263 digit so that we can multiply by 30 later. */
266 num = (num * 30.0) + digit;
268 /* Keep track of the number of digits after the decimal point.
269 If we just divided by 30 here, we would lose precision. */
273 else if (!got_dot && r->cc == '.')
274 /* Record that we have found the decimal point. */
277 /* Any other character terminates the number. */
283 /* Check that we had some digits. */
/* NOTE(review): unlike most messages in this file, this one is not
   wrapped in _() for translation. */
285 error (r, "Number expected.");
287 /* Get exponent if any. */
288 if (r->cc == '+' || r->cc == '-')
291 bool negative_exponent = r->cc == '-';
/* Accumulate the base-30 exponent digits, guarding `exp' against overflow. */
294 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
296 if (exp > LONG_MAX / 30)
301 exp = exp * 30 + digit;
304 /* We don't check whether there were actually any digits, but we
306 if (negative_exponent)
311 /* Numbers must end with `/'. */
313 error (r, _("Missing numeric terminator."));
315 /* Multiply `num' by 30 to the `exponent' power, checking for
318 num *= pow (30.0, (double) exponent);
319 else if (exponent > 0)
321 if (num > DBL_MAX * pow (30.0, (double) -exponent))
324 num *= pow (30.0, (double) exponent);
327 return negative ? -num : num;
330 /* Read an integer and return its value.

   The value is read with read_float() and must be a whole number
   strictly between INT_MIN and INT_MAX; anything else is reported
   as "Invalid integer." through error(), which does not return. */
332 read_int (struct pfm_reader *r)
334 double f = read_float (r);
/* Reject fractional values and values at or beyond the limits of `int'. */
335 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
336 error (r, _("Invalid integer."));
340 /* Reads a string into BUF, which must have room for 256 bytes. */
343 read_string (struct pfm_reader *r, char *buf)
/* A string is preceded by its length, read as an integer. */
345 int n = read_int (r);
/* Only lengths 0 through 255 are accepted; anything else is reported
   through error(), which does not return. */
346 if (n < 0 || n > 255)
347 error (r, _("Bad string length %d."), n);
357 /* Reads a string and returns a copy of it allocated from R's pool. */
360 read_pool_string (struct pfm_reader *r)
/* Read into a local buffer first (its declaration is not visible here). */
363 read_string (r, string);
/* The returned copy is made with pool_strdup() on R->pool. */
364 return pool_strdup (r->pool, string);
367 /* Reads the 464-byte file header.

   The header is made up of 200 characters of splash strings, a
   256-entry character translation table whose first 64 entries are
   skipped, and the 8-character signature "SPSSPORT"
   (200 + 256 + 8 = 464).  A signature mismatch is reported with
   msg() and aborts reading via longjmp() on R->bail_out. */
369 read_header (struct pfm_reader *r)
374 /* Read and ignore vanity splash strings. */
375 for (i = 0; i < 200; i++)
378 /* Skip the first 64 characters of the translation table.
379 We don't care about these. They are probably all set to
380 '0', marking them as untranslatable, and that would screw
381 up our actual translation of the real '0'. */
382 for (i = 0; i < 64; i++)
385 /* Read the rest of the translation table. */
/* The table is allocated from R's pool and starts out all zeros, so
   characters never assigned below translate to a null byte. */
386 trans = pool_malloc (r->pool, 256);
387 memset (trans, 0, 256);
/* Map file character C to the local equivalent of portable code I. */
396 trans[c] = portable_to_local[i];
399 /* Set up the translation table, then read the first
400 translated character. */
404 /* Skip and verify signature. */
405 for (i = 0; i < 8; i++)
406 if (!match (r, "SPSSPORT"[i]))
408 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
409 longjmp (r->bail_out, 1);
413 /* Reads the version and date info record, as well as product and
414 subproduct identification records if present. */
416 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
418 static char empty_string[] = "";
419 char *date, *time, *product, *author, *subproduct;
424 error (r, "Unrecognized version code `%c'.", r->cc);
425 date = read_pool_string (r);
426 time = read_pool_string (r);
427 product = match (r, '1') ? read_pool_string (r) : empty_string;
428 author = match (r, '2') ? read_pool_string (r) : empty_string;
429 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
432 if (strlen (date) != 8)
433 error (r, _("Bad date string length %d."), strlen (date));
434 if (strlen (time) != 6)
435 error (r, _("Bad time string length %d."), strlen (time));
437 /* Save file info. */
441 for (i = 0; i < 8; i++)
443 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
444 info->creation_date[map[i]] = date[i];
446 info->creation_date[2] = info->creation_date[5] = ' ';
447 info->creation_date[10] = 0;
450 for (i = 0; i < 6; i++)
452 static const int map[] = {0, 1, 3, 4, 6, 7};
453 info->creation_time[map[i]] = time[i];
455 info->creation_time[2] = info->creation_time[5] = ' ';
456 info->creation_time[8] = 0;
459 str_copy_trunc (info->product, sizeof info->product, product);
460 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
464 /* Translates a format specification read from portable file R as
465 the three integers PORTABLE_FORMAT (type code, width, decimals)
466 into a format specifier, which is returned, checking that the
   format is appropriate for variable V.

   An unknown type code, or a format that fails the output or width
   compatibility checks, is reported through error(). */
467 static struct fmt_spec
468 convert_format (struct pfm_reader *r, const int portable_format[3],
471 struct fmt_spec format;
/* Element 0 is the portable type code; it must map to a known format type. */
474 if (!fmt_from_io (portable_format[0], &format.type))
475 error (r, _("%s: Bad format specifier byte (%d)."),
476 var_get_name (v), portable_format[0]);
477 format.w = portable_format[1];
478 format.d = portable_format[2];
/* The format must be usable for output and match V's width. */
481 ok = (fmt_check_output (&format)
482 && fmt_check_width_compat (&format, var_get_width (v)));
487 char fmt_string[FMT_STRING_LEN_MAX + 1];
488 error (r, _("%s variable %s has invalid format specifier %s."),
489 var_is_numeric (v) ? _("Numeric") : _("String"),
490 var_get_name (v), fmt_to_string (&format, fmt_string));
/* NOTE(review): error() ends in longjmp(), so this fallback to the
   default format looks unreachable -- confirm whether a warning was
   intended instead. */
491 format = fmt_default_for_width (var_get_width (v));
497 static union value parse_value (struct pfm_reader *, struct variable *);
499 /* Read information on all the variables.

   Reads the variable count, an optional weight variable name, and
   then one record per variable: width, name, print and write
   formats, missing values and label.  Each variable is created in
   DICT and its width is recorded in R->widths.  Any inconsistency is
   reported through error(), which does not return. */
501 read_variables (struct pfm_reader *r, struct dictionary *dict)
503 char *weight_name = NULL;
507 error (r, _("Expected variable count record."));
509 r->var_cnt = read_int (r);
510 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
511 error (r, _("Invalid number of variables %d."), r->var_cnt);
/* One width slot per variable, allocated from R's pool. */
512 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
514 /* Purpose of this value is unknown. It is typically 161. */
519 weight_name = read_pool_string (r);
/* NOTE(review): the message says "truncated", but error() aborts
   reading rather than truncating the name. */
520 if (strlen (weight_name) > SHORT_NAME_LEN)
521 error (r, _("Weight variable name (%s) truncated."), weight_name);
524 for (i = 0; i < r->var_cnt; i++)
530 struct missing_values miss;
531 struct fmt_spec print, write;
535 error (r, _("Expected variable record."));
537 width = read_int (r);
539 error (r, _("Invalid variable width %d."), width);
540 r->widths[i] = width;
542 read_string (r, name);
/* Six integers follow: fmt[0..2] is the print format and fmt[3..5]
   the write format (see the convert_format() calls below). */
543 for (j = 0; j < 6; j++)
544 fmt[j] = read_int (r);
/* Names starting with `#' or `$' are rejected in addition to
   whatever var_is_valid_name() rejects. */
546 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
547 error (r, _("position %d: Invalid variable name `%s'."), i, name);
548 str_uppercase (name);
/* NOTE(review): unlike most messages in this file, this one is not
   wrapped in _() for translation. */
550 if (width < 0 || width > 255)
551 error (r, "Bad width %d for variable %s.", width, name);
553 v = dict_create_var (dict, name, width);
555 error (r, _("Duplicate variable name %s."), name);
557 print = convert_format (r, &fmt[0], v);
558 write = convert_format (r, &fmt[3], v);
559 var_set_print_format (v, &print);
560 var_set_write_format (v, &write);
562 /* Range missing values. */
563 mv_init (&miss, var_get_width (v));
/* A closed range X..Y (the tag that introduces it is not visible here). */
566 double x = read_float (r);
567 double y = read_float (r);
568 mv_add_num_range (&miss, x, y);
/* Tag 'A': range from the value read up to HIGHEST. */
570 else if (match (r, 'A'))
571 mv_add_num_range (&miss, read_float (r), HIGHEST);
/* Tag '9': range from LOWEST up to the value read. */
572 else if (match (r, '9'))
573 mv_add_num_range (&miss, LOWEST, read_float (r));
575 /* Single missing values. */
576 while (match (r, '8'))
578 union value value = parse_value (r, v);
579 mv_add_value (&miss, &value);
582 var_set_missing_values (v, &miss);
/* Variable label (the tag that introduces it is not visible here). */
587 read_string (r, label);
588 var_set_label (v, label);
/* Resolve the weight variable only once all variables exist. */
592 if (weight_name != NULL)
594 struct variable *weight_var = dict_lookup_var (dict, weight_name);
595 if (weight_var == NULL)
596 error (r, _("Weighting variable %s not present in dictionary."),
599 dict_set_weight (dict, weight_var);
603 /* Parse a value for variable VV into value V.

   For a string variable the value is read with read_string() and
   copied into V.s with a fixed width of 8 by buf_copy_str_rpad()
   (presumably padded on the right -- confirm against that helper).
   Otherwise the value is read with read_float() into V.f.  The
   forward declaration of this function gives its return type as
   `union value'. */
605 parse_value (struct pfm_reader *r, struct variable *vv)
609 if (var_is_alpha (vv))
612 read_string (r, string);
613 buf_copy_str_rpad (v.s, 8, string);
616 v.f = read_float (r);
621 /* Parse a value label record.

   The record names a set of variables, which must all exist in DICT
   and have the same width as the first one, followed by a count of
   labels and that many value/label pairs.  Every pair is applied to
   each of the named variables.  Problems are reported through
   error(), which does not return; the forward declaration of this
   function gives its return type as `void'. */
623 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* Array of the NV variables the labels apply to, allocated from R's pool. */
635 v = pool_nalloc (r->pool, nv, sizeof *v);
636 for (i = 0; i < nv; i++)
639 read_string (r, name);
641 v[i] = dict_lookup_var (dict, name);
643 error (r, _("Unknown variable %s while parsing value labels."), name);
/* All variables must agree in width with the first one, because the
   values below are parsed using v[0]. */
645 if (var_get_width (v[0]) != var_get_width (v[i]))
646 error (r, _("Cannot assign value labels to %s and %s, which "
647 "have different variable types or widths."),
648 var_get_name (v[0]), var_get_name (v[i]));
651 n_labels = read_int (r);
652 for (i = 0; i < n_labels; i++)
658 val = parse_value (r, v[0]);
659 read_string (r, label);
661 /* Assign the value_label's to each variable. */
662 for (j = 0; j < nv; j++)
664 struct variable *var = v[j];
/* A false return from val_labs_replace() is reported as a duplicate label. */
666 if (!val_labs_replace (var->val_labs, val, label))
669 if (var_is_numeric (var))
670 error (r, _("Duplicate label for value %g for variable %s."),
671 val.f, var_get_name (var));
673 error (r, _("Duplicate label for value `%.*s' for variable %s."),
674 var_get_width (var), val.s, var_get_name (var));
679 /* Reads one case from portable file R into C.

   Walks the variables in order: a width of 0 means a numeric value,
   read with read_float(); any other width means a string, read with
   read_string() and copied into the case with buf_copy_str_rpad().
   NOTE(review): the return statements and the end-of-file test are
   not visible here -- confirm the return value contract. */
681 pfm_read_case (struct pfm_reader *r, struct ccase *c)
/* Re-arm the error target: an error() raised while reading this case
   resumes execution right after this call.  The result of setjmp()
   is ignored here; presumably the following, non-visible code checks
   R->ok instead -- confirm. */
686 setjmp (r->bail_out);
690 /* Check for end of file. */
695 for (i = 0; i < r->var_cnt; i++)
697 int width = r->widths[i];
701 case_data_rw (c, idx)->f = read_float (r);
707 read_string (r, string);
708 buf_copy_str_rpad (case_data_rw (c, idx)->s, width, string);
/* A string of WIDTH bytes advances the case index by WIDTH divided
   by MAX_SHORT_STRING, rounded up. */
709 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
716 /* Returns true if an I/O error has occurred on READER, false otherwise. */
719 pfm_read_error (const struct pfm_reader *reader)
724 /* Returns true if FILE is an SPSS portable file, false otherwise. */
727 pfm_detect (FILE *file)
/* Room for the 464-byte header: 200 splash bytes, a 256-entry
   translation table, and the 8-byte signature. */
729 unsigned char header[464];
731 int cooked_cnt, raw_cnt;
/* Collect header bytes, not counting line feeds and carriage returns
   toward the total.  The statement that reads each byte into C is
   not visible here. */
734 cooked_cnt = raw_cnt = 0;
735 while (cooked_cnt < sizeof header)
/* Stop at end of file, or once the raw byte counter has passed 512. */
738 if (c == EOF || raw_cnt++ > 512)
740 else if (c != '\n' && c != '\r')
741 header[cooked_cnt++] = c;
/* Rebuild the translation table from header bytes 264 through 455,
   which hold table entries 64 through 255. */
744 memset (trans, 0, 256);
745 for (i = 64; i < 256; i++)
747 unsigned char c = header[i + 200];
749 trans[c] = portable_to_local[i];
/* The translated signature in header bytes 456 through 463 must
   spell "SPSSPORT". */
752 for (i = 0; i < 8; i++)
753 if (trans[header[i + 456]] != "SPSSPORT"[i])