1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
4 Code for parsing floating-point numbers adapted from GNU C library.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
23 #include "por-file-reader.h"
24 #include <libpspp/message.h>
32 #include <libpspp/alloc.h>
35 #include <libpspp/compiler.h>
36 #include "dictionary.h"
37 #include "file-handle-def.h"
39 #include "missing-values.h"
40 #include <libpspp/hash.h>
41 #include <libpspp/magic.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/pool.h>
44 #include <libpspp/str.h>
45 #include "value-labels.h"
/* Shorthand for gettext(), wrapped around the translatable message
   strings in this file. */
49 #define _(msgid) gettext (msgid)
51 /* portable_to_local[PORTABLE] translates the given portable
52 character into the local character set.

   read_header() and pfm_detect() use it to build a per-file table:
   for each portable code I they store portable_to_local[I] at the
   index given by the byte that the file's header uses for I. */
53 static const char portable_to_local[256] =
56 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
57 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
61 /* Portable file reader.

   Allocated from its own pool by pfm_open_reader() and released by
   pfm_close_reader(), which destroys that pool. */
64 struct pool *pool; /* All the portable file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling;
   error() jumps here. */
68 struct file_handle *fh; /* File handle. */
69 FILE *file; /* File stream. */
70 char cc; /* Current character; the parsing functions
   inspect it after each advance(). */
71 char *trans; /* 256-byte character set translation table;
   advance() consults it only when non-null. */
72 int var_cnt; /* Number of variables. */
73 int weight_index; /* 0-based index of weight variable, or -1. */
74 int *widths; /* Variable widths, 0 for numeric;
   var_cnt elements. */
75 int value_cnt; /* Number of `value's per case. */
76 bool ok; /* Set false on I/O error. */
/* Declaration of error(), which is defined just below. */
80 error (struct pfm_reader *r, const char *msg,...)
84 /* Displays MSG as an error message and aborts reading the
85 portable file via longjmp().

   MSG is a printf-style format string for the arguments that follow
   it.  The displayed text is prefixed with the file's name and the
   current offset within it.  Control does not return to the caller:
   it resumes at the setjmp() that last filled in R->bail_out. */
87 error (struct pfm_reader *r, const char *msg, ...)
/* Start with a prefix that says which file is bad and where. */
93 ds_init_empty (&text);
94 ds_put_format (&text, _("portable file %s corrupt at offset %ld: "),
95 fh_get_file_name (r->fh), ftell (r->file));
/* Append the caller's formatted message. */
97 ds_put_vformat (&text, msg, args);
/* Describe the message: a general error that is not tied to any
   line of a source file. */
100 m.category = MSG_GENERAL;
101 m.severity = MSG_ERROR;
102 m.where.file_name = NULL;
103 m.where.line_number = 0;
104 m.text = ds_cstr (&text);
/* Unwind to whoever called setjmp (r->bail_out). */
110 longjmp (r->bail_out, 1);
113 /* Closes portable file reader R, after we're done with it.

   This destroys R's pool.  pfm_open_reader() allocates R itself from
   a pool, presumably this same one, so R should be treated as
   invalid afterward. */
115 pfm_close_reader (struct pfm_reader *r)
118 pool_destroy (r->pool);
121 /* Reads a single character from R's file into R's current
   character, the `cc' member.

   Carriage returns and new-lines in the file are skipped.  Running
   out of input is reported through error() as "unexpected end of
   file", which does not return.  The translation table R->trans is
   consulted only once one has been installed. */
123 advance (struct pfm_reader *r)
/* Line breaks are not data; read past them. */
127 while ((c = getc (r->file)) == '\r' || c == '\n')
130 error (r, _("unexpected end of file"));
/* No table is installed while the header itself is being read. */
132 if (r->trans != NULL)
137 /* Skip a single character if present, and return whether it was skipped. */
140 match (struct pfm_reader *r, int c)
/* Readers for the individual records of the dictionary, used by
   pfm_open_reader(). */
151 static void read_header (struct pfm_reader *);
152 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
153 static void read_variables (struct pfm_reader *, struct dictionary *);
154 static void read_value_label (struct pfm_reader *, struct dictionary *);
/* NOTE(review): declared with external linkage, but nothing nearby
   defines or calls it -- confirm that it is still needed. */
155 void dump_dictionary (struct dictionary *);
157 /* Reads the dictionary from file with handle FH, and returns it in a
158 dictionary structure stored in *DICT. This dictionary may be modified in order to
159 rename, reorder, and delete variables, etc.

   INFO is handed on to read_version_data(), which fills it in with
   the file's creation date, time and product strings.

   Parse errors are reported by error(), which longjmp()s back to the
   setjmp() in this function. */
161 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
162 struct pfm_read_info *info)
/* Declared volatile, presumably so that their values are reliable on
   the path taken after a longjmp(). */
164 struct pool *volatile pool = NULL;
165 struct pfm_reader *volatile r = NULL;
/* The dictionary is created before the file is opened. */
167 *dict = dict_create ();
168 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
171 /* Create and initialize reader. */
172 pool = pool_create ();
173 r = pool_alloc (pool, sizeof *r);
/* error() jumps back here with a nonzero value if parsing fails. */
175 if (setjmp (r->bail_out))
/* The stream is opened through the pool, with the file name taken
   from the handle. */
178 r->file = pool_fopen (r->pool, fh_get_file_name (r->fh), "rb");
179 r->weight_index = -1;
186 /* Check that file open succeeded, prime reading. */
189 msg (ME, _("An error occurred while opening \"%s\" for reading "
190 "as a portable file: %s."),
191 fh_get_file_name (r->fh), strerror (errno));
195 /* Read header, version, date info, product id, variables. */
197 read_version_data (r, info);
198 read_variables (r, *dict);
200 /* Read value labels. */
/* Each value label record is introduced by `D'. */
201 while (match (r, 'D'))
202 read_value_label (r, *dict);
204 /* Check that we've made it to the data. */
206 error (r, _("Data record expected."));
/* Release the reader and the dictionary created above. */
211 pfm_close_reader (r);
212 dict_destroy (*dict);
/* Returns the value of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The base-30 digits are `0' through `9' followed by upper-case `A'
   through `T', with values 0 through 29. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() considers the terminating null to be part of the
     string, so searching for 0 would "find" it at index 30.  A null
     byte is not a digit, so reject it before searching. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
227 /* Reads a floating point value written in the portable file's
   base-30 text form and returns it.

   As parsed here, a number is: optional leading spaces; then either
   `*' and one more character, which means system-missing, or an
   optional `-', base-30 digits containing at most one `.', and an
   optional base-30 exponent introduced by `+' or `-'.  Malformed
   input is reported through error(), which does not return. */
229 read_float (struct pfm_reader *r)
233 bool got_dot = false; /* Seen a decimal point? */
234 bool got_digit = false; /* Seen any digits? */
235 bool negative = false; /* Number is negative? */
237 /* Skip leading spaces. */
238 while (match (r, ' '))
241 /* `*' indicates system-missing. */
244 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
248 negative = match (r, '-');
/* -1 here means that the current character is not a digit. */
251 int digit = base_30_value (r->cc);
256 /* Make sure that multiplication by 30 will not overflow. */
257 if (num > DBL_MAX * (1. / 30.))
258 /* The value of the digit doesn't matter, since we have already
259 gotten as many digits as can be represented in a `double'.
260 This doesn't necessarily mean the result will overflow.
261 The exponent may reduce it to within range.
263 We just need to record that there was another
264 digit so that we can multiply by 30 later. */
267 num = (num * 30.0) + digit;
269 /* Keep track of the number of digits after the decimal point.
270 If we just divided by 30 here, we would lose precision. */
274 else if (!got_dot && r->cc == '.')
275 /* Record that we have found the decimal point. */
278 /* Any other character terminates the number. */
284 /* Check that we had some digits. */
/* NOTE(review): unlike most messages in this file, this one is not
   wrapped in _() for translation. */
286 error (r, "Number expected.");
288 /* Get exponent if any. */
289 if (r->cc == '+' || r->cc == '-')
292 bool negative_exponent = r->cc == '-';
/* Accumulate the exponent's base-30 digits, checking first that
   multiplying by 30 stays within the range of a long. */
295 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
297 if (exp > LONG_MAX / 30)
302 exp = exp * 30 + digit;
305 /* We don't check whether there were actually any digits, but we probably should. */
307 if (negative_exponent)
312 /* Numbers must end with `/'. */
314 error (r, _("Missing numeric terminator."));
316 /* Multiply `num' by 30 to the `exponent' power, checking for overflow. */
319 num *= pow (30.0, (double) exponent);
320 else if (exponent > 0)
322 if (num > DBL_MAX * pow (30.0, (double) -exponent))
325 num *= pow (30.0, (double) exponent);
328 return negative ? -num : num;
331 /* Reads an integer and returns its value.

   The number is read with read_float().  It is rejected through
   error() as "Invalid integer." if it has a fractional part or does
   not lie strictly between INT_MIN and INT_MAX. */
333 read_int (struct pfm_reader *r)
335 double f = read_float (r);
/* floor (f) != f catches any fractional part. */
336 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
337 error (r, _("Invalid integer."));
341 /* Reads a string into BUF, which must have room for 256 characters (the string's length is limited to 255). */
344 read_string (struct pfm_reader *r, char *buf)
/* The string's length comes first, written as an integer. */
346 int n = read_int (r);
/* Lengths outside 0...255 are reported through error(). */
347 if (n < 0 || n > 255)
348 error (r, _("Bad string length %d."), n);
358 /* Reads a string and returns a copy of it allocated from R's pool. */
361 read_pool_string (struct pfm_reader *r)
/* Read into a local buffer first, then duplicate it into R's pool. */
364 read_string (r, string);
365 return pool_strdup (r->pool, string);
368 /* Reads the 464-byte file header.

   The header consists of 200 characters of splash strings, a
   256-character translation table, and the 8-character signature
   "SPSSPORT".  The translation table is turned into a table indexed
   by the character found in the file, and the signature is checked
   through that table. */
370 read_header (struct pfm_reader *r)
375 /* Read and ignore vanity splash strings. */
376 for (i = 0; i < 200; i++)
379 /* Skip the first 64 characters of the translation table.
380 We don't care about these. They are probably all set to
381 '0', marking them as untranslatable, and that would screw
382 up our actual translation of the real '0'. */
383 for (i = 0; i < 64; i++)
386 /* Read the rest of the translation table. */
387 trans = pool_malloc (r->pool, 256);
/* Characters that the header never mentions stay mapped to 0. */
388 memset (trans, 0, 256);
/* C is the character the file uses for portable code I. */
397 trans[c] = portable_to_local[i];
400 /* Set up the translation table, then read the first
401 translated character. */
405 /* Skip and verify signature. */
406 for (i = 0; i < 8; i++)
407 if (!match (r, "SPSSPORT"[i]))
/* Reported with msg() rather than error(), presumably so that the
   message does not get error()'s "corrupt at offset" prefix, then
   bails out the same way error() does. */
409 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
410 longjmp (r->bail_out, 1);
414 /* Reads the version and date info record, as well as product and
415 subproduct identification records if present.

   The creation date and time are reformatted into INFO, and the
   product and subproduct strings are copied there, truncated to fit.
   The author string is read as well but is not stored in INFO. */
417 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
/* Stands in for each optional string that the file omits. */
419 static char empty_string[] = "";
420 char *date, *time, *product, *author, *subproduct;
/* NOTE(review): unlike most messages in this file, this one is not
   wrapped in _() for translation. */
425 error (r, "Unrecognized version code `%c'.", r->cc);
426 date = read_pool_string (r);
427 time = read_pool_string (r);
/* The optional records are tagged `1', `2' and `3'. */
428 product = match (r, '1') ? read_pool_string (r) : empty_string;
429 author = match (r, '2') ? read_pool_string (r) : empty_string;
430 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
/* NOTE(review): strlen() returns size_t, but both messages below
   format it with %d.  The argument should be cast to int, or the
   conversion changed, to match. */
433 if (strlen (date) != 8)
434 error (r, _("Bad date string length %d."), strlen (date));
435 if (strlen (time) != 6)
436 error (r, _("Bad time string length %d."), strlen (time));
438 /* Save file info. */
/* Date: the first four characters go to positions 6-9, the next two
   to positions 3-4 and the last two to positions 0-1, with spaces at
   positions 2 and 5 (presumably YYYYMMDD becoming "DD MM YYYY"). */
442 for (i = 0; i < 8; i++)
444 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
445 info->creation_date[map[i]] = date[i];
447 info->creation_date[2] = info->creation_date[5] = ' ';
448 info->creation_date[10] = 0;
/* Time: three pairs of characters separated by spaces at positions
   2 and 5 (presumably HHMMSS becoming "HH MM SS"). */
451 for (i = 0; i < 6; i++)
453 static const int map[] = {0, 1, 3, 4, 6, 7};
454 info->creation_time[map[i]] = time[i];
456 info->creation_time[2] = info->creation_time[5] = ' ';
457 info->creation_time[8] = 0;
/* Product strings are truncated to the size of INFO's buffers. */
460 str_copy_trunc (info->product, sizeof info->product, product);
461 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
465 /* Translates a format specification read from portable file R as
466 the three integers PORTABLE_FORMAT into a normal format specifier,
467 checking that the format is appropriate for variable V.

   PORTABLE_FORMAT[0] is the format type code as stored in the file;
   PORTABLE_FORMAT[1] and [2] become the `w' and `d' members.
   Returns the resulting format.  Problems are reported through
   error(). */
468 static struct fmt_spec
469 convert_format (struct pfm_reader *r, const int portable_format[3],
472 struct fmt_spec format;
/* Map the file's type code onto a format type. */
475 if (!fmt_from_io (portable_format[0], &format.type))
476 error (r, _("%s: Bad format specifier byte (%d)."),
477 var_get_name (v), portable_format[0]);
478 format.w = portable_format[1];
479 format.d = portable_format[2];
/* The format must be usable for output and must suit V's width. */
482 ok = (fmt_check_output (&format)
483 && fmt_check_width_compat (&format, var_get_width (v)));
488 char fmt_string[FMT_STRING_LEN_MAX + 1];
489 error (r, _("%s variable %s has invalid format specifier %s."),
490 var_is_numeric (v) ? _("Numeric") : _("String"),
491 var_get_name (v), fmt_to_string (&format, fmt_string));
/* NOTE(review): error() ends in longjmp(), so this fallback to the
   default format looks unreachable -- confirm whether a warning was
   intended here instead. */
492 format = fmt_default_for_width (var_get_width (v));
/* Defined further down; read_variables() uses it for single missing
   values. */
498 static union value parse_value (struct pfm_reader *, struct variable *);
500 /* Reads information on all the variables.

   Reads the variable count and then one record per variable,
   creating each variable in DICT and setting its print and write
   formats, missing values and label.  Each variable's width is also
   recorded in R->widths.  If the file names a weighting variable, it
   is looked up afterward and made DICT's weight.  Problems are
   reported through error(). */
502 read_variables (struct pfm_reader *r, struct dictionary *dict)
504 char *weight_name = NULL;
508 error (r, _("Expected variable count record."));
510 r->var_cnt = read_int (r);
511 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
512 error (r, _("Invalid number of variables %d."), r->var_cnt);
/* One width per variable. */
513 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
515 /* Purpose of this value is unknown. It is typically 161. */
/* NOTE(review): an over-long weight variable name is rejected here,
   although the message says "truncated". */
520 weight_name = read_pool_string (r);
521 if (strlen (weight_name) > SHORT_NAME_LEN)
522 error (r, _("Weight variable name (%s) truncated."), weight_name);
525 for (i = 0; i < r->var_cnt; i++)
531 struct missing_values miss;
532 struct fmt_spec print, write;
536 error (r, _("Expected variable record."));
538 width = read_int (r);
540 error (r, _("Invalid variable width %d."), width);
541 r->widths[i] = width;
543 read_string (r, name);
/* Six integers: three for the print format, then three for the
   write format. */
544 for (j = 0; j < 6; j++)
545 fmt[j] = read_int (r);
/* Names starting with `#' or `$' are rejected in addition to
   whatever var_is_valid_name() rejects. */
547 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
548 error (r, _("position %d: Invalid variable name `%s'."), i, name);
549 str_uppercase (name);
/* NOTE(review): unlike most messages in this file, this one is not
   wrapped in _() for translation. */
551 if (width < 0 || width > 255)
552 error (r, "Bad width %d for variable %s.", width, name);
554 v = dict_create_var (dict, name, width);
556 error (r, _("Duplicate variable name %s."), name);
558 print = convert_format (r, &fmt[0], v);
559 write = convert_format (r, &fmt[3], v);
560 var_set_print_format (v, &print);
561 var_set_write_format (v, &write);
563 /* Range missing values. */
564 mv_init (&miss, var_get_width (v));
/* The two bounds are read in separate statements, presumably so that
   they are consumed from the file in a definite order. */
567 double x = read_float (r);
568 double y = read_float (r);
569 mv_add_num_range (&miss, x, y);
/* `A' gives a range from a value up to HIGHEST; `9' gives a range
   from LOWEST up to a value. */
571 else if (match (r, 'A'))
572 mv_add_num_range (&miss, read_float (r), HIGHEST);
573 else if (match (r, '9'))
574 mv_add_num_range (&miss, LOWEST, read_float (r));
576 /* Single missing values. */
/* Each one is introduced by `8'. */
577 while (match (r, '8'))
579 union value value = parse_value (r, v);
580 mv_add_value (&miss, &value);
583 var_set_missing_values (v, &miss);
588 read_string (r, label);
589 var_set_label (v, label);
/* The weighting variable can only be resolved once all the variables
   exist in DICT. */
593 if (weight_name != NULL)
595 struct variable *weight_var = dict_lookup_var (dict, weight_name);
596 if (weight_var == NULL)
597 error (r, _("Weighting variable %s not present in dictionary."),
600 dict_set_weight (dict, weight_var);
604 /* Parses a value for variable VV from R and returns it.

   For a string variable, a string is read and copied into the
   8-byte string member of the value, padded on the right if it is
   shorter.  For a numeric variable, a number is read with
   read_float(). */
606 parse_value (struct pfm_reader *r, struct variable *vv)
610 if (var_is_alpha (vv))
613 read_string (r, string);
614 buf_copy_str_rpad (v.s, 8, string);
617 v.f = read_float (r);
622 /* Parses a value label record.

   The function is declared void: problems are reported through
   error(), which does not return, rather than through a return
   value.

   A record names one or more variables, which must all have the same
   width, and then lists the labels; each label is added to every
   named variable. */
624 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* V holds the NV variables that this record applies to. */
636 v = pool_nalloc (r->pool, nv, sizeof *v);
637 for (i = 0; i < nv; i++)
640 read_string (r, name);
642 v[i] = dict_lookup_var (dict, name);
644 error (r, _("Unknown variable %s while parsing value labels."), name);
/* Every variable is compared against the first one. */
646 if (var_get_width (v[0]) != var_get_width (v[i]))
647 error (r, _("Cannot assign value labels to %s and %s, which "
648 "have different variable types or widths."),
649 var_get_name (v[0]), var_get_name (v[i]));
652 n_labels = read_int (r);
653 for (i = 0; i < n_labels; i++)
/* The value is parsed according to the first variable's type.
   NOTE(review): this uses v[0], so NV is assumed to be positive;
   confirm that the count is validated before this point. */
659 val = parse_value (r, v[0]);
660 read_string (r, label);
662 /* Assign the value_label's to each variable. */
663 for (j = 0; j < nv; j++)
665 struct variable *var = v[j];
/* A false return is reported as a duplicate label for that value. */
667 if (!var_add_value_label (var, &val, label))
670 if (var_is_numeric (var))
671 error (r, _("Duplicate label for value %g for variable %s."),
672 val.f, var_get_name (var));
674 error (r, _("Duplicate label for value `%.*s' for variable %s."),
675 var_get_width (var), val.s, var_get_name (var));
680 /* Reads one case from portable file R into C.

   Each variable's value is read in turn, using the widths recorded
   in R->widths: a width of 0 means a number, anything else means a
   string of that width. */
682 pfm_read_case (struct pfm_reader *r, struct ccase *c)
/* Re-establish the bail-out target for this call.  The value
   returned by setjmp() is not examined in this statement, so a
   longjmp() from error() resumes with the code that follows. */
687 setjmp (r->bail_out);
691 /* Check for end of file. */
696 for (i = 0; i < r->var_cnt; i++)
698 int width = r->widths[i];
/* Numeric value. */
702 case_data_rw_idx (c, idx)->f = read_float (r);
/* String value: copied into the case padded out to WIDTH.  It then
   takes up as many case values as WIDTH requires, at
   MAX_SHORT_STRING bytes apiece. */
708 read_string (r, string);
709 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
710 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
717 /* Returns true if an I/O error has occurred on READER, false otherwise. */
720 pfm_read_error (const struct pfm_reader *reader)
725 /* Returns true if FILE is an SPSS portable file,
728 pfm_detect (FILE *file)
730 unsigned char header[464];
732 int cooked_cnt, raw_cnt;
735 cooked_cnt = raw_cnt = 0;
736 while (cooked_cnt < sizeof header)
739 if (c == EOF || raw_cnt++ > 512)
741 else if (c != '\n' && c != '\r')
742 header[cooked_cnt++] = c;
745 memset (trans, 0, 256);
746 for (i = 64; i < 256; i++)
748 unsigned char c = header[i + 200];
750 trans[c] = portable_to_local[i];
753 for (i = 0; i < 8; i++)
754 if (trans[header[i + 456]] != "SPSSPORT"[i])