1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Code for parsing floating-point numbers adapted from GNU C Library.
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "por-file-reader.h"
33 #include <data/casereader-provider.h>
34 #include <data/casereader.h>
35 #include <data/dictionary.h>
36 #include <data/file-handle-def.h>
37 #include <data/format.h>
38 #include <data/missing-values.h>
39 #include <data/value-labels.h>
40 #include <data/variable.h>
41 #include <libpspp/alloc.h>
42 #include <libpspp/compiler.h>
43 #include <libpspp/hash.h>
44 #include <libpspp/magic.h>
45 #include <libpspp/message.h>
46 #include <libpspp/misc.h>
47 #include <libpspp/pool.h>
48 #include <libpspp/str.h>
51 #define _(msgid) gettext (msgid)
/* Character table shared by the header code in this file:
   read_header() and pfm_detect() both copy entries of this table
   into a 256-entry trans table, storing entry I at the position
   given by a byte taken from the file header.  pfm_detect() only
   consults entries 64 through 255. */
53 /* portable_to_local[PORTABLE] translates the given portable
54 character into the local character set. */
55 static const char portable_to_local[256] =
58 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
59 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
/* State kept while reading one portable file.  pfm_open_reader()
   allocates the reader itself from the pool stored in the pool
   member, and por_file_casereader_destroy() destroys that pool. */
63 /* Portable file reader. */
66 struct pool *pool; /* All the portable file state. */
/* Armed with setjmp() in pfm_open_reader() and in
   por_file_casereader_read(); error() and read_header() jump to it. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
70 struct file_handle *fh; /* File handle. */
71 FILE *file; /* File stream. */
72 char cc; /* Current character. */
/* advance() tests this pointer against NULL before using it. */
73 char *trans; /* 256-byte character set translation table. */
74 int var_cnt; /* Number of variables. */
/* Initialized to -1 by pfm_open_reader(). */
75 int weight_index; /* 0-based index of weight variable, or -1. */
/* Allocated with var_cnt entries and filled in by read_variables(). */
76 int *widths; /* Variable widths, 0 for numeric. */
/* Set from dict_get_next_value_idx() once the dictionary is read. */
77 size_t value_cnt; /* Number of `value's per case. */
78 bool ok; /* Set false on I/O error. */
/* Forward declaration; the initializer for this class appears
   further down, after pfm_detect(). */
81 static struct casereader_class por_file_casereader_class;
/* Presumably the prototype of error(), whose definition follows.
   NOTE(review): its return type and any attributes are not visible
   on this line. */
84 error (struct pfm_reader *r, const char *msg,...)
/* R is the reader whose file name and current stream offset are
   placed in front of the message.  MSG is a printf-style format
   string whose arguments follow it.  This function does not
   return: it ends with longjmp() to R->bail_out, so a matching
   setjmp() must be active in a caller. */
88 /* Displays MSG as an error message and aborts reading the
89 portable file via longjmp(). */
91 error (struct pfm_reader *r, const char *msg, ...)
/* Start the text with the file name and the offset that ftell()
   reports for the stream, then append the formatted message. */
97 ds_init_empty (&text);
98 ds_put_format (&text, _("portable file %s corrupt at offset %ld: "),
99 fh_get_file_name (r->fh), ftell (r->file));
100 va_start (args, msg);
101 ds_put_vformat (&text, msg, args);
/* The message is a general error with no source file or line
   number attached. */
104 m.category = MSG_GENERAL;
105 m.severity = MSG_ERROR;
106 m.where.file_name = NULL;
107 m.where.line_number = 0;
108 m.text = ds_cstr (&text);
/* Transfer control to the most recent setjmp (r->bail_out). */
114 longjmp (r->bail_out, 1);
/* Destroy callback listed in por_file_casereader_class.  READER is
   unused.  R_ is the struct pfm_reader that pfm_open_reader()
   handed to the casereader.  The visible work is destroying the
   pool of R; since R itself was allocated from that pool, R must
   not be touched afterward. */
117 /* Closes portable file reader R, after we're done with it. */
119 por_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
121 struct pfm_reader *r = r_;
122 pool_destroy (r->pool);
/* Fetches the next character from the file of R, skipping over
   carriage returns and line feeds.  An unexpected end of file is
   reported through error(), which does not return.
   NOTE(review): the comment below says cur_char, but the
   current-character member of the reader is named cc.
   NOTE(review): the statements that store the character, and that
   use R->trans when it is not NULL, are not visible here. */
125 /* Read a single character into cur_char. */
127 advance (struct pfm_reader *r)
131 while ((c = getc (r->file)) == '\r' || c == '\n')
134 error (r, _("unexpected end of file"));
136 if (r->trans != NULL)
141 /* Skip a single character if present, and return whether it was skipped. */
/* NOTE(review): only the signature is visible here.  Callers in
   this file use the result as a truth value, both as a loop
   condition and as the value assigned to a bool. */
144 match (struct pfm_reader *r, int c)
/* Forward declarations of the record readers used by
   pfm_open_reader(), which follows. */
155 static void read_header (struct pfm_reader *);
156 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
157 static void read_variables (struct pfm_reader *, struct dictionary *);
158 static void read_value_label (struct pfm_reader *, struct dictionary *);
/* NOTE(review): unlike the declarations above this one is not
   static, and no use of dump_dictionary is visible in this part of
   the file. */
159 void dump_dictionary (struct dictionary *);
/* Opens the portable file referred to by FH and reads everything
   up to the start of the data.

   FH    handle of the file to read.
   DICT  receives a dictionary made with dict_create() and filled in
         by read_variables() and read_value_label().
   INFO  passed on to read_version_data().

   On success the result is a sequential casereader built on
   por_file_casereader_class, with the reader as its auxiliary data.
   The cleanup statements at the end destroy the pool and the
   dictionary.

   NOTE(review): the comment below calls the handle H; the parameter
   is named FH.
   NOTE(review): the cleanup dereferences r, yet r is still NULL
   until pool_alloc() has run.  If the fh_open() failure path
   reaches that cleanup (not visible here) this is a null pointer
   dereference - confirm. */
161 /* Reads the dictionary from file with handle H, and returns it in a
162 dictionary structure. This dictionary may be modified in order to
163 rename, reorder, and delete variables, etc. */
165 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
166 struct pfm_read_info *info)
/* Presumably volatile because both are used again after setjmp()
   returns by way of longjmp(). */
168 struct pool *volatile pool = NULL;
169 struct pfm_reader *volatile r = NULL;
171 *dict = dict_create ();
172 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
175 /* Create and initialize reader. */
176 pool = pool_create ();
177 r = pool_alloc (pool, sizeof *r);
/* error() and read_header() jump back to this point with
   longjmp() when the file turns out to be unusable. */
179 if (setjmp (r->bail_out))
/* The stream is opened through the pool, presumably so that
   destroying the pool also closes it. */
182 r->file = pool_fopen (r->pool, fh_get_file_name (r->fh), "rb");
183 r->weight_index = -1;
190 /* Check that file open succeeded, prime reading. */
193 msg (ME, _("An error occurred while opening \"%s\" for reading "
194 "as a portable file: %s."),
195 fh_get_file_name (r->fh), strerror (errno));
199 /* Read header, version, date info, product id, variables. */
201 read_version_data (r, info);
202 read_variables (r, *dict);
204 /* Read value labels. */
205 while (match (r, 'D'))
206 read_value_label (r, *dict);
208 /* Check that we've made it to the data. */
210 error (r, _("Data record expected."));
/* The number of values per case comes from the finished
   dictionary and is also given to the casereader. */
212 r->value_cnt = dict_get_next_value_idx (*dict);
213 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
214 &por_file_casereader_class, r);
/* Failure cleanup. */
217 pool_destroy (r->pool);
218 dict_destroy (*dict);
/* Returns the value of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are 0 through 9 followed by A through T, giving the
   values 0 through 29.

   The null character is rejected explicitly.  strchr() treats the
   terminating null of the digit string as part of the string, so
   without this check a null character would be reported as a digit
   with value 30.  That matters because the character translation
   table is zero-filled, so characters with no translation reach
   this function as null characters. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
/* Parses one number in the text notation used by portable files.

   Going by the visible code, a number consists of optional leading
   spaces, then either an asterisk (system-missing) or an optional
   minus sign followed by base-30 digits (0 through 9, then A
   through T) with at most one decimal point, then an optional
   exponent introduced by a plus or minus sign and written in
   base-30 digits, and finally a terminating slash.

   Malformed input (no digits, missing terminator) is reported
   through error(), which does not return.

   NOTE(review): two comments near the end of this function, the
   one about exponent digits and the one about multiplying by a
   power of 30, have no closing delimiter of their own here. */
233 /* Read a floating point value and return its value. */
235 read_float (struct pfm_reader *r)
239 bool got_dot = false; /* Seen a decimal point? */
240 bool got_digit = false; /* Seen any digits? */
241 bool negative = false; /* Number is negative? */
243 /* Skip leading spaces. */
244 while (match (r, ' '))
247 /* `*' indicates system-missing. */
250 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
254 negative = match (r, '-');
257 int digit = base_30_value (r->cc);
262 /* Make sure that multiplication by 30 will not overflow. */
263 if (num > DBL_MAX * (1. / 30.))
/* NOTE(review): the comment below speaks of multiplying by 10,
   but every scale factor visible in this function is a power
   of 30. */
264 /* The value of the digit doesn't matter, since we have already
265 gotten as many digits as can be represented in a `double'.
266 This doesn't necessarily mean the result will overflow.
267 The exponent may reduce it to within range.
269 We just need to record that there was another
270 digit so that we can multiply by 10 later. */
273 num = (num * 30.0) + digit;
275 /* Keep track of the number of digits after the decimal point.
276 If we just divided by 30 here, we would lose precision. */
280 else if (!got_dot && r->cc == '.')
281 /* Record that we have found the decimal point. */
284 /* Any other character terminates the number. */
290 /* Check that we had some digits. */
292 error (r, _("Number expected."));
294 /* Get exponent if any. */
295 if (r->cc == '+' || r->cc == '-')
298 bool negative_exponent = r->cc == '-';
301 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
/* Guards the multiplication by 30 just below against overflowing
   exp. */
303 if (exp > LONG_MAX / 30)
308 exp = exp * 30 + digit;
311 /* We don't check whether there were actually any digits, but we
313 if (negative_exponent)
318 /* Numbers must end with `/'. */
320 error (r, _("Missing numeric terminator."));
322 /* Multiply `num' by 30 to the `exponent' power, checking for
325 num *= pow (30.0, (double) exponent);
326 else if (exponent > 0)
328 if (num > DBL_MAX * pow (30.0, (double) -exponent))
331 num *= pow (30.0, (double) exponent);
334 return negative ? -num : num;
/* Integers use the same notation as every other number: the value
   is parsed with read_float() and must then be whole and lie
   strictly between INT_MIN and INT_MAX.  Anything else is reported
   through error(), which does not return. */
337 /* Read an integer and return its value. */
339 read_int (struct pfm_reader *r)
341 double f = read_float (r);
342 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
343 error (r, _("Invalid integer."));
347 /* Reads a string into BUF, which must have room for 256 characters. */
/* A string starts with its length, read with read_int().  Lengths
   outside the range 0 through 255 are reported through error(),
   which does not return.
   NOTE(review): the statements that copy the characters into BUF
   are not visible here. */
350 read_string (struct pfm_reader *r, char *buf)
352 int n = read_int (r);
353 if (n < 0 || n > 255)
354 error (r, _("Bad string length %d."), n);
364 /* Reads a string and returns a copy of it allocated from R's pool. */
/* The string is read with read_string() into the buffer named
   string and then duplicated with pool_strdup() from the pool of
   R, so the result needs no separate release by the caller as long
   as the pool is destroyed. */
367 read_pool_string (struct pfm_reader *r)
370 read_string (r, string);
371 return pool_strdup (r->pool, string);
/* Header layout implied by the loop bounds here and by the offsets
   in pfm_detect(): 200 splash characters, a 256-character
   translation table whose first 64 entries are skipped, and the
   8-character signature SPSSPORT (200 + 256 + 8 = 464).

   The table built here is allocated from the pool of R and
   zero-filled first, so any character that never appears in the
   file table translates to 0.  A signature mismatch is reported
   with msg() and reading is abandoned with longjmp() to
   R->bail_out. */
374 /* Reads the 464-byte file header. */
376 read_header (struct pfm_reader *r)
381 /* Read and ignore vanity splash strings. */
382 for (i = 0; i < 200; i++)
385 /* Skip the first 64 characters of the translation table.
386 We don't care about these. They are probably all set to
387 '0', marking them as untranslatable, and that would screw
388 up our actual translation of the real '0'. */
389 for (i = 0; i < 64; i++)
392 /* Read the rest of the translation table. */
393 trans = pool_malloc (r->pool, 256);
394 memset (trans, 0, 256);
/* Entry I of portable_to_local is stored at the position given by
   the character found in the file table. */
403 trans[c] = portable_to_local[i];
406 /* Set up the translation table, then read the first
407 translated character. */
411 /* Skip and verify signature. */
412 for (i = 0; i < 8; i++)
413 if (!match (r, "SPSSPORT"[i]))
415 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
416 longjmp (r->bail_out, 1);
/* R is the reader; INFO receives the creation date, creation time,
   product and subproduct strings.

   The date must be exactly 8 characters and the time exactly 6.
   The records tagged 1, 2 and 3 (product, author, subproduct) are
   each optional; a missing one is treated as an empty string.  The
   author string is read, but no visible statement stores it in
   INFO.  Problems are reported through error(), which does not
   return. */
420 /* Reads the version and date info record, as well as product and
421 subproduct identification records if present. */
423 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
425 static char empty_string[] = "";
426 char *date, *time, *product, *author, *subproduct;
431 error (r, _("Unrecognized version code `%c'."), r->cc);
432 date = read_pool_string (r);
433 time = read_pool_string (r);
434 product = match (r, '1') ? read_pool_string (r) : empty_string;
435 author = match (r, '2') ? read_pool_string (r) : empty_string;
436 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
439 if (strlen (date) != 8)
440 error (r, _("Bad date string length %d."), (int) strlen (date));
441 if (strlen (time) != 6)
442 error (r, _("Bad time string length %d."), (int) strlen (time));
444 /* Save file info. */
/* Rearrange the date: file characters 0-3 go to positions 6-9,
   characters 4-5 to positions 3-4 and characters 6-7 to positions
   0-1, with spaces at positions 2 and 5 and a terminator at 10.
   NOTE(review): presumably YYYYMMDD becoming DD MM YYYY - confirm
   against the file format. */
448 for (i = 0; i < 8; i++)
450 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
451 info->creation_date[map[i]] = date[i];
453 info->creation_date[2] = info->creation_date[5] = ' ';
454 info->creation_date[10] = 0;
/* The time keeps its order; the three pairs of characters go to
   positions 0-1, 3-4 and 6-7 with spaces at positions 2 and 5 and
   a terminator at 8. */
457 for (i = 0; i < 6; i++)
459 static const int map[] = {0, 1, 3, 4, 6, 7};
460 info->creation_time[map[i]] = time[i];
462 info->creation_time[2] = info->creation_time[5] = ' ';
463 info->creation_time[8] = 0;
/* Copies bounded by the size of each destination member. */
466 str_copy_trunc (info->product, sizeof info->product, product);
467 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
/* PORTABLE_FORMAT holds, in order, the format type code, the width
   and the number of decimals.  The converted format is the return
   value.

   NOTE(review): the comment below names INTS and FORMAT; the
   parameter is PORTABLE_FORMAT and the result is returned rather
   than stored through a pointer.
   NOTE(review): error() ends with longjmp(), so the assignment of
   the default format that follows the second error() call looks
   unreachable as written - confirm whether a warning was meant. */
471 /* Translates a format specification read from portable file R as
472 the three integers INTS into a normal format specifier FORMAT,
473 checking that the format is appropriate for variable V. */
474 static struct fmt_spec
475 convert_format (struct pfm_reader *r, const int portable_format[3],
478 struct fmt_spec format;
/* Translate the type code; an unknown code is an error. */
481 if (!fmt_from_io (portable_format[0], &format.type))
482 error (r, _("%s: Bad format specifier byte (%d)."),
483 var_get_name (v), portable_format[0]);
484 format.w = portable_format[1];
485 format.d = portable_format[2];
/* The format must be usable for output and compatible with the
   width of V. */
488 ok = (fmt_check_output (&format)
489 && fmt_check_width_compat (&format, var_get_width (v)));
494 char fmt_string[FMT_STRING_LEN_MAX + 1];
495 error (r, _("%s variable %s has invalid format specifier %s."),
496 var_is_numeric (v) ? _("Numeric") : _("String"),
497 var_get_name (v), fmt_to_string (&format, fmt_string));
498 format = fmt_default_for_width (var_get_width (v));
/* Forward declaration: read_variables(), which follows, calls
   parse_value() ahead of its definition. */
504 static union value parse_value (struct pfm_reader *, struct variable *);
/* Reads the variable count, an optional weight variable name and
   then one record per variable, creating each variable in DICT.

   For every variable the record supplies a width, a name, six
   format integers (three for the print format followed by three
   for the write format), optional missing values and an optional
   label.  The width of each variable is also remembered in
   R->widths for use when cases are read.

   Problems are reported through error(), which does not return. */
506 /* Read information on all the variables. */
508 read_variables (struct pfm_reader *r, struct dictionary *dict)
510 char *weight_name = NULL;
514 error (r, _("Expected variable count record."));
516 r->var_cnt = read_int (r);
517 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
518 error (r, _("Invalid number of variables %d."), r->var_cnt);
519 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
521 /* Purpose of this value is unknown. It is typically 161. */
526 weight_name = read_pool_string (r);
527 if (strlen (weight_name) > SHORT_NAME_LEN)
528 error (r, _("Weight variable name (%s) truncated."), weight_name);
531 for (i = 0; i < r->var_cnt; i++)
537 struct missing_values miss;
538 struct fmt_spec print, write;
542 error (r, _("Expected variable record."));
544 width = read_int (r);
546 error (r, _("Invalid variable width %d."), width);
547 r->widths[i] = width;
549 read_string (r, name);
550 for (j = 0; j < 6; j++)
551 fmt[j] = read_int (r);
/* The name is validated as read and only then converted to upper
   case.  Names starting with # or $ are rejected. */
553 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
554 error (r, _("position %d: Invalid variable name `%s'."), i, name);
555 str_uppercase (name);
557 if (width < 0 || width > 255)
558 error (r, _("Bad width %d for variable %s."), width, name);
560 v = dict_create_var (dict, name, width);
562 error (r, _("Duplicate variable name %s."), name);
/* fmt[0..2] describe the print format, fmt[3..5] the write
   format. */
564 print = convert_format (r, &fmt[0], v);
565 write = convert_format (r, &fmt[3], v);
566 var_set_print_format (v, &print);
567 var_set_write_format (v, &write);
569 /* Range missing values. */
570 mv_init (&miss, var_get_width (v));
/* A range with both ends given.  NOTE(review): the test that
   selects this branch is not visible here. */
573 double x = read_float (r);
574 double y = read_float (r);
575 mv_add_num_range (&miss, x, y);
/* Tag A: range from the value read up to HIGHEST.
   Tag 9: range from LOWEST up to the value read. */
577 else if (match (r, 'A'))
578 mv_add_num_range (&miss, read_float (r), HIGHEST);
579 else if (match (r, '9'))
580 mv_add_num_range (&miss, LOWEST, read_float (r));
582 /* Single missing values. */
583 while (match (r, '8'))
585 union value value = parse_value (r, v);
586 mv_add_value (&miss, &value);
589 var_set_missing_values (v, &miss);
594 read_string (r, label);
595 var_set_label (v, label);
/* With all variables created, resolve the weight variable by
   name. */
599 if (weight_name != NULL)
601 struct variable *weight_var = dict_lookup_var (dict, weight_name);
602 if (weight_var == NULL)
603 error (r, _("Weighting variable %s not present in dictionary."),
606 dict_set_weight (dict, weight_var);
/* For a string variable the value is read with read_string() and
   copied into the 8 bytes of v.s by buf_copy_str_rpad()
   (presumably padded on the right - confirm).  Otherwise the value
   is read with read_float() into v.f.  The prototype earlier in
   the file shows that the value is returned as a union value. */
610 /* Parse a value for variable VV into value V. */
612 parse_value (struct pfm_reader *r, struct variable *vv)
616 if (var_is_alpha (vv))
619 read_string (r, string);
620 buf_copy_str_rpad (v.s, 8, string);
623 v.f = read_float (r);
/* A value label record names NV variables and then gives a count
   of labels followed by that many value and label pairs.  Every
   named variable must exist in DICT and have the same width as the
   first one.  Each value is parsed according to the first variable
   and each label is added to every named variable.

   Problems, including a duplicate label for a value, are reported
   through error(), which does not return.

   NOTE(review): the comment below says the function returns
   success, but its prototype earlier in the file declares it
   void. */
628 /* Parse a value label record and return success. */
630 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* The array of variables lives in the pool of R. */
642 v = pool_nalloc (r->pool, nv, sizeof *v);
643 for (i = 0; i < nv; i++)
646 read_string (r, name);
648 v[i] = dict_lookup_var (dict, name);
650 error (r, _("Unknown variable %s while parsing value labels."), name);
652 if (var_get_width (v[0]) != var_get_width (v[i]))
653 error (r, _("Cannot assign value labels to %s and %s, which "
654 "have different variable types or widths."),
655 var_get_name (v[0]), var_get_name (v[i]));
658 n_labels = read_int (r);
659 for (i = 0; i < n_labels; i++)
665 val = parse_value (r, v[0]);
666 read_string (r, label);
668 /* Assign the value_label's to each variable. */
669 for (j = 0; j < nv; j++)
671 struct variable *var = v[j];
/* A false result from var_add_value_label() is reported as a
   duplicate label. */
673 if (!var_add_value_label (var, &val, label))
676 if (var_is_numeric (var))
677 error (r, _("Duplicate label for value %g for variable %s."),
678 val.f, var_get_name (var));
680 error (r, _("Duplicate label for value `%.*s' for variable %s."),
681 var_get_width (var), val.s, var_get_name (var));
/* Read callback listed in por_file_casereader_class.  READER is the
   casereader, R_ the struct pfm_reader given to it by
   pfm_open_reader(), and C the case to fill in.

   The case is created with as many values as the casereader
   reports.  Variables are then read in order: a width of 0 means a
   number read with read_float(), any other width a string read
   with read_string() and copied into the case.

   NOTE(review): the result of setjmp() is discarded here, unlike in
   pfm_open_reader().  How a return by way of longjmp() from error()
   is told apart from the first return is not visible in these
   lines. */
686 /* Reads one case from portable file R into C. */
688 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
690 struct pfm_reader *r = r_;
694 case_create (c, casereader_get_value_cnt (reader));
695 setjmp (r->bail_out);
698 casereader_force_error (reader);
703 /* Check for end of file. */
711 for (i = 0; i < r->var_cnt; i++)
713 int width = r->widths[i];
717 case_data_rw_idx (c, idx)->f = read_float (r);
723 read_string (r, string);
724 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
/* A string of this width occupies DIV_RND_UP (width,
   MAX_SHORT_STRING) value slots in the case. */
725 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
732 /* Returns true if FILE is an SPSS portable file, false otherwise. */
/* Works on FILE directly, without a reader object.  It gathers the
   464 header characters, ignoring carriage returns and line feeds,
   and stops early at end of file or once more than 512 raw
   characters have been counted.  It then rebuilds the translation
   table from header positions 264 through 455 and checks that
   header positions 456 through 463 translate to SPSSPORT.
   NOTE(review): the statements that read from FILE and that return
   the result are not visible here. */
735 pfm_detect (FILE *file)
737 unsigned char header[464];
739 int cooked_cnt, raw_cnt;
/* cooked_cnt counts characters kept in header; raw_cnt counts
   characters examined, line ends included. */
742 cooked_cnt = raw_cnt = 0;
743 while (cooked_cnt < sizeof header)
746 if (c == EOF || raw_cnt++ > 512)
748 else if (c != '\n' && c != '\r')
749 header[cooked_cnt++] = c;
/* Same construction as in read_header(): untranslated characters
   stay 0, and only table entries 64 through 255 are used. */
752 memset (trans, 0, 256);
753 for (i = 64; i < 256; i++)
755 unsigned char c = header[i + 200];
757 trans[c] = portable_to_local[i];
/* The signature follows the 200 splash characters and the
   256-character table, hence the offset 456. */
760 for (i = 0; i < 8; i++)
761 if (trans[header[i + 456]] != "SPSSPORT"[i])
/* Casereader class for portable files.  pfm_open_reader() passes
   its address to casereader_create_sequential().  The visible
   initializers are the read callback followed by the destroy
   callback. */
767 static struct casereader_class por_file_casereader_class =
769 por_file_casereader_read,
770 por_file_casereader_destroy,