1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/format.h>
34 #include <data/missing-values.h>
35 #include <data/value-labels.h>
36 #include <data/variable.h>
37 #include <libpspp/alloc.h>
38 #include <libpspp/compiler.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/magic.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/pool.h>
44 #include <libpspp/str.h>
/* Shorthand used around user-visible message strings in this file:
   expands to a gettext() call on MSGID. */
#define _(msgid) gettext (msgid)
49 /* portable_to_local[PORTABLE] translates the given portable
50 character into the local character set. */
51 static const char portable_to_local[256] =
54 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
55 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
/* Portable file reader.
   The reader itself is allocated from its own pool by pfm_open_reader(),
   and por_file_casereader_destroy() releases it by destroying that pool. */
    struct pool *pool; /* All the portable file state. */
    jmp_buf bail_out; /* longjmp() target for error handling;
                         error() jumps here after reporting. */
    struct file_handle *fh; /* File handle; also supplies the file name
                               used in diagnostics. */
    FILE *file; /* File stream, opened through the pool in "rb" mode. */
    char cc; /* Current character. */
    char *trans; /* 256-byte character set translation table.
                    advance() tests it against NULL before use;
                    read_header() allocates it from the pool. */
    int var_cnt; /* Number of variables, as read by read_variables(). */
    int weight_index; /* 0-based index of weight variable, or -1.
                         pfm_open_reader() starts it at -1. */
    int *widths; /* Variable widths, 0 for numeric; one entry per
                    variable (var_cnt entries). */
    size_t value_cnt; /* Number of `value's per case, taken from the
                         dictionary's next value index. */
    bool ok; /* Set false on I/O error. */
/* Tentative declaration so that pfm_open_reader() can pass the class to
   casereader_create_sequential(); the initialized definition appears
   later in this file. */
static struct casereader_class por_file_casereader_class;
80 error (struct pfm_reader *r, const char *msg,...)
84 /* Displays MSG as an error message and aborts reading the
85 portable file via longjmp(). */
87 error (struct pfm_reader *r, const char *msg, ...)
93 ds_init_empty (&text);
94 ds_put_format (&text, _("portable file %s corrupt at offset %ld: "),
95 fh_get_file_name (r->fh), ftell (r->file));
97 ds_put_vformat (&text, msg, args);
100 m.category = MSG_GENERAL;
101 m.severity = MSG_ERROR;
102 m.where.file_name = NULL;
103 m.where.line_number = 0;
104 m.text = ds_cstr (&text);
110 longjmp (r->bail_out, 1);
113 /* Closes portable file reader R, after we're done with it. */
115 por_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
117 struct pfm_reader *r = r_;
118 pool_destroy (r->pool);
/* Reads a single character into R's current character (cc),
   skipping over carriage returns and new-lines. */
123 advance (struct pfm_reader *r)
127 while ((c = getc (r->file)) == '\r' || c == '\n')
130 error (r, _("unexpected end of file"));
132 if (r->trans != NULL)
/* Skip a single character if present, and return whether it was
   skipped. */
140 match (struct pfm_reader *r, int c)
/* Forward declarations of the record readers that pfm_open_reader()
   relies on.  Each of them takes the reader R whose stream it consumes. */
static void read_header (struct pfm_reader *r);
static void read_version_data (struct pfm_reader *r,
                               struct pfm_read_info *info);
static void read_variables (struct pfm_reader *r, struct dictionary *dict);
static void read_value_label (struct pfm_reader *r, struct dictionary *dict);

/* Declared here but not defined in the visible part of this file. */
void dump_dictionary (struct dictionary *);
/* Reads the dictionary from the file with handle FH and stores it in
   *DICT.  This dictionary may be modified in order to
   rename, reorder, and delete variables, etc. */
161 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
162 struct pfm_read_info *info)
164 struct pool *volatile pool = NULL;
165 struct pfm_reader *volatile r = NULL;
167 *dict = dict_create ();
168 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
171 /* Create and initialize reader. */
172 pool = pool_create ();
173 r = pool_alloc (pool, sizeof *r);
175 if (setjmp (r->bail_out))
178 r->file = pool_fopen (r->pool, fh_get_file_name (r->fh), "rb");
179 r->weight_index = -1;
186 /* Check that file open succeeded, prime reading. */
189 msg (ME, _("An error occurred while opening \"%s\" for reading "
190 "as a portable file: %s."),
191 fh_get_file_name (r->fh), strerror (errno));
195 /* Read header, version, date info, product id, variables. */
197 read_version_data (r, info);
198 read_variables (r, *dict);
200 /* Read value labels. */
201 while (match (r, 'D'))
202 read_value_label (r, *dict);
204 /* Check that we've made it to the data. */
206 error (r, _("Data record expected."));
208 r->value_cnt = dict_get_next_value_idx (*dict);
209 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
210 &por_file_casereader_class, r);
213 pool_destroy (r->pool);
214 dict_destroy (*dict);
/* Returns the value of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are `0' through `9' (values 0...9) followed by `A'
   through `T' (values 10...29).  Lowercase letters are not digits.

   A null character is rejected explicitly: strchr() treats the
   terminating null of the digit string as part of the string, so
   without the check C == 0 would be reported as "digit" 30.
   NOTE(review): read_header() zero-fills the translation table, so an
   input byte with no translation presumably reaches here as 0. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
229 /* Read a floating point value and return its value. */
231 read_float (struct pfm_reader *r)
235 bool got_dot = false; /* Seen a decimal point? */
236 bool got_digit = false; /* Seen any digits? */
237 bool negative = false; /* Number is negative? */
239 /* Skip leading spaces. */
240 while (match (r, ' '))
243 /* `*' indicates system-missing. */
246 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
250 negative = match (r, '-');
253 int digit = base_30_value (r->cc);
258 /* Make sure that multiplication by 30 will not overflow. */
259 if (num > DBL_MAX * (1. / 30.))
260 /* The value of the digit doesn't matter, since we have already
261 gotten as many digits as can be represented in a `double'.
262 This doesn't necessarily mean the result will overflow.
263 The exponent may reduce it to within range.
             We just need to record that there was another
             digit so that we can multiply by 30 later. */
269 num = (num * 30.0) + digit;
271 /* Keep track of the number of digits after the decimal point.
272 If we just divided by 30 here, we would lose precision. */
276 else if (!got_dot && r->cc == '.')
277 /* Record that we have found the decimal point. */
280 /* Any other character terminates the number. */
286 /* Check that we had some digits. */
288 error (r, _("Number expected."));
290 /* Get exponent if any. */
291 if (r->cc == '+' || r->cc == '-')
294 bool negative_exponent = r->cc == '-';
297 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
299 if (exp > LONG_MAX / 30)
304 exp = exp * 30 + digit;
          /* We don't check whether there were actually any digits, but we
             probably should. */
309 if (negative_exponent)
314 /* Numbers must end with `/'. */
316 error (r, _("Missing numeric terminator."));
  /* Multiply `num' by 30 to the `exponent' power, checking for
     overflow. */
321 num *= pow (30.0, (double) exponent);
322 else if (exponent > 0)
324 if (num > DBL_MAX * pow (30.0, (double) -exponent))
327 num *= pow (30.0, (double) exponent);
330 return negative ? -num : num;
/* Reads a number with read_float() and returns it as an int.

   The number must have no fractional part and must lie strictly
   between INT_MIN and INT_MAX; otherwise "Invalid integer." is
   reported through error(), which aborts reading via longjmp(). */
static int
read_int (struct pfm_reader *r)
{
  const double value = read_float (r);
  const bool is_whole = floor (value) == value;
  const bool in_range = value > INT_MIN && value < INT_MAX;

  if (!(is_whole && in_range))
    error (r, _("Invalid integer."));
  return value;
}
/* Reads a string into BUF, which must have room for 256
   characters. */
346 read_string (struct pfm_reader *r, char *buf)
348 int n = read_int (r);
349 if (n < 0 || n > 255)
350 error (r, _("Bad string length %d."), n);
360 /* Reads a string and returns a copy of it allocated from R's
363 read_pool_string (struct pfm_reader *r)
366 read_string (r, string);
367 return pool_strdup (r->pool, string);
370 /* Reads the 464-byte file header. */
372 read_header (struct pfm_reader *r)
377 /* Read and ignore vanity splash strings. */
378 for (i = 0; i < 200; i++)
381 /* Skip the first 64 characters of the translation table.
382 We don't care about these. They are probably all set to
383 '0', marking them as untranslatable, and that would screw
384 up our actual translation of the real '0'. */
385 for (i = 0; i < 64; i++)
388 /* Read the rest of the translation table. */
389 trans = pool_malloc (r->pool, 256);
390 memset (trans, 0, 256);
399 trans[c] = portable_to_local[i];
402 /* Set up the translation table, then read the first
403 translated character. */
407 /* Skip and verify signature. */
408 for (i = 0; i < 8; i++)
409 if (!match (r, "SPSSPORT"[i]))
411 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
412 longjmp (r->bail_out, 1);
416 /* Reads the version and date info record, as well as product and
417 subproduct identification records if present. */
419 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
421 static char empty_string[] = "";
422 char *date, *time, *product, *author, *subproduct;
427 error (r, _("Unrecognized version code `%c'."), r->cc);
428 date = read_pool_string (r);
429 time = read_pool_string (r);
430 product = match (r, '1') ? read_pool_string (r) : empty_string;
431 author = match (r, '2') ? read_pool_string (r) : empty_string;
432 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
435 if (strlen (date) != 8)
436 error (r, _("Bad date string length %d."), (int) strlen (date));
437 if (strlen (time) != 6)
438 error (r, _("Bad time string length %d."), (int) strlen (time));
440 /* Save file info. */
444 for (i = 0; i < 8; i++)
446 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
447 info->creation_date[map[i]] = date[i];
449 info->creation_date[2] = info->creation_date[5] = ' ';
450 info->creation_date[10] = 0;
453 for (i = 0; i < 6; i++)
455 static const int map[] = {0, 1, 3, 4, 6, 7};
456 info->creation_time[map[i]] = time[i];
458 info->creation_time[2] = info->creation_time[5] = ' ';
459 info->creation_time[8] = 0;
462 str_copy_trunc (info->product, sizeof info->product, product);
463 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
/* Translates a format specification read from portable file R as
   the three integers PORTABLE_FORMAT into a normal format specifier,
   checking that the format is appropriate for variable V. */
470 static struct fmt_spec
471 convert_format (struct pfm_reader *r, const int portable_format[3],
474 struct fmt_spec format;
477 if (!fmt_from_io (portable_format[0], &format.type))
478 error (r, _("%s: Bad format specifier byte (%d)."),
479 var_get_name (v), portable_format[0]);
480 format.w = portable_format[1];
481 format.d = portable_format[2];
484 ok = (fmt_check_output (&format)
485 && fmt_check_width_compat (&format, var_get_width (v)));
490 char fmt_string[FMT_STRING_LEN_MAX + 1];
491 error (r, _("%s variable %s has invalid format specifier %s."),
492 var_is_numeric (v) ? _("Numeric") : _("String"),
493 var_get_name (v), fmt_to_string (&format, fmt_string));
494 format = fmt_default_for_width (var_get_width (v));
/* Forward declaration: read_variables() uses parse_value() before its
   definition. */
static union value parse_value (struct pfm_reader *r, struct variable *vv);
502 /* Read information on all the variables. */
504 read_variables (struct pfm_reader *r, struct dictionary *dict)
506 char *weight_name = NULL;
510 error (r, _("Expected variable count record."));
512 r->var_cnt = read_int (r);
513 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
514 error (r, _("Invalid number of variables %d."), r->var_cnt);
515 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
517 /* Purpose of this value is unknown. It is typically 161. */
522 weight_name = read_pool_string (r);
523 if (strlen (weight_name) > SHORT_NAME_LEN)
524 error (r, _("Weight variable name (%s) truncated."), weight_name);
527 for (i = 0; i < r->var_cnt; i++)
533 struct missing_values miss;
534 struct fmt_spec print, write;
538 error (r, _("Expected variable record."));
540 width = read_int (r);
542 error (r, _("Invalid variable width %d."), width);
543 r->widths[i] = width;
545 read_string (r, name);
546 for (j = 0; j < 6; j++)
547 fmt[j] = read_int (r);
549 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
550 error (r, _("position %d: Invalid variable name `%s'."), i, name);
551 str_uppercase (name);
553 if (width < 0 || width > 255)
554 error (r, _("Bad width %d for variable %s."), width, name);
556 v = dict_create_var (dict, name, width);
558 error (r, _("Duplicate variable name %s."), name);
560 print = convert_format (r, &fmt[0], v);
561 write = convert_format (r, &fmt[3], v);
562 var_set_print_format (v, &print);
563 var_set_write_format (v, &write);
565 /* Range missing values. */
566 mv_init (&miss, var_get_width (v));
569 double x = read_float (r);
570 double y = read_float (r);
571 mv_add_num_range (&miss, x, y);
573 else if (match (r, 'A'))
574 mv_add_num_range (&miss, read_float (r), HIGHEST);
575 else if (match (r, '9'))
576 mv_add_num_range (&miss, LOWEST, read_float (r));
578 /* Single missing values. */
579 while (match (r, '8'))
581 union value value = parse_value (r, v);
582 mv_add_value (&miss, &value);
585 var_set_missing_values (v, &miss);
590 read_string (r, label);
591 var_set_label (v, label);
595 if (weight_name != NULL)
597 struct variable *weight_var = dict_lookup_var (dict, weight_name);
598 if (weight_var == NULL)
599 error (r, _("Weighting variable %s not present in dictionary."),
602 dict_set_weight (dict, weight_var);
606 /* Parse a value for variable VV into value V. */
608 parse_value (struct pfm_reader *r, struct variable *vv)
612 if (var_is_alpha (vv))
615 read_string (r, string);
616 buf_copy_str_rpad (v.s, 8, string);
619 v.f = read_float (r);
/* Parses a value label record.  Problems are reported through
   error(), which does not return. */
626 read_value_label (struct pfm_reader *r, struct dictionary *dict)
638 v = pool_nalloc (r->pool, nv, sizeof *v);
639 for (i = 0; i < nv; i++)
642 read_string (r, name);
644 v[i] = dict_lookup_var (dict, name);
646 error (r, _("Unknown variable %s while parsing value labels."), name);
648 if (var_get_width (v[0]) != var_get_width (v[i]))
649 error (r, _("Cannot assign value labels to %s and %s, which "
650 "have different variable types or widths."),
651 var_get_name (v[0]), var_get_name (v[i]));
654 n_labels = read_int (r);
655 for (i = 0; i < n_labels; i++)
661 val = parse_value (r, v[0]);
662 read_string (r, label);
664 /* Assign the value_label's to each variable. */
665 for (j = 0; j < nv; j++)
667 struct variable *var = v[j];
669 if (!var_add_value_label (var, &val, label))
672 if (var_is_numeric (var))
673 error (r, _("Duplicate label for value %g for variable %s."),
674 val.f, var_get_name (var));
676 error (r, _("Duplicate label for value `%.*s' for variable %s."),
677 var_get_width (var), val.s, var_get_name (var));
682 /* Reads one case from portable file R into C. */
684 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
686 struct pfm_reader *r = r_;
690 case_create (c, casereader_get_value_cnt (reader));
691 setjmp (r->bail_out);
694 casereader_force_error (reader);
699 /* Check for end of file. */
707 for (i = 0; i < r->var_cnt; i++)
709 int width = r->widths[i];
713 case_data_rw_idx (c, idx)->f = read_float (r);
719 read_string (r, string);
720 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
721 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
/* Returns true if FILE is an SPSS portable file,
   false otherwise. */
731 pfm_detect (FILE *file)
733 unsigned char header[464];
735 int cooked_cnt, raw_cnt;
738 cooked_cnt = raw_cnt = 0;
739 while (cooked_cnt < sizeof header)
742 if (c == EOF || raw_cnt++ > 512)
744 else if (c != '\n' && c != '\r')
745 header[cooked_cnt++] = c;
748 memset (trans, 0, 256);
749 for (i = 64; i < 256; i++)
751 unsigned char c = header[i + 200];
753 trans[c] = portable_to_local[i];
756 for (i = 0; i < 8; i++)
757 if (trans[header[i + 456]] != "SPSSPORT"[i])
763 static struct casereader_class por_file_casereader_class =
765 por_file_casereader_read,
766 por_file_casereader_destroy,