1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/format.h>
34 #include <data/missing-values.h>
35 #include <data/value-labels.h>
36 #include <data/variable.h>
37 #include <libpspp/alloc.h>
38 #include <libpspp/compiler.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/magic.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/pool.h>
44 #include <libpspp/str.h>
47 #define _(msgid) gettext (msgid)
49 /* portable_to_local[PORTABLE] translates the given portable
50 character into the local character set. */
51 static const char portable_to_local[256] =
54 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
55 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
59 /* Portable file reader. */
62 struct pool *pool; /* All the portable file state. */
64 jmp_buf bail_out; /* longjmp() target for error handling. */
66 struct file_handle *fh; /* File handle. */
67 FILE *file; /* File stream. */
68 int line_length; /* Number of characters so far on this line. */
69 char cc; /* Current character. */
70 char *trans; /* 256-byte character set translation table. */
71 int var_cnt; /* Number of variables. */
72 int weight_index; /* 0-based index of weight variable, or -1. */
73 int *widths; /* Variable widths, 0 for numeric. */
74 size_t value_cnt; /* Number of `value's per case. */
75 bool ok; /* Set false on I/O error. */
78 static struct casereader_class por_file_casereader_class;
81 error (struct pfm_reader *r, const char *msg,...)
85 /* Displays MSG as an error message and aborts reading the
86 portable file via longjmp(). */
88 error (struct pfm_reader *r, const char *msg, ...)
94 ds_init_empty (&text);
95 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
96 fh_get_file_name (r->fh), ftell (r->file));
98 ds_put_vformat (&text, msg, args);
101 m.category = MSG_GENERAL;
102 m.severity = MSG_ERROR;
103 m.where.file_name = NULL;
104 m.where.line_number = 0;
105 m.text = ds_cstr (&text);
111 longjmp (r->bail_out, 1);
114 /* Displays MSG as a warning for the current position in
115 portable file reader R. */
117 warning (struct pfm_reader *r, const char *msg, ...)
123 ds_init_empty (&text);
124 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
125 fh_get_file_name (r->fh), ftell (r->file));
126 va_start (args, msg);
127 ds_put_vformat (&text, msg, args);
130 m.category = MSG_GENERAL;
131 m.severity = MSG_WARNING;
132 m.where.file_name = NULL;
133 m.where.line_number = 0;
134 m.text = ds_cstr (&text);
139 /* Closes portable file reader R, after we're done with it. */
141 por_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
143 struct pfm_reader *r = r_;
144 pool_destroy (r->pool);
147 /* Read a single character into R's current character, r->cc. */
149 advance (struct pfm_reader *r)
153 /* Read the next character from the file.
154 Ignore carriage returns entirely.
155 Mostly ignore new-lines, but if a new-line occurs before the
156 line has reached 80 bytes in length, then treat the
157 "missing" bytes as spaces. */
160 while ((c = getc (r->file)) == '\r')
165 if (r->line_length < 80)
168 ungetc ('\n', r->file);
174 error (r, _("unexpected end of file"));
176 if (r->trans != NULL)
182 /* Skip a single character if present, and return whether it was
185 match (struct pfm_reader *r, int c)
196 static void read_header (struct pfm_reader *);
197 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
198 static void read_variables (struct pfm_reader *, struct dictionary *);
199 static void read_value_label (struct pfm_reader *, struct dictionary *);
200 static void read_documents (struct pfm_reader *, struct dictionary *);
202 /* Reads the dictionary from the file with handle FH and stores it in
203 *DICT. This dictionary may be modified in order to rename, reorder,
204 and delete variables, etc. Returns a casereader for the file's data. */
206 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
207 struct pfm_read_info *info)
209 struct pool *volatile pool = NULL;
210 struct pfm_reader *volatile r = NULL;
212 *dict = dict_create ();
213 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
216 /* Create and initialize reader. */
217 pool = pool_create ();
218 r = pool_alloc (pool, sizeof *r);
220 if (setjmp (r->bail_out))
223 r->file = pool_fopen (r->pool, fh_get_file_name (r->fh), "rb");
225 r->weight_index = -1;
232 /* Check that file open succeeded, prime reading. */
235 msg (ME, _("An error occurred while opening \"%s\" for reading "
236 "as a portable file: %s."),
237 fh_get_file_name (r->fh), strerror (errno));
241 /* Read header, version, date info, product id, variables. */
243 read_version_data (r, info);
244 read_variables (r, *dict);
246 /* Read value labels. */
247 while (match (r, 'D'))
248 read_value_label (r, *dict);
250 /* Read documents. */
252 read_documents (r, *dict);
254 /* Check that we've made it to the data. */
256 error (r, _("Data record expected."));
258 r->value_cnt = dict_get_next_value_idx (*dict);
259 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
260 &por_file_casereader_class, r);
263 pool_destroy (r->pool);
264 dict_destroy (*dict);
/* Returns the value of base-30 digit C (0...9 for `0'...`9',
   10...29 for `A'...`T'), or -1 if C is not a base-30 digit.

   Only upper-case letters are digits.  A null byte is never a
   digit. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() considers the terminating null character to be part
     of the string, so searching for a null byte would "find" it at
     offset 30 and report it as a digit.  Reject it up front. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
279 /* Read a floating point value and return its value. */
281 read_float (struct pfm_reader *r)
285 bool got_dot = false; /* Seen a decimal point? */
286 bool got_digit = false; /* Seen any digits? */
287 bool negative = false; /* Number is negative? */
289 /* Skip leading spaces. */
290 while (match (r, ' '))
293 /* `*' indicates system-missing. */
296 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
300 negative = match (r, '-');
303 int digit = base_30_value (r->cc);
308 /* Make sure that multiplication by 30 will not overflow. */
309 if (num > DBL_MAX * (1. / 30.))
310 /* The value of the digit doesn't matter, since we have already
311 gotten as many digits as can be represented in a `double'.
312 This doesn't necessarily mean the result will overflow.
313 The exponent may reduce it to within range.
315 We just need to record that there was another
316 digit so that we can multiply by 30 later. */
319 num = (num * 30.0) + digit;
321 /* Keep track of the number of digits after the decimal point.
322 If we just divided by 30 here, we would lose precision. */
326 else if (!got_dot && r->cc == '.')
327 /* Record that we have found the decimal point. */
330 /* Any other character terminates the number. */
336 /* Check that we had some digits. */
338 error (r, _("Number expected."));
340 /* Get exponent if any. */
341 if (r->cc == '+' || r->cc == '-')
344 bool negative_exponent = r->cc == '-';
347 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
349 if (exp > LONG_MAX / 30)
354 exp = exp * 30 + digit;
357 /* We don't check whether there were actually any digits, but we
359 if (negative_exponent)
364 /* Numbers must end with `/'. */
366 error (r, _("Missing numeric terminator."));
368 /* Multiply `num' by 30 to the `exponent' power, checking for
371 num *= pow (30.0, (double) exponent);
372 else if (exponent > 0)
374 if (num > DBL_MAX * pow (30.0, (double) -exponent))
377 num *= pow (30.0, (double) exponent);
380 return negative ? -num : num;
383 /* Read an integer and return its value. */
385 read_int (struct pfm_reader *r)
387 double f = read_float (r);
388 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
389 error (r, _("Invalid integer."));
393 /* Reads a string into BUF, which must have room for 256
396 read_string (struct pfm_reader *r, char *buf)
398 int n = read_int (r);
399 if (n < 0 || n > 255)
400 error (r, _("Bad string length %d."), n);
410 /* Reads a string and returns a copy of it allocated from R's
413 read_pool_string (struct pfm_reader *r)
416 read_string (r, string);
417 return pool_strdup (r->pool, string);
420 /* Reads the 464-byte file header. */
422 read_header (struct pfm_reader *r)
427 /* Read and ignore vanity splash strings. */
428 for (i = 0; i < 200; i++)
431 /* Skip the first 64 characters of the translation table.
432 We don't care about these. They are probably all set to
433 '0', marking them as untranslatable, and that would screw
434 up our actual translation of the real '0'. */
435 for (i = 0; i < 64; i++)
438 /* Read the rest of the translation table. */
439 trans = pool_malloc (r->pool, 256);
440 memset (trans, 0, 256);
449 trans[c] = portable_to_local[i];
452 /* Set up the translation table, then read the first
453 translated character. */
457 /* Skip and verify signature. */
458 for (i = 0; i < 8; i++)
459 if (!match (r, "SPSSPORT"[i]))
461 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
462 longjmp (r->bail_out, 1);
466 /* Reads the version and date info record, as well as product and
467 subproduct identification records if present. */
469 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
471 static char empty_string[] = "";
472 char *date, *time, *product, *author, *subproduct;
477 error (r, _("Unrecognized version code `%c'."), r->cc);
478 date = read_pool_string (r);
479 time = read_pool_string (r);
480 product = match (r, '1') ? read_pool_string (r) : empty_string;
481 author = match (r, '2') ? read_pool_string (r) : empty_string;
482 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
485 if (strlen (date) != 8)
486 error (r, _("Bad date string length %d."), (int) strlen (date));
487 if (strlen (time) != 6)
488 error (r, _("Bad time string length %d."), (int) strlen (time));
490 /* Save file info. */
494 for (i = 0; i < 8; i++)
496 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
497 info->creation_date[map[i]] = date[i];
499 info->creation_date[2] = info->creation_date[5] = ' ';
500 info->creation_date[10] = 0;
503 for (i = 0; i < 6; i++)
505 static const int map[] = {0, 1, 3, 4, 6, 7};
506 info->creation_time[map[i]] = time[i];
508 info->creation_time[2] = info->creation_time[5] = ' ';
509 info->creation_time[8] = 0;
512 str_copy_trunc (info->product, sizeof info->product, product);
513 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
517 /* Translates a format specification read from portable file R as
518 the three integers PORTABLE_FORMAT into a normal format specifier FORMAT,
519 checking that the format is appropriate for variable V. */
520 static struct fmt_spec
521 convert_format (struct pfm_reader *r, const int portable_format[3],
522 struct variable *v, bool *report_error)
524 struct fmt_spec format;
527 if (!fmt_from_io (portable_format[0], &format.type))
530 warning (r, _("%s: Bad format specifier byte (%d). Variable "
531 "will be assigned a default format."),
532 var_get_name (v), portable_format[0]);
536 format.w = portable_format[1];
537 format.d = portable_format[2];
540 ok = (fmt_check_output (&format)
541 && fmt_check_width_compat (&format, var_get_width (v)));
548 char fmt_string[FMT_STRING_LEN_MAX + 1];
549 fmt_to_string (&format, fmt_string);
550 if (var_is_numeric (v))
551 warning (r, _("Numeric variable %s has invalid format "
553 var_get_name (v), fmt_string);
555 warning (r, _("String variable %s with width %d has "
556 "invalid format specifier %s."),
557 var_get_name (v), var_get_width (v), fmt_string);
565 *report_error = false;
566 return fmt_default_for_width (var_get_width (v));
569 static union value parse_value (struct pfm_reader *, struct variable *);
571 /* Read information on all the variables. */
573 read_variables (struct pfm_reader *r, struct dictionary *dict)
575 char *weight_name = NULL;
579 error (r, _("Expected variable count record."));
581 r->var_cnt = read_int (r);
582 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
583 error (r, _("Invalid number of variables %d."), r->var_cnt);
584 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
586 /* Purpose of this value is unknown. It is typically 161. */
591 weight_name = read_pool_string (r);
592 if (strlen (weight_name) > SHORT_NAME_LEN)
593 error (r, _("Weight variable name (%s) truncated."), weight_name);
596 for (i = 0; i < r->var_cnt; i++)
602 struct missing_values miss;
603 struct fmt_spec print, write;
604 bool report_error = true;
608 error (r, _("Expected variable record."));
610 width = read_int (r);
612 error (r, _("Invalid variable width %d."), width);
613 r->widths[i] = width;
615 read_string (r, name);
616 for (j = 0; j < 6; j++)
617 fmt[j] = read_int (r);
619 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
620 error (r, _("Invalid variable name `%s' in position %d."), name, i);
621 str_uppercase (name);
623 if (width < 0 || width > 255)
624 error (r, _("Bad width %d for variable %s."), width, name);
626 v = dict_create_var (dict, name, width);
630 for (i = 1; i < 100000; i++)
632 char try_name[LONG_NAME_LEN + 1];
633 sprintf (try_name, "%.*s_%d", LONG_NAME_LEN - 6, name, i);
634 v = dict_create_var (dict, try_name, width);
639 error (r, _("Duplicate variable name %s in position %d."), name, i);
640 warning (r, _("Duplicate variable name %s in position %d renamed "
641 "to %s."), name, i, var_get_name (v));
644 print = convert_format (r, &fmt[0], v, &report_error);
645 write = convert_format (r, &fmt[3], v, &report_error);
646 var_set_print_format (v, &print);
647 var_set_write_format (v, &write);
649 /* Range missing values. */
650 mv_init (&miss, var_get_width (v));
653 double x = read_float (r);
654 double y = read_float (r);
655 mv_add_num_range (&miss, x, y);
657 else if (match (r, 'A'))
658 mv_add_num_range (&miss, read_float (r), HIGHEST);
659 else if (match (r, '9'))
660 mv_add_num_range (&miss, LOWEST, read_float (r));
662 /* Single missing values. */
663 while (match (r, '8'))
665 union value value = parse_value (r, v);
666 mv_add_value (&miss, &value);
669 var_set_missing_values (v, &miss);
674 read_string (r, label);
675 var_set_label (v, label);
679 if (weight_name != NULL)
681 struct variable *weight_var = dict_lookup_var (dict, weight_name);
682 if (weight_var == NULL)
683 error (r, _("Weighting variable %s not present in dictionary."),
686 dict_set_weight (dict, weight_var);
690 /* Parse a value for variable VV into value V. */
692 parse_value (struct pfm_reader *r, struct variable *vv)
696 if (var_is_alpha (vv))
699 read_string (r, string);
700 buf_copy_str_rpad (v.s, 8, string);
703 v.f = read_float (r);
708 /* Parses a value label record from R, applying its labels to variables in DICT. */
710 read_value_label (struct pfm_reader *r, struct dictionary *dict)
722 v = pool_nalloc (r->pool, nv, sizeof *v);
723 for (i = 0; i < nv; i++)
726 read_string (r, name);
728 v[i] = dict_lookup_var (dict, name);
730 error (r, _("Unknown variable %s while parsing value labels."), name);
732 if (var_get_type (v[0]) != var_get_type (v[i]))
733 error (r, _("Cannot assign value labels to %s and %s, which "
734 "have different variable types."),
735 var_get_name (v[0]), var_get_name (v[i]));
738 n_labels = read_int (r);
739 for (i = 0; i < n_labels; i++)
745 val = parse_value (r, v[0]);
746 read_string (r, label);
748 /* Assign the value label to each variable. */
749 for (j = 0; j < nv; j++)
751 struct variable *var = v[j];
753 if (!var_is_long_string (var))
754 var_replace_value_label (var, &val, label);
759 /* Reads a set of documents from portable file R into DICT. */
761 read_documents (struct pfm_reader *r, struct dictionary *dict)
766 line_cnt = read_int (r);
767 for (i = 0; i < line_cnt; i++)
770 read_string (r, line);
771 dict_add_document_line (dict, line);
775 /* Reads one case from portable file R into C. */
777 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
779 struct pfm_reader *r = r_;
783 case_create (c, casereader_get_value_cnt (reader));
784 setjmp (r->bail_out);
787 casereader_force_error (reader);
792 /* Check for end of file. */
800 for (i = 0; i < r->var_cnt; i++)
802 int width = r->widths[i];
806 case_data_rw_idx (c, idx)->f = read_float (r);
812 read_string (r, string);
813 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
814 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
821 /* Returns true if FILE is an SPSS portable file,
824 pfm_detect (FILE *file)
826 unsigned char header[464];
828 int cooked_cnt, raw_cnt;
831 cooked_cnt = raw_cnt = 0;
832 while (cooked_cnt < sizeof header)
835 if (c == EOF || raw_cnt++ > 512)
837 else if (c != '\n' && c != '\r')
838 header[cooked_cnt++] = c;
841 memset (trans, 0, 256);
842 for (i = 64; i < 256; i++)
844 unsigned char c = header[i + 200];
846 trans[c] = portable_to_local[i];
849 for (i = 0; i < 8; i++)
850 if (trans[header[i + 456]] != "SPSSPORT"[i])
856 static struct casereader_class por_file_casereader_class =
858 por_file_casereader_read,
859 por_file_casereader_destroy,