1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <libpspp/compiler.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/message.h>
41 #include <libpspp/misc.h>
42 #include <libpspp/pool.h>
43 #include <libpspp/str.h>
48 #define _(msgid) gettext (msgid)
49 #define N_(msgid) (msgid)
51 /* portable_to_local[PORTABLE] translates the given portable
52 character into the local character set. */
53 static const char portable_to_local[256] =
56 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
57 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
61 /* Portable file reader. */
64 struct pool *pool; /* All the portable file state. */
66 jmp_buf bail_out; /* longjmp() target for error handling. */
68 struct file_handle *fh; /* File handle. */
69 struct fh_lock *lock; /* Read lock for file. */
70 FILE *file; /* File stream. */
71 int line_length; /* Number of characters so far on this line. */
72 char cc; /* Current character. */
73 char *trans; /* 256-byte character set translation table. */
74 int var_cnt; /* Number of variables. */
75 int weight_index; /* 0-based index of weight variable, or -1. */
76 int *widths; /* Variable widths, 0 for numeric. */
77 size_t value_cnt; /* Number of `value's per case. */
78 bool ok; /* Set false on I/O error. */
81 static struct casereader_class por_file_casereader_class;
84 error (struct pfm_reader *r, const char *msg,...)
88 /* Displays MSG as an error message and aborts reading the
89 portable file via longjmp(). */
91 error (struct pfm_reader *r, const char *msg, ...)
97 ds_init_empty (&text);
98 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
99 fh_get_file_name (r->fh), ftell (r->file));
100 va_start (args, msg);
101 ds_put_vformat (&text, msg, args);
104 m.category = MSG_GENERAL;
105 m.severity = MSG_ERROR;
106 m.where.file_name = NULL;
107 m.where.line_number = 0;
108 m.text = ds_cstr (&text);
114 longjmp (r->bail_out, 1);
117 /* Displays MSG as a warning for the current position in
118 portable file reader R. */
120 warning (struct pfm_reader *r, const char *msg, ...)
126 ds_init_empty (&text);
127 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
128 fh_get_file_name (r->fh), ftell (r->file));
129 va_start (args, msg);
130 ds_put_vformat (&text, msg, args);
133 m.category = MSG_GENERAL;
134 m.severity = MSG_WARNING;
135 m.where.file_name = NULL;
136 m.where.line_number = 0;
137 m.text = ds_cstr (&text);
142 /* Close and destroy R.
143 Returns false if an error was detected on R, true otherwise. */
145 close_reader (struct pfm_reader *r)
153 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
155 msg (ME, _("Error closing portable file \"%s\": %s."),
156 fh_get_file_name (r->fh), strerror (errno));
166 pool_destroy (r->pool);
171 /* Closes portable file reader R, after we're done with it. */
173 por_file_casereader_destroy (struct casereader *reader, void *r_)
175 struct pfm_reader *r = r_;
176 if (!close_reader (r))
177 casereader_force_error (reader);
180 /* Read a single character into R's current character, cc. */
182 advance (struct pfm_reader *r)
186 /* Read the next character from the file.
187 Ignore carriage returns entirely.
188 Mostly ignore new-lines, but if a new-line occurs before the
189 line has reached 80 bytes in length, then treat the
190 "missing" bytes as spaces. */
193 while ((c = getc (r->file)) == '\r')
198 if (r->line_length < 80)
201 ungetc ('\n', r->file);
207 error (r, _("unexpected end of file"));
209 if (r->trans != NULL)
215 /* Skip a single character if present, and return whether it was
218 match (struct pfm_reader *r, int c)
229 static void read_header (struct pfm_reader *);
230 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
231 static void read_variables (struct pfm_reader *, struct dictionary *);
232 static void read_value_label (struct pfm_reader *, struct dictionary *);
233 static void read_documents (struct pfm_reader *, struct dictionary *);
235 /* Reads the dictionary from the file with handle FH and stores it in
236 *DICT. This dictionary may be modified in order to
237 rename, reorder, and delete variables, etc. */
239 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
240 struct pfm_read_info *info)
242 struct pool *volatile pool = NULL;
243 struct pfm_reader *volatile r = NULL;
245 *dict = dict_create ();
247 /* Create and initialize reader. */
248 pool = pool_create ();
249 r = pool_alloc (pool, sizeof *r);
255 r->weight_index = -1;
261 if (setjmp (r->bail_out))
265 /* TRANSLATORS: this fragment will be interpolated into
266 messages in fh_lock() that identify types of files. */
267 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
272 r->file = fn_open (fh_get_file_name (r->fh), "rb");
275 msg (ME, _("An error occurred while opening \"%s\" for reading "
276 "as a portable file: %s."),
277 fh_get_file_name (r->fh), strerror (errno));
281 /* Read header, version, date info, product id, variables. */
283 read_version_data (r, info);
284 read_variables (r, *dict);
286 /* Read value labels. */
287 while (match (r, 'D'))
288 read_value_label (r, *dict);
290 /* Read documents. */
292 read_documents (r, *dict);
294 /* Check that we've made it to the data. */
296 error (r, _("Data record expected."));
298 r->value_cnt = dict_get_next_value_idx (*dict);
299 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
300 &por_file_casereader_class, r);
304 dict_destroy (*dict);
/* Returns the value of base-30 digit C, in the range 0...29,
   or -1 if C is not a base-30 digit.

   A null byte is never a digit.  It must be rejected explicitly:
   strchr() considers the terminating null character to be part of
   the string it searches, so without the check a null byte would be
   reported as a "digit" with value 30. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
319 /* Read a floating point value and return its value. */
321 read_float (struct pfm_reader *r)
325 bool got_dot = false; /* Seen a decimal point? */
326 bool got_digit = false; /* Seen any digits? */
327 bool negative = false; /* Number is negative? */
329 /* Skip leading spaces. */
330 while (match (r, ' '))
333 /* `*' indicates system-missing. */
336 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
340 negative = match (r, '-');
343 int digit = base_30_value (r->cc);
348 /* Make sure that multiplication by 30 will not overflow. */
349 if (num > DBL_MAX * (1. / 30.))
350 /* The value of the digit doesn't matter, since we have already
351 gotten as many digits as can be represented in a `double'.
352 This doesn't necessarily mean the result will overflow.
353 The exponent may reduce it to within range.
355 We just need to record that there was another
356 digit so that we can multiply by 30 later. */
359 num = (num * 30.0) + digit;
361 /* Keep track of the number of digits after the decimal point.
362 If we just divided by 30 here, we would lose precision. */
366 else if (!got_dot && r->cc == '.')
367 /* Record that we have found the decimal point. */
370 /* Any other character terminates the number. */
376 /* Check that we had some digits. */
378 error (r, _("Number expected."));
380 /* Get exponent if any. */
381 if (r->cc == '+' || r->cc == '-')
384 bool negative_exponent = r->cc == '-';
387 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
389 if (exp > LONG_MAX / 30)
394 exp = exp * 30 + digit;
397 /* We don't check whether there were actually any digits, but we
399 if (negative_exponent)
404 /* Numbers must end with `/'. */
406 error (r, _("Missing numeric terminator."));
408 /* Multiply `num' by 30 to the `exponent' power, checking for
411 num *= pow (30.0, (double) exponent);
412 else if (exponent > 0)
414 if (num > DBL_MAX * pow (30.0, (double) -exponent))
417 num *= pow (30.0, (double) exponent);
420 return negative ? -num : num;
423 /* Read an integer and return its value. */
425 read_int (struct pfm_reader *r)
427 double f = read_float (r);
428 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
429 error (r, _("Invalid integer."));
433 /* Reads a string into BUF, which must have room for 256
436 read_string (struct pfm_reader *r, char *buf)
438 int n = read_int (r);
439 if (n < 0 || n > 255)
440 error (r, _("Bad string length %d."), n);
450 /* Reads a string and returns a copy of it allocated from R's
453 read_pool_string (struct pfm_reader *r)
456 read_string (r, string);
457 return pool_strdup (r->pool, string);
460 /* Reads the 464-byte file header. */
462 read_header (struct pfm_reader *r)
467 /* Read and ignore vanity splash strings. */
468 for (i = 0; i < 200; i++)
471 /* Skip the first 64 characters of the translation table.
472 We don't care about these. They are probably all set to
473 '0', marking them as untranslatable, and that would screw
474 up our actual translation of the real '0'. */
475 for (i = 0; i < 64; i++)
478 /* Read the rest of the translation table. */
479 trans = pool_malloc (r->pool, 256);
480 memset (trans, 0, 256);
489 trans[c] = portable_to_local[i];
492 /* Set up the translation table, then read the first
493 translated character. */
497 /* Skip and verify signature. */
498 for (i = 0; i < 8; i++)
499 if (!match (r, "SPSSPORT"[i]))
501 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
502 longjmp (r->bail_out, 1);
506 /* Reads the version and date info record, as well as product and
507 subproduct identification records if present. */
509 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
511 static char empty_string[] = "";
512 char *date, *time, *product, *author, *subproduct;
517 error (r, _("Unrecognized version code `%c'."), r->cc);
518 date = read_pool_string (r);
519 time = read_pool_string (r);
520 product = match (r, '1') ? read_pool_string (r) : empty_string;
521 author = match (r, '2') ? read_pool_string (r) : empty_string;
522 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
525 if (strlen (date) != 8)
526 error (r, _("Bad date string length %zu."), strlen (date));
527 if (strlen (time) != 6)
528 error (r, _("Bad time string length %zu."), strlen (time));
530 /* Save file info. */
534 for (i = 0; i < 8; i++)
536 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
537 info->creation_date[map[i]] = date[i];
539 info->creation_date[2] = info->creation_date[5] = ' ';
540 info->creation_date[10] = 0;
543 for (i = 0; i < 6; i++)
545 static const int map[] = {0, 1, 3, 4, 6, 7};
546 info->creation_time[map[i]] = time[i];
548 info->creation_time[2] = info->creation_time[5] = ' ';
549 info->creation_time[8] = 0;
552 str_copy_trunc (info->product, sizeof info->product, product);
553 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
557 /* Translates a format specification read from portable file R as
558 the three integers PORTABLE_FORMAT into a normal format specifier,
559 checking that the format is appropriate for variable V. */
560 static struct fmt_spec
561 convert_format (struct pfm_reader *r, const int portable_format[3],
562 struct variable *v, bool *report_error)
564 struct fmt_spec format;
567 if (!fmt_from_io (portable_format[0], &format.type))
570 warning (r, _("%s: Bad format specifier byte (%d). Variable "
571 "will be assigned a default format."),
572 var_get_name (v), portable_format[0]);
576 format.w = portable_format[1];
577 format.d = portable_format[2];
580 ok = (fmt_check_output (&format)
581 && fmt_check_width_compat (&format, var_get_width (v)));
588 char fmt_string[FMT_STRING_LEN_MAX + 1];
589 fmt_to_string (&format, fmt_string);
590 if (var_is_numeric (v))
591 warning (r, _("Numeric variable %s has invalid format "
593 var_get_name (v), fmt_string);
595 warning (r, _("String variable %s with width %d has "
596 "invalid format specifier %s."),
597 var_get_name (v), var_get_width (v), fmt_string);
605 *report_error = false;
606 return fmt_default_for_width (var_get_width (v));
609 static union value parse_value (struct pfm_reader *, struct variable *);
611 /* Read information on all the variables. */
613 read_variables (struct pfm_reader *r, struct dictionary *dict)
615 char *weight_name = NULL;
619 error (r, _("Expected variable count record."));
621 r->var_cnt = read_int (r);
623 error (r, _("Invalid number of variables %d."), r->var_cnt);
624 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
626 /* Purpose of this value is unknown. It is typically 161. */
631 weight_name = read_pool_string (r);
632 if (strlen (weight_name) > SHORT_NAME_LEN)
633 error (r, _("Weight variable name (%s) truncated."), weight_name);
636 for (i = 0; i < r->var_cnt; i++)
642 struct missing_values miss;
643 struct fmt_spec print, write;
644 bool report_error = true;
648 error (r, _("Expected variable record."));
650 width = read_int (r);
652 error (r, _("Invalid variable width %d."), width);
653 r->widths[i] = width;
655 read_string (r, name);
656 for (j = 0; j < 6; j++)
657 fmt[j] = read_int (r);
659 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
660 error (r, _("Invalid variable name `%s' in position %d."), name, i);
661 str_uppercase (name);
663 if (width < 0 || width > 255)
664 error (r, _("Bad width %d for variable %s."), width, name);
666 v = dict_create_var (dict, name, width);
670 for (i = 1; i < 100000; i++)
672 char try_name[LONG_NAME_LEN + 1];
673 sprintf (try_name, "%.*s_%d", LONG_NAME_LEN - 6, name, i);
674 v = dict_create_var (dict, try_name, width);
679 error (r, _("Duplicate variable name %s in position %d."), name, i);
680 warning (r, _("Duplicate variable name %s in position %d renamed "
681 "to %s."), name, i, var_get_name (v));
684 print = convert_format (r, &fmt[0], v, &report_error);
685 write = convert_format (r, &fmt[3], v, &report_error);
686 var_set_print_format (v, &print);
687 var_set_write_format (v, &write);
689 /* Range missing values. */
690 mv_init (&miss, var_get_width (v));
693 double x = read_float (r);
694 double y = read_float (r);
695 mv_add_num_range (&miss, x, y);
697 else if (match (r, 'A'))
698 mv_add_num_range (&miss, read_float (r), HIGHEST);
699 else if (match (r, '9'))
700 mv_add_num_range (&miss, LOWEST, read_float (r));
702 /* Single missing values. */
703 while (match (r, '8'))
705 union value value = parse_value (r, v);
706 mv_add_value (&miss, &value);
709 var_set_missing_values (v, &miss);
714 read_string (r, label);
715 var_set_label (v, label);
719 if (weight_name != NULL)
721 struct variable *weight_var = dict_lookup_var (dict, weight_name);
722 if (weight_var == NULL)
723 error (r, _("Weighting variable %s not present in dictionary."),
726 dict_set_weight (dict, weight_var);
730 /* Parses a value for variable VV from R and returns it. */
732 parse_value (struct pfm_reader *r, struct variable *vv)
736 if (var_is_alpha (vv))
739 read_string (r, string);
740 buf_copy_str_rpad (v.s, 8, string);
743 v.f = read_float (r);
748 /* Parses a value label record from R and applies its labels to
748 the named variables in DICT. */
750 read_value_label (struct pfm_reader *r, struct dictionary *dict)
762 v = pool_nalloc (r->pool, nv, sizeof *v);
763 for (i = 0; i < nv; i++)
766 read_string (r, name);
768 v[i] = dict_lookup_var (dict, name);
770 error (r, _("Unknown variable %s while parsing value labels."), name);
772 if (var_get_type (v[0]) != var_get_type (v[i]))
773 error (r, _("Cannot assign value labels to %s and %s, which "
774 "have different variable types."),
775 var_get_name (v[0]), var_get_name (v[i]));
778 n_labels = read_int (r);
779 for (i = 0; i < n_labels; i++)
785 val = parse_value (r, v[0]);
786 read_string (r, label);
788 /* Assign the value label to each variable. */
789 for (j = 0; j < nv; j++)
791 struct variable *var = v[j];
793 if (!var_is_long_string (var))
794 var_replace_value_label (var, &val, label);
799 /* Reads a set of documents from portable file R into DICT. */
801 read_documents (struct pfm_reader *r, struct dictionary *dict)
806 line_cnt = read_int (r);
807 for (i = 0; i < line_cnt; i++)
810 read_string (r, line);
811 dict_add_document_line (dict, line);
815 /* Reads one case from portable file R into C. */
817 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
819 struct pfm_reader *r = r_;
823 case_create (c, casereader_get_value_cnt (reader));
824 setjmp (r->bail_out);
827 casereader_force_error (reader);
832 /* Check for end of file. */
840 for (i = 0; i < r->var_cnt; i++)
842 int width = r->widths[i];
846 case_data_rw_idx (c, idx)->f = read_float (r);
852 read_string (r, string);
853 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
854 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
861 /* Returns true if FILE is an SPSS portable file,
864 pfm_detect (FILE *file)
866 unsigned char header[464];
868 int cooked_cnt, raw_cnt;
871 cooked_cnt = raw_cnt = 0;
872 while (cooked_cnt < sizeof header)
875 if (c == EOF || raw_cnt++ > 512)
877 else if (c != '\n' && c != '\r')
878 header[cooked_cnt++] = c;
881 memset (trans, 0, 256);
882 for (i = 64; i < 256; i++)
884 unsigned char c = header[i + 200];
886 trans[c] = portable_to_local[i];
889 for (i = 0; i < 8; i++)
890 if (trans[header[i + 456]] != "SPSSPORT"[i])
896 static struct casereader_class por_file_casereader_class =
898 por_file_casereader_read,
899 por_file_casereader_destroy,