1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
28 #include "data/any-reader.h"
29 #include "data/casereader-provider.h"
30 #include "data/casereader.h"
31 #include "data/dictionary.h"
32 #include "data/file-handle-def.h"
33 #include "data/file-name.h"
34 #include "data/format.h"
35 #include "data/missing-values.h"
36 #include "data/short-names.h"
37 #include "data/value-labels.h"
38 #include "data/variable.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
48 #include "gl/xmemdup0.h"
/* Message translation shorthands: _() passes MSGID to gettext(), while N_()
   expands to MSGID itself, unchanged. */
51 #define _(msgid) gettext (msgid)
52 #define N_(msgid) (msgid)
54 /* portable_to_local[PORTABLE] translates the given portable
   character into the local character set.

   pfm_detect() consults entries 64 through 255 of this table when it
   builds a file's translation table.  NOTE(review): only part of the
   256-byte initializer is visible in this excerpt. */
56 static const char portable_to_local[256] =
59 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
60 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
64 /* Portable file reader.
   All of the state for reading one portable file.  pfm_open() allocates
   the structure from its own POOL, so destroying the pool in pfm_close()
   releases the structure as well. */
/* Embedded generic reader; pfm_reader_cast() converts a pointer to this
   member back into the enclosing pfm_reader. */
67 struct any_reader any_reader;
68 struct pool *pool; /* All the portable file state. */
70 jmp_buf bail_out; /* longjmp() target for error handling. */
/* Dictionary created by pfm_open() and filled in while the file's
   records are read. */
72 struct dictionary *dict;
/* File metadata; pfm_open() passes it to read_version_data(). */
73 struct any_read_info info;
74 struct file_handle *fh; /* File handle. */
75 struct fh_lock *lock; /* Read lock for file. */
76 FILE *file; /* File stream. */
77 int line_length; /* Number of characters so far on this line. */
78 char cc; /* Current character. */
79 char *trans; /* 256-byte character set translation table. */
80 int n_vars; /* Number of variables. */
81 int weight_index; /* 0-based index of weight variable, or -1. */
82 struct caseproto *proto; /* Format of output cases. */
83 bool ok; /* Set false on I/O error. */
/* Forward declaration; the class is defined near the end of this file and
   is referenced earlier by pfm_decode(). */
86 static const struct casereader_class por_file_casereader_class;
/* Returns the pfm_reader that contains R_ as its `any_reader' member.
   Asserts that R_ really belongs to por_file_reader_class, which is what
   makes the conversion valid. */
88 static struct pfm_reader *
89 pfm_reader_cast (const struct any_reader *r_)
91 assert (r_->klass == &por_file_reader_class);
92 return UP_CAST (r_, struct pfm_reader, any_reader);
/* Forward declaration of error(), which is defined immediately below. */
96 error (struct pfm_reader *r, const char *msg,...)
100 /* Displays MSG as an error message and aborts reading the
   portable file via longjmp().

   MSG is a printf-style format string that is expanded with the arguments
   that follow it.  The text is prefixed with the file name and the current
   offset in the file, and is reported in the general category with error
   severity.  Control then transfers to the setjmp() point saved in
   R->bail_out, so this function does not return to its caller. */
103 error (struct pfm_reader *r, const char *msg, ...)
108 ds_init_empty (&text);
109 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
110 fh_get_file_name (r->fh), (long long int) ftello (r->file));
111 va_start (args, msg);
112 ds_put_vformat (&text, msg, args);
/* The message structure is heap-allocated and takes over the formatted
   text from TEXT. */
115 struct msg *m = xmalloc (sizeof *m);
117 .category = MSG_C_GENERAL,
118 .severity = MSG_S_ERROR,
119 .text = ds_steal_cstr (&text),
125 longjmp (r->bail_out, 1);
128 /* Displays MSG as a warning for the current position in
   portable file reader R.

   MSG is a printf-style format string that is expanded with the arguments
   that follow it.  The text is prefixed with the file name and the current
   offset in the file, and is reported in the general category with warning
   severity.  Unlike error(), nothing here calls longjmp(), so reading
   continues after the warning. */
131 warning (struct pfm_reader *r, const char *msg, ...)
136 ds_init_empty (&text);
137 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
138 fh_get_file_name (r->fh), (long long int) ftello (r->file));
139 va_start (args, msg);
140 ds_put_vformat (&text, msg, args);
/* The message structure is heap-allocated and takes over the formatted
   text from TEXT. */
143 struct msg *m = xmalloc (sizeof *m);
145 .category = MSG_C_GENERAL,
146 .severity = MSG_S_WARNING,
147 .text = ds_steal_cstr (&text),
152 /* Close and destroy R.
   Returns false if an error was detected on R, true otherwise.

   Drops the reader's reference to its dictionary, destroys its file
   metadata, closes the file (reporting a failure to close it), and finally
   destroys the reader's pool. */
155 pfm_close (struct any_reader *r_)
157 struct pfm_reader *r = pfm_reader_cast (r_);
160 dict_unref (r->dict);
161 any_read_info_destroy (&r->info);
164 if (fn_close (r->fh, r->file) == EOF)
166 msg (ME, _("Error closing portable file `%s': %s."),
167 fh_get_file_name (r->fh), strerror (errno));
/* R itself was allocated from this pool by pfm_open(), so R must not be
   touched after this call. */
177 pool_destroy (r->pool);
182 /* Closes portable file reader R, after we're done with it.
   This is the second member of por_file_casereader_class.  R_ is the
   pfm_reader; if pfm_close() reports that an error was detected on it,
   the error is propagated to READER. */
184 por_file_casereader_destroy (struct casereader *reader, void *r_)
186 struct pfm_reader *r = r_;
187 if (!pfm_close (&r->any_reader))
188 casereader_force_error (reader);
191 /* Reads a single character from R's file, making it the current
   character.  Reports "unexpected end of file" through error(), which
   does not return, if the file ends.
   NOTE(review): struct pfm_reader has no member named `cur_char'; the
   current character is presumably R->cc, but the statement that stores it
   is not visible in this excerpt -- confirm. */
193 advance (struct pfm_reader *r)
197 /* Read the next character from the file.
198 Ignore carriage returns entirely.
199 Mostly ignore new-lines, but if a new-line occurs before the
200 line has reached 80 bytes in length, then treat the
201 "missing" bytes as spaces. */
204 while ((c = getc (r->file)) == '\r')
/* While the line is still shorter than 80 characters, the new-line is
   pushed back onto the stream so that the next call encounters it
   again. */
209 if (r->line_length < 80)
212 ungetc ('\n', r->file);
218 error (r, _("unexpected end of file"));
/* R->trans stays null until a translation table has been installed, so
   this test distinguishes the two cases. */
220 if (r->trans != NULL)
/* match (R, C): callers in this file use the result as a truth value, for
   example `while (match (r, 'D'))' and `negative = match (r, '-')'.
   NOTE(review): the end of the comment below and the function body are
   not visible in this excerpt. */
226 /* Skip a single character if present, and return whether it was
229 match (struct pfm_reader *r, int c)
/* Forward declarations of the record-reading helpers that are defined
   later in this file.  Each takes the reader; most also take the object
   that receives what is read. */
240 static void read_header (struct pfm_reader *);
241 static void read_version_data (struct pfm_reader *, struct any_read_info *);
242 static void read_variables (struct pfm_reader *, struct dictionary *);
243 static void read_value_label (struct pfm_reader *, struct dictionary *);
244 static void read_documents (struct pfm_reader *, struct dictionary *);
246 /* Opens the portable file with handle FH for reading and reads
   everything that precedes the case data: version data, variables, value
   labels and documents.

   On success, returns the new reader's embedded any_reader.  The
   dictionary that was read is kept in the reader (R->dict), along with
   the file metadata (R->info) and the case prototype (R->proto).

   A corrupt file makes one of the helpers call error(), which jumps back
   to the setjmp() below.  NOTE(review): the failure path, which ends with
   the pfm_close() call at the bottom, is only partly visible in this
   excerpt. */
249 static struct any_reader *
250 pfm_open (struct file_handle *fh)
/* Both locals are assigned before the setjmp() below; they are volatile
   presumably so that their values are still reliable after error() has
   jumped back to it. */
252 struct pool *volatile pool = NULL;
253 struct pfm_reader *volatile r = NULL;
255 /* Create and initialize reader. */
256 pool = pool_create ();
257 r = pool_alloc (pool, sizeof *r);
258 r->any_reader.klass = &por_file_reader_class;
259 r->dict = dict_create (get_default_encoding ());
260 memset (&r->info, 0, sizeof r->info);
266 r->weight_index = -1;
/* error() returns control to this point with longjmp(). */
271 if (setjmp (r->bail_out))
275 /* TRANSLATORS: this fragment will be interpolated into
276 messages in fh_lock() that identify types of files. */
277 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
/* The file is opened in binary mode; advance() deals with carriage
   returns and new-lines itself. */
282 r->file = fn_open (r->fh, "rb");
285 msg (ME, _("An error occurred while opening `%s' for reading "
286 "as a portable file: %s."),
287 fh_get_file_name (r->fh), strerror (errno));
291 /* Read header, version, date info, product id, variables. */
293 read_version_data (r, &r->info);
294 read_variables (r, r->dict);
296 /* Read value labels. */
297 while (match (r, 'D'))
298 read_value_label (r, r->dict);
300 /* Read documents. */
302 read_documents (r, r->dict);
304 /* Check that we've made it to the data. */
306 error (r, _("Data record expected."));
/* The case prototype reference is registered with the reader's pool. */
308 r->proto = caseproto_ref_pool (dict_get_proto (r->dict), r->pool);
309 return &r->any_reader;
312 pfm_close (&r->any_reader);
/* Creates and returns a sequential casereader that reads cases from R_
   through por_file_casereader_class, using R_'s case prototype and passing
   the pfm_reader itself as the class's auxiliary data.  ENCODING is
   unused.
   NOTE(review): the statements that fill in *DICTP and *INFO are not
   visible in this excerpt; what is visible is that R->info is zeroed
   before returning. */
316 static struct casereader *
317 pfm_decode (struct any_reader *r_, const char *encoding UNUSED,
318 struct dictionary **dictp, struct any_read_info *info)
320 struct pfm_reader *r = pfm_reader_cast (r_);
328 memset (&r->info, 0, sizeof r->info);
331 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
332 &por_file_casereader_class, r);
/* Returns the value (0...29) of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are `0' through `9' followed by `A' through `T'.

   The null character is rejected explicitly before the table lookup:
   strchr() treats the string terminator as part of the string, so without
   this check a null character (which is what an untranslatable input
   character maps to) would be reported as a digit with the out-of-range
   value 30. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
345 /* Reads a floating point value from R and returns its value.

   Numbers are written in base 30 (see base_30_value()): optional leading
   spaces, an optional `-', digits with at most one `.', an optional
   exponent introduced by `+' or `-', and a terminating `/'.  Malformed
   input is reported through error(), which does not return. */
347 read_float (struct pfm_reader *r)
351 bool got_dot = false; /* Seen a decimal point? */
352 bool got_digit = false; /* Seen any digits? */
353 bool negative = false; /* Number is negative? */
355 /* Skip leading spaces. */
356 while (match (r, ' '))
359 /* `*' indicates system-missing. */
362 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
366 negative = match (r, '-');
/* base_30_value() yields -1 for anything that is not a base-30 digit. */
369 int digit = base_30_value (r->cc);
374 /* Make sure that multiplication by 30 will not overflow. */
375 if (num > DBL_MAX * (1. / 30.))
376 /* The value of the digit doesn't matter, since we have already
   gotten as many digits as can be represented in a `double'.
   This doesn't necessarily mean the result will overflow.
   The exponent may reduce it to within range.

   We just need to record that there was another
   digit so that we can multiply by 30 later. */
385 num = (num * 30.0) + digit;
387 /* Keep track of the number of digits after the decimal point.
388 If we just divided by 30 here, we would lose precision. */
392 else if (!got_dot && r->cc == '.')
393 /* Record that we have found the decimal point. */
396 /* Any other character terminates the number. */
402 /* Check that we had some digits. */
404 error (r, _("Number expected."));
406 /* Get exponent if any. */
407 if (r->cc == '+' || r->cc == '-')
410 bool negative_exponent = r->cc == '-';
413 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
/* Guards the multiplication by 30 two lines below against overflowing
   a long. */
415 if (exp > LONG_MAX / 30)
420 exp = exp * 30 + digit;
423 /* We don't check whether there were actually any digits, but we
425 if (negative_exponent)
430 /* Numbers must end with `/'. */
432 error (r, _("Missing numeric terminator."));
434 /* Multiply `num' by 30 to the `exponent' power, checking for
437 num *= pow (30.0, (double) exponent);
438 else if (exponent > 0)
440 if (num > DBL_MAX * pow (30.0, (double) -exponent))
443 num *= pow (30.0, (double) exponent);
446 return negative ? -num : num;
449 /* Read an integer and return its value.  The number is read with read_float(); a value that is not integral, or that does not lie strictly between INT_MIN and INT_MAX, is reported as "Invalid integer." through error(), which does not return. */
451 read_int (struct pfm_reader *r)
453 double f = read_float (r);
454 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
455 error (r, _("Invalid integer."));
/* read_string (R, BUF): the string is preceded by its length, which is
   read with read_int().  A length outside 0...255 is reported through
   error(), which does not return.
   NOTE(review): the end of the comment below and the part of the body
   that fills BUF are not visible in this excerpt. */
459 /* Reads a string into BUF, which must have room for 256
462 read_string (struct pfm_reader *r, char *buf)
464 int n = read_int (r);
465 if (n < 0 || n > 255)
466 error (r, _("Bad string length %d."), n);
/* read_bytes (R, BUF): like read_string(), but BUF is an array of uint8_t
   and, according to the comment below, the number of bytes read is
   returned.  The length is read with read_int(); a length outside 0...255
   is reported through error(), which does not return.
   NOTE(review): the end of the comment below and the part of the body
   that fills BUF are not visible in this excerpt. */
477 /* Reads a string into BUF, which must have room for 256
479 Returns the number of bytes read.
482 read_bytes (struct pfm_reader *r, uint8_t *buf)
484 int n = read_int (r);
485 if (n < 0 || n > 255)
486 error (r, _("Bad string length %d."), n);
/* read_pool_string (R): reads a string with read_string() into a local
   buffer and returns a copy made with pool_strdup() from R->pool, so the
   copy is released together with the reader's pool.
   NOTE(review): the end of the comment below and the declaration of the
   local buffer are not visible in this excerpt. */
498 /* Reads a string and returns a copy of it allocated from R's
501 read_pool_string (struct pfm_reader *r)
504 read_string (r, string);
505 return pool_strdup (r->pool, string);
508 /* Reads the 464-byte file header.
   The header consists of 200 bytes of splash strings, a 256-byte
   character translation table and the 8-byte signature "SPSSPORT"
   (200 + 256 + 8 = 464).  If the signature does not match, the file is
   reported as not being a portable file and reading is abandoned through
   R->bail_out. */
510 read_header (struct pfm_reader *r)
515 /* Read and ignore vanity splash strings. */
516 for (i = 0; i < 200; i++)
519 /* Skip the first 64 characters of the translation table.
520 We don't care about these. They are probably all set to
521 '0', marking them as untranslatable, and that would screw
522 up our actual translation of the real '0'. */
523 for (i = 0; i < 64; i++)
526 /* Read the rest of the translation table. */
/* The table is allocated from the reader's pool and starts out all
   zeros, so characters that never receive an entry translate to 0. */
527 trans = pool_malloc (r->pool, 256);
528 memset (trans, 0, 256);
537 trans[c] = portable_to_local[i];
540 /* Set up the translation table, then read the first
541 translated character. */
545 /* Skip and verify signature. */
546 for (i = 0; i < 8; i++)
547 if (!match (r, "SPSSPORT"[i]))
/* This failure is reported with msg() rather than error(), so the message
   does not get the "corrupt at offset" prefix that error() adds; the
   longjmp() then abandons reading just as error() would. */
549 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
550 longjmp (r->bail_out, 1);
554 /* Reads the version and date info record, as well as product and
   subproduct identification records if present.

   The results are stored in INFO, which is cleared first.  The strings
   placed in INFO are allocated with xmalloc() and xstrdup(), separately
   from the reader's pool.  Malformed input is reported through error(),
   which does not return. */
557 read_version_data (struct pfm_reader *r, struct any_read_info *info)
559 static const char empty_string[] = "";
561 const char *product, *subproduct;
566 error (r, _("Unrecognized version code `%c'."), r->cc);
/* The strings read here are copies allocated from the reader's pool. */
567 date = read_pool_string (r);
568 time = read_pool_string (r);
/* The product record is optional and is introduced by `1'. */
569 product = match (r, '1') ? read_pool_string (r) : empty_string;
572 /* Skip "author" field. */
573 read_pool_string (r);
/* The subproduct record is optional and is introduced by `3'. */
575 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
/* The rearrangement below indexes exactly 8 date and 6 time characters. */
578 if (strlen (date) != 8)
579 error (r, _("Bad date string length %zu."), strlen (date));
580 if (strlen (time) != 6)
581 error (r, _("Bad time string length %zu."), strlen (time));
583 /* Save file info. */
586 memset (info, 0, sizeof *info);
588 info->float_format = FLOAT_NATIVE_DOUBLE;
589 info->integer_format = INTEGER_NATIVE;
590 info->compression = ANY_COMP_NONE;
/* The date becomes a 10-character string: date[0..3] go to positions 6-9,
   date[4..5] to positions 3-4 and date[6..7] to positions 0-1, with
   spaces at positions 2 and 5 (presumably turning YYYYMMDD into
   "DD MM YYYY"). */
594 info->creation_date = xmalloc (11);
595 for (i = 0; i < 8; i++)
597 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
598 info->creation_date[map[i]] = date[i];
600 info->creation_date[2] = info->creation_date[5] = ' ';
601 info->creation_date[10] = '\0';
/* The time becomes an 8-character string: its three pairs of characters
   keep their order and are separated by spaces at positions 2 and 5
   (presumably turning HHMMSS into "HH MM SS"). */
604 info->creation_time = xmalloc (9);
605 for (i = 0; i < 6; i++)
607 static const int map[] = {0, 1, 3, 4, 6, 7};
608 info->creation_time[map[i]] = time[i];
610 info->creation_time[2] = info->creation_time[5] = ' ';
611 info->creation_time[8] = 0;
614 info->product = xstrdup (product);
615 info->product_ext = xstrdup (subproduct);
619 /* Translates a format specification read from portable file R as
   the three integers PORTABLE_FORMAT (format type code, width, decimals)
   into a normal format specifier, which is returned, checking that the
   format is appropriate for variable V.

   A type code that is not recognized, or a format that is not a valid
   output format compatible with V's width, draws a warning.  The fallback
   path at the end clears *REPORT_ERROR and returns the default format for
   V's width.  read_variables() passes the same flag for a variable's print
   and write formats.
   NOTE(review): the tests of *REPORT_ERROR are not visible in this
   excerpt; presumably the flag keeps a variable from being reported
   twice -- confirm. */
622 static struct fmt_spec
623 convert_format (struct pfm_reader *r, const int portable_format[3],
624 struct variable *v, bool *report_error)
626 struct fmt_spec format;
629 if (!fmt_from_io (portable_format[0], &format.type))
632 warning (r, _("%s: Bad format specifier byte (%d). Variable "
633 "will be assigned a default format."),
634 var_get_name (v), portable_format[0]);
638 format.w = portable_format[1];
639 format.d = portable_format[2];
642 ok = (fmt_check_output (&format)
643 && fmt_check_width_compat (&format, var_get_width (v)));
/* The rejected format is rendered as text so that the warning can name
   it. */
650 char fmt_string[FMT_STRING_LEN_MAX + 1];
651 fmt_to_string (&format, fmt_string);
652 if (var_is_numeric (v))
653 warning (r, _("Numeric variable %s has invalid format "
655 var_get_name (v), fmt_string);
657 warning (r, _("String variable %s with width %d has "
658 "invalid format specifier %s."),
659 var_get_name (v), var_get_width (v), fmt_string);
667 *report_error = false;
668 return fmt_default_for_width (var_get_width (v));
/* Forward declaration: parse_value() is defined after read_variables(),
   which calls it. */
671 static void parse_value (struct pfm_reader *, int width, union value *);
673 /* Read information on all the variables.
   Reads the variable count and then one record per variable, creating
   each variable in DICT with its print and write formats, missing values
   and label.  If a weight variable name was read, the variable is looked
   up once all the variables exist and is made DICT's weight.  Malformed
   input is reported through error(), which does not return. */
675 read_variables (struct pfm_reader *r, struct dictionary *dict)
677 char *weight_name = NULL;
681 error (r, _("Expected variable count record."));
683 r->n_vars = read_int (r);
685 error (r, _("Invalid number of variables %d."), r->n_vars);
/* The weight variable's name is only remembered here; it is resolved
   after the loop, once every variable has been created. */
692 weight_name = read_pool_string (r);
693 if (strlen (weight_name) > SHORT_NAME_LEN)
694 error (r, _("Weight variable name (%s) truncated."), weight_name);
697 for (i = 0; i < r->n_vars; i++)
703 struct missing_values miss;
704 struct fmt_spec print, write;
/* One flag is shared by both convert_format() calls for this variable. */
705 bool report_error = true;
709 error (r, _("Expected variable record."));
711 width = read_int (r);
713 error (r, _("Invalid variable width %d."), width);
715 read_string (r, name);
/* Six integers follow the name: the first three describe the print
   format and the last three the write format (see the convert_format()
   calls below). */
716 for (j = 0; j < 6; j++)
717 fmt[j] = read_int (r);
/* Names that start with `#' or `$' are rejected even if the dictionary
   would accept them as identifiers. */
719 if (!dict_id_is_valid (dict, name, false)
720 || *name == '#' || *name == '$')
721 error (r, _("Invalid variable name `%s' in position %d."), name, i);
722 str_uppercase (name);
724 if (width < 0 || width > 255)
725 error (r, _("Bad width %d for variable %s."), width, name);
727 v = dict_create_var (dict, name, width);
/* Another name is tried by appending `_' and a number to NAME.
   NOTE(review): `i' is formatted with %lu here but with %d in the
   messages above and below; the declaration or declarations of `i' are
   not visible in this excerpt, so confirm that every format matches its
   argument's type and that the "position" reported below is the
   variable's index. */
733 char *try_name = xasprintf ("%s_%lu", name, i);
734 v = dict_create_var (dict, try_name, width);
739 warning (r, _("Duplicate variable name %s in position %d renamed "
740 "to %s."), name, i, var_get_name (v));
743 print = convert_format (r, &fmt[0], v, &report_error);
744 write = convert_format (r, &fmt[3], v, &report_error);
745 var_set_print_format (v, &print);
746 var_set_write_format (v, &write);
748 /* Range missing values. */
749 mv_init (&miss, width);
752 double x = read_float (r);
753 double y = read_float (r);
754 mv_add_range (&miss, x, y);
/* `A' introduces a range that is open at the top... */
756 else if (match (r, 'A'))
757 mv_add_range (&miss, read_float (r), HIGHEST);
/* ...and `9' one that is open at the bottom. */
758 else if (match (r, '9'))
759 mv_add_range (&miss, LOWEST, read_float (r));
761 /* Single missing values. */
762 while (match (r, '8'))
/* A missing value is parsed with a width of at most 8 and then resized to
   the variable's full width before it is added. */
764 int mv_width = MIN (width, 8);
767 parse_value (r, mv_width, &value);
768 value_resize (&value, mv_width, width);
769 mv_add_value (&miss, &value);
770 value_destroy (&value, width);
773 var_set_missing_values (v, &miss);
779 read_string (r, label);
780 var_set_label (v, label); /* XXX */
784 if (weight_name != NULL)
786 struct variable *weight_var = dict_lookup_var (dict, weight_name);
787 if (weight_var == NULL)
788 error (r, _("Weighting variable %s not present in dictionary."),
791 dict_set_weight (dict, weight_var);
795 /* Initializes V as a value of width WIDTH and parses a value from R into it.  A string value is read with read_bytes() and copied into V padded on the right with spaces to WIDTH; a numeric value is read with read_float().  V is initialized here with value_init(); the callers in this file release it with value_destroy().  NOTE(review): the test that selects between the string and numeric cases is not visible in this excerpt. */
797 parse_value (struct pfm_reader *r, int width, union value *v)
799 value_init (v, width);
803 size_t n_bytes = read_bytes (r, buf);
804 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
807 v->f = read_float (r);
810 /* Parses a value label record from R, attaching its labels to the
   variables in DICT that the record names.  The function is declared
   `static void' earlier in this file, so nothing is returned: failures
   are reported through error(), which does not return. */
812 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* The array of variables is allocated from the reader's pool, so it is
   released only when the pool is destroyed. */
824 v = pool_nalloc (r->pool, nv, sizeof *v);
825 for (i = 0; i < nv; i++)
828 read_string (r, name);
830 v[i] = dict_lookup_var (dict, name);
832 error (r, _("Unknown variable %s while parsing value labels."), name);
/* Every variable in the record must have the same type as the first. */
834 if (var_get_type (v[0]) != var_get_type (v[i]))
835 error (r, _("Cannot assign value labels to %s and %s, which "
836 "have different variable types."),
837 var_get_name (v[0]), var_get_name (v[i]));
840 n_labels = read_int (r);
841 for (i = 0; i < n_labels; i++)
/* NOTE(review): values are parsed and destroyed using the width of the
   first variable only, while the check above compares types rather than
   widths; confirm that string variables of different widths are handled
   as intended. */
847 parse_value (r, var_get_width (v[0]), &val);
848 read_string (r, label);
850 /* Assign the value label to each variable. */
851 for (j = 0; j < nv; j++)
852 var_replace_value_label (v[j], &val, label);
854 value_destroy (&val, var_get_width (v[0]));
858 /* Reads a set of documents from portable file R into DICT.
   The record holds a line count followed by that many strings; each
   string is appended to DICT as one document line. */
860 read_documents (struct pfm_reader *r, struct dictionary *dict)
862 int n_lines = read_int (r);
863 for (int i = 0; i < n_lines; i++)
866 read_string (r, line);
867 dict_add_document_line (dict, line, false);
871 /* Reads and returns one case from portable file R.  Returns a
   null pointer on failure.

   This is the first member of por_file_casereader_class; R_ is the
   pfm_reader.  One value is read for each of the R->n_vars variables,
   using the widths recorded in R->proto. */
873 static struct ccase *
874 por_file_casereader_read (struct casereader *reader, void *r_)
876 struct pfm_reader *r = r_;
877 struct ccase *volatile c;
880 c = case_create (r->proto);
/* Makes this function the target of the longjmp() in error() while values
   are being read.  The return value of setjmp() is not tested here.
   NOTE(review): the failure test that follows is not visible in this
   excerpt (presumably it examines R->ok) -- confirm. */
881 setjmp (r->bail_out);
884 casereader_force_error (reader);
889 /* Check for end of file. */
896 for (i = 0; i < r->n_vars; i++)
898 int width = caseproto_get_width (r->proto, i);
/* A numeric value is stored directly in the case... */
901 *case_num_rw_idx (c, i) = read_float (r);
/* ...whereas string data is copied in and padded on the right with
   spaces to the variable's width. */
905 size_t n_bytes = read_bytes (r, buf);
906 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
913 /* Detects whether FILE is an SPSS portable file. Returns 1 if so, 0 if not,
   and a negative errno value if there is an error reading FILE.

   The 464-byte header is assembled with lines shorter than 80 characters
   padded out with spaces.  The translation table stored at header offset
   200 is then used to check for the signature "SPSSPORT" at offset 456. */
916 pfm_detect (FILE *file)
918 unsigned char header[464];
/* N_COOKED counts header bytes stored after new-line handling; N_RAWS
   counts the characters taken from FILE. */
920 int n_cooked, n_raws, line_len;
923 n_cooked = n_raws = 0;
925 while (n_cooked < sizeof header)
/* Gives up at end of file, on a read error, or once more than about 512
   characters have been consumed without completing the header. */
928 if (c == EOF || n_raws++ > 512)
929 return ferror (file) ? -errno : 0;
/* Pads a short line with spaces, without running past the end of the
   header. */
932 while (line_len < 80 && n_cooked < sizeof header)
934 header[n_cooked++] = ' ';
941 header[n_cooked++] = c;
/* Builds the translation table from entries 64 through 255 of the table
   in the header, which begins at offset 200. */
946 memset (trans, 0, 256);
947 for (i = 64; i < 256; i++)
949 unsigned char c = header[i + 200];
951 trans[c] = portable_to_local[i];
/* The signature occupies the last 8 bytes of the header
   (456 = 200 + 256). */
954 for (i = 0; i < 8; i++)
955 if (trans[header[i + 456]] != "SPSSPORT"[i])
/* Casereader class for portable files; pfm_decode() passes it to
   casereader_create_sequential().  The members visible here are the
   function that reads one case and the function that closes the reader.
   NOTE(review): any further members are not visible in this excerpt. */
961 static const struct casereader_class por_file_casereader_class =
963 por_file_casereader_read,
964 por_file_casereader_destroy,
969 const struct any_reader_class por_file_reader_class =
971 N_("SPSS Portable File"),
976 NULL, /* get_strings */