1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
28 #include "data/any-reader.h"
29 #include "data/casereader-provider.h"
30 #include "data/casereader.h"
31 #include "data/dictionary.h"
32 #include "data/file-handle-def.h"
33 #include "data/file-name.h"
34 #include "data/format.h"
35 #include "data/missing-values.h"
36 #include "data/short-names.h"
37 #include "data/value-labels.h"
38 #include "data/variable.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
48 #include "gl/xmemdup0.h"
51 #define _(msgid) gettext (msgid)
52 #define N_(msgid) (msgid)
54 /* portable_to_local[PORTABLE] translates the given portable
55 character into the local character set. */
56 static const char portable_to_local[256] =
59 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
60 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
64 /* Portable file reader. */
67 struct any_reader any_reader;
68 struct pool *pool; /* All the portable file state. */
70 jmp_buf bail_out; /* longjmp() target for error handling. */
72 struct dictionary *dict;
73 struct any_read_info info;
74 struct file_handle *fh; /* File handle. */
75 struct fh_lock *lock; /* Read lock for file. */
76 FILE *file; /* File stream. */
77 int line_length; /* Number of characters so far on this line. */
78 char cc; /* Current character. */
79 char *trans; /* 256-byte character set translation table. */
80 int n_vars; /* Number of variables. */
81 int weight_index; /* 0-based index of weight variable, or -1. */
82 struct caseproto *proto; /* Format of output cases. */
83 bool ok; /* Set false on I/O error. */
86 static const struct casereader_class por_file_casereader_class;
88 static struct pfm_reader *
89 pfm_reader_cast (const struct any_reader *r_)
91 assert (r_->klass == &por_file_reader_class);
92 return UP_CAST (r_, struct pfm_reader, any_reader);
96 error (struct pfm_reader *r, const char *msg,...)
100 /* Displays MSG as an error message and aborts reading the
101 portable file via longjmp(). */
103 error (struct pfm_reader *r, const char *msg, ...)
108 ds_init_empty (&text);
109 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
110 fh_get_file_name (r->fh), (long long int) ftello (r->file));
111 va_start (args, msg);
112 ds_put_vformat (&text, msg, args);
115 struct msg *m = xmalloc (sizeof *m);
117 .category = MSG_C_GENERAL,
118 .severity = MSG_S_ERROR,
119 .text = ds_steal_cstr (&text),
125 longjmp (r->bail_out, 1);
128 /* Displays MSG as a warning for the current position in
129 portable file reader R. */
131 warning (struct pfm_reader *r, const char *msg, ...)
136 ds_init_empty (&text);
137 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
138 fh_get_file_name (r->fh), (long long int) ftello (r->file));
139 va_start (args, msg);
140 ds_put_vformat (&text, msg, args);
143 struct msg *m = xmalloc (sizeof *m);
145 .category = MSG_C_GENERAL,
146 .severity = MSG_S_WARNING,
147 .text = ds_steal_cstr (&text),
152 /* Close and destroy R.
153 Returns false if an error was detected on R, true otherwise. */
155 pfm_close (struct any_reader *r_)
157 struct pfm_reader *r = pfm_reader_cast (r_);
160 dict_unref (r->dict);
161 any_read_info_destroy (&r->info);
164 if (fn_close (r->fh, r->file) == EOF)
166 msg (ME, _("Error closing portable file `%s': %s."),
167 fh_get_file_name (r->fh), strerror (errno));
177 pool_destroy (r->pool);
182 /* Closes portable file reader R, after we're done with it. */
184 por_file_casereader_destroy (struct casereader *reader, void *r_)
186 struct pfm_reader *r = r_;
187 if (!pfm_close (&r->any_reader))
188 casereader_force_error (reader);
191 /* Read a single character into R->cc. */
193 advance (struct pfm_reader *r)
197 /* Read the next character from the file.
198 Ignore carriage returns entirely.
199 Mostly ignore new-lines, but if a new-line occurs before the
200 line has reached 80 bytes in length, then treat the
201 "missing" bytes as spaces. */
204 while ((c = getc (r->file)) == '\r')
209 if (r->line_length < 80)
212 ungetc ('\n', r->file);
218 error (r, _("Unexpected end of file"));
220 if (r->trans != NULL)
226 /* Skip a single character if present, and return whether it was
229 match (struct pfm_reader *r, int c)
240 static void read_header (struct pfm_reader *);
241 static void read_version_data (struct pfm_reader *, struct any_read_info *);
242 static void read_variables (struct pfm_reader *, struct dictionary *);
243 static void read_value_label (struct pfm_reader *, struct dictionary *);
244 static void read_documents (struct pfm_reader *, struct dictionary *);
246 /* Reads the dictionary from file with handle FH, and returns it in a
247 dictionary structure. This dictionary may be modified in order to
248 rename, reorder, and delete variables, etc. */
249 static struct any_reader *
250 pfm_open (struct file_handle *fh)
252 struct pool *volatile pool = NULL;
253 struct pfm_reader *volatile r = NULL;
255 /* Create and initialize reader. */
256 pool = pool_create ();
257 r = pool_alloc (pool, sizeof *r);
258 r->any_reader.klass = &por_file_reader_class;
259 r->dict = dict_create (get_default_encoding ());
260 memset (&r->info, 0, sizeof r->info);
266 r->weight_index = -1;
271 if (setjmp (r->bail_out))
275 /* TRANSLATORS: this fragment will be interpolated into
276 messages in fh_lock() that identify types of files. */
277 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
282 r->file = fn_open (r->fh, "rb");
285 msg (ME, _("An error occurred while opening `%s' for reading "
286 "as a portable file: %s."),
287 fh_get_file_name (r->fh), strerror (errno));
291 /* Read header, version, date info, product id, variables. */
293 read_version_data (r, &r->info);
294 read_variables (r, r->dict);
296 /* Read value labels. */
297 while (match (r, 'D'))
298 read_value_label (r, r->dict);
300 /* Read documents. */
302 read_documents (r, r->dict);
304 /* Check that we've made it to the data. */
306 error (r, _("Data record expected."));
308 r->proto = caseproto_ref_pool (dict_get_proto (r->dict), r->pool);
309 return &r->any_reader;
312 pfm_close (&r->any_reader);
316 static struct casereader *
317 pfm_decode (struct any_reader *r_, const char *encoding UNUSED,
318 struct dictionary **dictp, struct any_read_info *info)
320 struct pfm_reader *r = pfm_reader_cast (r_);
328 memset (&r->info, 0, sizeof r->info);
331 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
332 &por_file_casereader_class, r);
335 /* Returns the value of base-30 digit C,
336 or -1 if C is not a base-30 digit. */
338 base_30_value (unsigned char c)
340 static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
341 const char *p = strchr (base_30_digits, c);
342 return p != NULL ? p - base_30_digits : -1;
345 /* Read a floating point value and return its value. */
347 read_float (struct pfm_reader *r)
351 bool got_dot = false; /* Seen a decimal point? */
352 bool got_digit = false; /* Seen any digits? */
353 bool negative = false; /* Number is negative? */
355 /* Skip leading spaces. */
356 while (match (r, ' '))
359 /* `*' indicates system-missing. */
362 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
366 negative = match (r, '-');
369 int digit = base_30_value (r->cc);
374 /* Make sure that multiplication by 30 will not overflow. */
375 if (num > DBL_MAX * (1. / 30.))
376 /* The value of the digit doesn't matter, since we have already
377 gotten as many digits as can be represented in a `double'.
378 This doesn't necessarily mean the result will overflow.
379 The exponent may reduce it to within range.
381 We just need to record that there was another
382 digit so that we can multiply by 30 later. */
385 num = (num * 30.0) + digit;
387 /* Keep track of the number of digits after the decimal point.
388 If we just divided by 30 here, we would lose precision. */
392 else if (!got_dot && r->cc == '.')
393 /* Record that we have found the decimal point. */
396 /* Any other character terminates the number. */
402 /* Check that we had some digits. */
404 error (r, _("Number expected."));
406 /* Get exponent if any. */
407 if (r->cc == '+' || r->cc == '-')
410 bool negative_exponent = r->cc == '-';
413 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
415 if (exp > LONG_MAX / 30)
420 exp = exp * 30 + digit;
423 /* We don't check whether there were actually any digits, but we
425 if (negative_exponent)
430 /* Numbers must end with `/'. */
432 error (r, _("Missing numeric terminator."));
434 /* Multiply `num' by 30 to the `exponent' power, checking for
437 num *= pow (30.0, (double) exponent);
438 else if (exponent > 0)
440 if (num > DBL_MAX * pow (30.0, (double) -exponent))
443 num *= pow (30.0, (double) exponent);
446 return negative ? -num : num;
449 /* Read an integer and return its value. */
451 read_int (struct pfm_reader *r)
453 double f = read_float (r);
454 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
455 error (r, _("Invalid integer."));
459 /* Reads a string into BUF, which must have room for 256
462 read_string (struct pfm_reader *r, char *buf)
464 int n = read_int (r);
465 if (n < 0 || n > 255)
466 error (r, _("Bad string length %d."), n);
477 /* Reads a string into BUF, which must have room for 256
479 Returns the number of bytes read.
482 read_bytes (struct pfm_reader *r, uint8_t *buf)
484 int n = read_int (r);
485 if (n < 0 || n > 255)
486 error (r, _("Bad string length %d."), n);
498 /* Reads a string and returns a copy of it allocated from R's
501 read_pool_string (struct pfm_reader *r)
504 read_string (r, string);
505 return pool_strdup (r->pool, string);
508 /* Reads the 464-byte file header. */
510 read_header (struct pfm_reader *r)
515 /* Read and ignore vanity splash strings. */
516 for (i = 0; i < 200; i++)
519 /* Skip the first 64 characters of the translation table.
520 We don't care about these. They are probably all set to
521 '0', marking them as untranslatable, and that would screw
522 up our actual translation of the real '0'. */
523 for (i = 0; i < 64; i++)
526 /* Read the rest of the translation table. */
527 trans = pool_malloc (r->pool, 256);
528 memset (trans, 0, 256);
537 trans[c] = portable_to_local[i];
540 /* Set up the translation table, then read the first
541 translated character. */
545 /* Skip and verify signature. */
546 for (i = 0; i < 8; i++)
547 if (!match (r, "SPSSPORT"[i]))
549 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
550 longjmp (r->bail_out, 1);
554 /* Reads the version and date info record, as well as product and
555 subproduct identification records if present. */
557 read_version_data (struct pfm_reader *r, struct any_read_info *info)
559 static const char empty_string[] = "";
561 const char *product, *subproduct;
566 error (r, _("Unrecognized version code `%c'."), r->cc);
567 date = read_pool_string (r);
568 time = read_pool_string (r);
569 product = match (r, '1') ? read_pool_string (r) : empty_string;
572 /* Skip "author" field. */
573 read_pool_string (r);
575 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
578 if (strlen (date) != 8)
579 error (r, _("Bad date string length %zu."), strlen (date));
580 if (strlen (time) != 6)
581 error (r, _("Bad time string length %zu."), strlen (time));
583 /* Save file info. */
586 memset (info, 0, sizeof *info);
588 info->float_format = FLOAT_NATIVE_DOUBLE;
589 info->integer_format = INTEGER_NATIVE;
590 info->compression = ANY_COMP_NONE;
594 info->creation_date = xmalloc (11);
595 for (i = 0; i < 8; i++)
597 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
598 info->creation_date[map[i]] = date[i];
600 info->creation_date[2] = info->creation_date[5] = ' ';
601 info->creation_date[10] = '\0';
604 info->creation_time = xmalloc (9);
605 for (i = 0; i < 6; i++)
607 static const int map[] = {0, 1, 3, 4, 6, 7};
608 info->creation_time[map[i]] = time[i];
610 info->creation_time[2] = info->creation_time[5] = ' ';
611 info->creation_time[8] = 0;
614 info->product = xstrdup (product);
615 info->product_ext = xstrdup (subproduct);
619 /* Translates a format specification read from portable file R as
620 the three integers PORTABLE_FORMAT into a normal format specifier,
621 checking that the format is appropriate for variable V. */
622 static struct fmt_spec
623 convert_format (struct pfm_reader *r, const int portable_format[3],
624 struct variable *v, bool *report_error)
627 if (fmt_from_io (portable_format[0], &type))
629 struct fmt_spec format = {
631 .w = portable_format[1],
632 .d = portable_format[2],
635 if (fmt_check_output (&format)
636 && fmt_check_width_compat (&format, var_get_width (v)))
641 char fmt_string[FMT_STRING_LEN_MAX + 1];
642 fmt_to_string (&format, fmt_string);
643 if (var_is_numeric (v))
644 warning (r, _("Numeric variable %s has invalid format "
646 var_get_name (v), fmt_string);
648 warning (r, _("String variable %s with width %d has "
649 "invalid format specifier %s."),
650 var_get_name (v), var_get_width (v), fmt_string);
656 warning (r, _("%s: Bad format specifier byte (%d). Variable "
657 "will be assigned a default format."),
658 var_get_name (v), portable_format[0]);
661 *report_error = false;
662 return fmt_default_for_width (var_get_width (v));
665 static void parse_value (struct pfm_reader *, int width, union value *);
667 /* Read information on all the variables. */
669 read_variables (struct pfm_reader *r, struct dictionary *dict)
671 char *weight_name = NULL;
675 error (r, _("Expected variable count record."));
677 r->n_vars = read_int (r);
679 error (r, _("Invalid number of variables %d."), r->n_vars);
686 weight_name = read_pool_string (r);
687 if (strlen (weight_name) > SHORT_NAME_LEN)
688 error (r, _("Weight variable name (%s) truncated."), weight_name);
691 for (i = 0; i < r->n_vars; i++)
697 struct missing_values miss;
698 struct fmt_spec print, write;
699 bool report_error = true;
703 error (r, _("Expected variable record."));
705 width = read_int (r);
707 error (r, _("Invalid variable width %d."), width);
709 read_string (r, name);
710 for (j = 0; j < 6; j++)
711 fmt[j] = read_int (r);
713 if (!dict_id_is_valid (dict, name) || *name == '#' || *name == '$')
714 error (r, _("Invalid variable name `%s' in position %d."), name, i);
715 str_uppercase (name);
717 if (width < 0 || width > 255)
718 error (r, _("Bad width %d for variable %s."), width, name);
720 v = dict_create_var (dict, name, width);
726 char *try_name = xasprintf ("%s_%lu", name, i);
727 v = dict_create_var (dict, try_name, width);
732 warning (r, _("Duplicate variable name %s in position %d renamed "
733 "to %s."), name, i, var_get_name (v));
736 print = convert_format (r, &fmt[0], v, &report_error);
737 write = convert_format (r, &fmt[3], v, &report_error);
738 var_set_print_format (v, &print);
739 var_set_write_format (v, &write);
741 /* Range missing values. */
742 mv_init (&miss, width);
745 double x = read_float (r);
746 double y = read_float (r);
747 mv_add_range (&miss, x, y);
749 else if (match (r, 'A'))
750 mv_add_range (&miss, read_float (r), HIGHEST);
751 else if (match (r, '9'))
752 mv_add_range (&miss, LOWEST, read_float (r));
754 /* Single missing values. */
755 while (match (r, '8'))
757 int mv_width = MIN (width, 8);
760 parse_value (r, mv_width, &value);
761 value_resize (&value, mv_width, width);
762 mv_add_value (&miss, &value);
763 value_destroy (&value, width);
766 var_set_missing_values (v, &miss);
772 read_string (r, label);
773 var_set_label (v, label); /* XXX */
777 if (weight_name != NULL)
779 struct variable *weight_var = dict_lookup_var (dict, weight_name);
780 if (weight_var == NULL)
781 error (r, _("Weighting variable %s not present in dictionary."),
784 dict_set_weight (dict, weight_var);
788 /* Parse a value with width WIDTH into value V. */
790 parse_value (struct pfm_reader *r, int width, union value *v)
792 value_init (v, width);
796 size_t n_bytes = read_bytes (r, buf);
797 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
800 v->f = read_float (r);
803 /* Parse a value label record and apply its labels to the named variables in DICT. */
805 read_value_label (struct pfm_reader *r, struct dictionary *dict)
817 v = pool_nalloc (r->pool, nv, sizeof *v);
818 for (i = 0; i < nv; i++)
821 read_string (r, name);
823 v[i] = dict_lookup_var (dict, name);
825 error (r, _("Unknown variable %s while parsing value labels."), name);
827 if (var_get_type (v[0]) != var_get_type (v[i]))
828 error (r, _("Cannot assign value labels to %s and %s, which "
829 "have different variable types."),
830 var_get_name (v[0]), var_get_name (v[i]));
833 n_labels = read_int (r);
834 for (i = 0; i < n_labels; i++)
840 parse_value (r, var_get_width (v[0]), &val);
841 read_string (r, label);
843 /* Assign the value label to each variable. */
844 for (j = 0; j < nv; j++)
845 var_replace_value_label (v[j], &val, label);
847 value_destroy (&val, var_get_width (v[0]));
851 /* Reads a set of documents from portable file R into DICT. */
853 read_documents (struct pfm_reader *r, struct dictionary *dict)
855 int n_lines = read_int (r);
856 for (int i = 0; i < n_lines; i++)
859 read_string (r, line);
860 dict_add_document_line (dict, line, false);
864 /* Reads and returns one case from portable file R. Returns a
865 null pointer on failure. */
866 static struct ccase *
867 por_file_casereader_read (struct casereader *reader, void *r_)
869 struct pfm_reader *r = r_;
870 struct ccase *volatile c;
873 c = case_create (r->proto);
874 setjmp (r->bail_out);
877 casereader_force_error (reader);
882 /* Check for end of file. */
889 for (i = 0; i < r->n_vars; i++)
891 int width = caseproto_get_width (r->proto, i);
894 *case_num_rw_idx (c, i) = read_float (r);
898 size_t n_bytes = read_bytes (r, buf);
899 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
906 /* Detects whether FILE is an SPSS portable file. Returns 1 if so, 0 if not,
907 and a negative errno value if there is an error reading FILE. */
909 pfm_detect (FILE *file)
911 unsigned char header[464];
913 int n_cooked, n_raws, line_len;
916 n_cooked = n_raws = 0;
918 while (n_cooked < sizeof header)
921 if (c == EOF || n_raws++ > 512)
922 return ferror (file) ? -errno : 0;
925 while (line_len < 80 && n_cooked < sizeof header)
927 header[n_cooked++] = ' ';
934 header[n_cooked++] = c;
939 memset (trans, 0, 256);
940 for (i = 64; i < 256; i++)
942 unsigned char c = header[i + 200];
944 trans[c] = portable_to_local[i];
947 for (i = 0; i < 8; i++)
948 if (trans[header[i + 456]] != "SPSSPORT"[i])
954 static const struct casereader_class por_file_casereader_class =
956 por_file_casereader_read,
957 por_file_casereader_destroy,
962 const struct any_reader_class por_file_reader_class =
964 N_("SPSS Portable File"),
969 NULL, /* get_strings */