1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
28 #include "data/any-reader.h"
29 #include "data/casereader-provider.h"
30 #include "data/casereader.h"
31 #include "data/dictionary.h"
32 #include "data/file-handle-def.h"
33 #include "data/file-name.h"
34 #include "data/format.h"
35 #include "data/missing-values.h"
36 #include "data/short-names.h"
37 #include "data/value-labels.h"
38 #include "data/variable.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/intprops.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49 #include "gl/xmemdup0.h"
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) (msgid)
55 /* portable_to_local[PORTABLE] translates the given portable
56 character into the local character set. */
57 static const char portable_to_local[256] =
60 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
61 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
65 /* Portable file reader. */
68 struct any_reader any_reader;
69 struct pool *pool; /* All the portable file state. */
71 jmp_buf bail_out; /* longjmp() target for error handling. */
73 struct dictionary *dict;
74 struct any_read_info info;
75 struct file_handle *fh; /* File handle. */
76 struct fh_lock *lock; /* Read lock for file. */
77 FILE *file; /* File stream. */
78 int line_length; /* Number of characters so far on this line. */
79 char cc; /* Current character. */
80 char *trans; /* 256-byte character set translation table. */
81 int var_cnt; /* Number of variables. */
82 int weight_index; /* 0-based index of weight variable, or -1. */
83 struct caseproto *proto; /* Format of output cases. */
84 bool ok; /* Set false on I/O error. */
87 static const struct casereader_class por_file_casereader_class;
89 static struct pfm_reader *
90 pfm_reader_cast (const struct any_reader *r_)
92 assert (r_->klass == &por_file_reader_class);
93 return UP_CAST (r_, struct pfm_reader, any_reader);
97 error (struct pfm_reader *r, const char *msg,...)
101 /* Displays MSG as an error message and aborts reading the
102 portable file via longjmp(). */
104 error (struct pfm_reader *r, const char *msg, ...)
109 ds_init_empty (&text);
110 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
111 fh_get_file_name (r->fh), (long long int) ftello (r->file));
112 va_start (args, msg);
113 ds_put_vformat (&text, msg, args);
117 .category = MSG_C_GENERAL,
118 .severity = MSG_S_ERROR,
119 .text = ds_cstr (&text),
125 longjmp (r->bail_out, 1);
128 /* Displays MSG as a warning for the current position in
129 portable file reader R. */
131 warning (struct pfm_reader *r, const char *msg, ...)
136 ds_init_empty (&text);
137 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
138 fh_get_file_name (r->fh), (long long int) ftello (r->file));
139 va_start (args, msg);
140 ds_put_vformat (&text, msg, args);
144 .category = MSG_C_GENERAL,
145 .severity = MSG_S_WARNING,
146 .text = ds_cstr (&text),
151 /* Close and destroy R.
152 Returns false if an error was detected on R, true otherwise. */
154 pfm_close (struct any_reader *r_)
156 struct pfm_reader *r = pfm_reader_cast (r_);
159 dict_unref (r->dict);
160 any_read_info_destroy (&r->info);
163 if (fn_close (r->fh, r->file) == EOF)
165 msg (ME, _("Error closing portable file `%s': %s."),
166 fh_get_file_name (r->fh), strerror (errno));
176 pool_destroy (r->pool);
181 /* Closes portable file reader R, after we're done with it. */
183 por_file_casereader_destroy (struct casereader *reader, void *r_)
185 struct pfm_reader *r = r_;
186 if (!pfm_close (&r->any_reader))
187 casereader_force_error (reader);
190 /* Reads the next character from the file into R->cc. */
192 advance (struct pfm_reader *r)
196 /* Read the next character from the file.
197 Ignore carriage returns entirely.
198 Mostly ignore new-lines, but if a new-line occurs before the
199 line has reached 80 bytes in length, then treat the
200 "missing" bytes as spaces. */
203 while ((c = getc (r->file)) == '\r')
208 if (r->line_length < 80)
211 ungetc ('\n', r->file);
217 error (r, _("unexpected end of file"));
219 if (r->trans != NULL)
225 /* Skip a single character if present, and return whether it was
228 match (struct pfm_reader *r, int c)
239 static void read_header (struct pfm_reader *);
240 static void read_version_data (struct pfm_reader *, struct any_read_info *);
241 static void read_variables (struct pfm_reader *, struct dictionary *);
242 static void read_value_label (struct pfm_reader *, struct dictionary *);
243 static void read_documents (struct pfm_reader *, struct dictionary *);
245 /* Opens the portable file with handle FH, reads its dictionary, and
246 returns a reader for it. The dictionary may be modified in order to
247 rename, reorder, and delete variables, etc. */
248 static struct any_reader *
249 pfm_open (struct file_handle *fh)
251 struct pool *volatile pool = NULL;
252 struct pfm_reader *volatile r = NULL;
254 /* Create and initialize reader. */
255 pool = pool_create ();
256 r = pool_alloc (pool, sizeof *r);
257 r->any_reader.klass = &por_file_reader_class;
258 r->dict = dict_create (get_default_encoding ());
259 memset (&r->info, 0, sizeof r->info);
265 r->weight_index = -1;
270 if (setjmp (r->bail_out))
274 /* TRANSLATORS: this fragment will be interpolated into
275 messages in fh_lock() that identify types of files. */
276 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
281 r->file = fn_open (r->fh, "rb");
284 msg (ME, _("An error occurred while opening `%s' for reading "
285 "as a portable file: %s."),
286 fh_get_file_name (r->fh), strerror (errno));
290 /* Read header, version, date info, product id, variables. */
292 read_version_data (r, &r->info);
293 read_variables (r, r->dict);
295 /* Read value labels. */
296 while (match (r, 'D'))
297 read_value_label (r, r->dict);
299 /* Read documents. */
301 read_documents (r, r->dict);
303 /* Check that we've made it to the data. */
305 error (r, _("Data record expected."));
307 r->proto = caseproto_ref_pool (dict_get_proto (r->dict), r->pool);
308 return &r->any_reader;
311 pfm_close (&r->any_reader);
315 static struct casereader *
316 pfm_decode (struct any_reader *r_, const char *encoding UNUSED,
317 struct dictionary **dictp, struct any_read_info *info)
319 struct pfm_reader *r = pfm_reader_cast (r_);
327 memset (&r->info, 0, sizeof r->info);
330 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
331 &por_file_casereader_class, r);
/* Returns the value (0...29) of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are '0'...'9' followed by upper-case 'A'...'T'; lower-case
   letters are not digits. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() considers the terminating null byte to be part of the
     string, so searching for '\0' would succeed and yield the bogus
     digit value 30.  Reject it explicitly. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
344 /* Read a floating point value and return its value. */
346 read_float (struct pfm_reader *r)
350 bool got_dot = false; /* Seen a decimal point? */
351 bool got_digit = false; /* Seen any digits? */
352 bool negative = false; /* Number is negative? */
354 /* Skip leading spaces. */
355 while (match (r, ' '))
358 /* `*' indicates system-missing. */
361 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
365 negative = match (r, '-');
368 int digit = base_30_value (r->cc);
373 /* Make sure that multiplication by 30 will not overflow. */
374 if (num > DBL_MAX * (1. / 30.))
375 /* The value of the digit doesn't matter, since we have already
376 gotten as many digits as can be represented in a `double'.
377 This doesn't necessarily mean the result will overflow.
378 The exponent may reduce it to within range.
380 We just need to record that there was another
381 digit so that we can multiply by 30 later. */
384 num = (num * 30.0) + digit;
386 /* Keep track of the number of digits after the decimal point.
387 If we just divided by 30 here, we would lose precision. */
391 else if (!got_dot && r->cc == '.')
392 /* Record that we have found the decimal point. */
395 /* Any other character terminates the number. */
401 /* Check that we had some digits. */
403 error (r, _("Number expected."));
405 /* Get exponent if any. */
406 if (r->cc == '+' || r->cc == '-')
409 bool negative_exponent = r->cc == '-';
412 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
414 if (exp > LONG_MAX / 30)
419 exp = exp * 30 + digit;
422 /* We don't check whether there were actually any digits, but we
424 if (negative_exponent)
429 /* Numbers must end with `/'. */
431 error (r, _("Missing numeric terminator."));
433 /* Multiply `num' by 30 to the `exponent' power, checking for
436 num *= pow (30.0, (double) exponent);
437 else if (exponent > 0)
439 if (num > DBL_MAX * pow (30.0, (double) -exponent))
442 num *= pow (30.0, (double) exponent);
445 return negative ? -num : num;
448 /* Read an integer and return its value. */
450 read_int (struct pfm_reader *r)
452 double f = read_float (r);
453 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
454 error (r, _("Invalid integer."));
458 /* Reads a string into BUF, which must have room for 256
461 read_string (struct pfm_reader *r, char *buf)
463 int n = read_int (r);
464 if (n < 0 || n > 255)
465 error (r, _("Bad string length %d."), n);
476 /* Reads a string into BUF, which must have room for 256
478 Returns the number of bytes read.
481 read_bytes (struct pfm_reader *r, uint8_t *buf)
483 int n = read_int (r);
484 if (n < 0 || n > 255)
485 error (r, _("Bad string length %d."), n);
497 /* Reads a string and returns a copy of it allocated from R's
500 read_pool_string (struct pfm_reader *r)
503 read_string (r, string);
504 return pool_strdup (r->pool, string);
507 /* Reads the 464-byte file header. */
509 read_header (struct pfm_reader *r)
514 /* Read and ignore vanity splash strings. */
515 for (i = 0; i < 200; i++)
518 /* Skip the first 64 characters of the translation table.
519 We don't care about these. They are probably all set to
520 '0', marking them as untranslatable, and that would screw
521 up our actual translation of the real '0'. */
522 for (i = 0; i < 64; i++)
525 /* Read the rest of the translation table. */
526 trans = pool_malloc (r->pool, 256);
527 memset (trans, 0, 256);
536 trans[c] = portable_to_local[i];
539 /* Set up the translation table, then read the first
540 translated character. */
544 /* Skip and verify signature. */
545 for (i = 0; i < 8; i++)
546 if (!match (r, "SPSSPORT"[i]))
548 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
549 longjmp (r->bail_out, 1);
553 /* Reads the version and date info record, as well as product and
554 subproduct identification records if present. */
556 read_version_data (struct pfm_reader *r, struct any_read_info *info)
558 static const char empty_string[] = "";
560 const char *product, *subproduct;
565 error (r, _("Unrecognized version code `%c'."), r->cc);
566 date = read_pool_string (r);
567 time = read_pool_string (r);
568 product = match (r, '1') ? read_pool_string (r) : empty_string;
571 /* Skip "author" field. */
572 read_pool_string (r);
574 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
577 if (strlen (date) != 8)
578 error (r, _("Bad date string length %zu."), strlen (date));
579 if (strlen (time) != 6)
580 error (r, _("Bad time string length %zu."), strlen (time));
582 /* Save file info. */
585 memset (info, 0, sizeof *info);
587 info->float_format = FLOAT_NATIVE_DOUBLE;
588 info->integer_format = INTEGER_NATIVE;
589 info->compression = ANY_COMP_NONE;
593 info->creation_date = xmalloc (11);
594 for (i = 0; i < 8; i++)
596 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
597 info->creation_date[map[i]] = date[i];
599 info->creation_date[2] = info->creation_date[5] = ' ';
600 info->creation_date[10] = '\0';
603 info->creation_time = xmalloc (9);
604 for (i = 0; i < 6; i++)
606 static const int map[] = {0, 1, 3, 4, 6, 7};
607 info->creation_time[map[i]] = time[i];
609 info->creation_time[2] = info->creation_time[5] = ' ';
610 info->creation_time[8] = 0;
613 info->product = xstrdup (product);
614 info->product_ext = xstrdup (subproduct);
618 /* Translates the three integers PORTABLE_FORMAT, read from portable
619 file R, into a normal format specifier, which is returned, falling
620 back to a default format if it is not appropriate for variable V. */
621 static struct fmt_spec
622 convert_format (struct pfm_reader *r, const int portable_format[3],
623 struct variable *v, bool *report_error)
625 struct fmt_spec format;
628 if (!fmt_from_io (portable_format[0], &format.type))
631 warning (r, _("%s: Bad format specifier byte (%d). Variable "
632 "will be assigned a default format."),
633 var_get_name (v), portable_format[0]);
637 format.w = portable_format[1];
638 format.d = portable_format[2];
641 ok = (fmt_check_output (&format)
642 && fmt_check_width_compat (&format, var_get_width (v)));
649 char fmt_string[FMT_STRING_LEN_MAX + 1];
650 fmt_to_string (&format, fmt_string);
651 if (var_is_numeric (v))
652 warning (r, _("Numeric variable %s has invalid format "
654 var_get_name (v), fmt_string);
656 warning (r, _("String variable %s with width %d has "
657 "invalid format specifier %s."),
658 var_get_name (v), var_get_width (v), fmt_string);
666 *report_error = false;
667 return fmt_default_for_width (var_get_width (v));
670 static void parse_value (struct pfm_reader *, int width, union value *);
672 /* Read information on all the variables. */
674 read_variables (struct pfm_reader *r, struct dictionary *dict)
676 char *weight_name = NULL;
680 error (r, _("Expected variable count record."));
682 r->var_cnt = read_int (r);
684 error (r, _("Invalid number of variables %d."), r->var_cnt);
691 weight_name = read_pool_string (r);
692 if (strlen (weight_name) > SHORT_NAME_LEN)
693 error (r, _("Weight variable name (%s) truncated."), weight_name);
696 for (i = 0; i < r->var_cnt; i++)
702 struct missing_values miss;
703 struct fmt_spec print, write;
704 bool report_error = true;
708 error (r, _("Expected variable record."));
710 width = read_int (r);
712 error (r, _("Invalid variable width %d."), width);
714 read_string (r, name);
715 for (j = 0; j < 6; j++)
716 fmt[j] = read_int (r);
718 if (!dict_id_is_valid (dict, name, false)
719 || *name == '#' || *name == '$')
720 error (r, _("Invalid variable name `%s' in position %d."), name, i);
721 str_uppercase (name);
723 if (width < 0 || width > 255)
724 error (r, _("Bad width %d for variable %s."), width, name);
726 v = dict_create_var (dict, name, width);
732 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
733 sprintf (try_name, "%s_%lu", name, i);
734 v = dict_create_var (dict, try_name, width);
738 warning (r, _("Duplicate variable name %s in position %d renamed "
739 "to %s."), name, i, var_get_name (v));
742 print = convert_format (r, &fmt[0], v, &report_error);
743 write = convert_format (r, &fmt[3], v, &report_error);
744 var_set_print_format (v, &print);
745 var_set_write_format (v, &write);
747 /* Range missing values. */
748 mv_init (&miss, width);
751 double x = read_float (r);
752 double y = read_float (r);
753 mv_add_range (&miss, x, y);
755 else if (match (r, 'A'))
756 mv_add_range (&miss, read_float (r), HIGHEST);
757 else if (match (r, '9'))
758 mv_add_range (&miss, LOWEST, read_float (r));
760 /* Single missing values. */
761 while (match (r, '8'))
763 int mv_width = MIN (width, 8);
766 parse_value (r, mv_width, &value);
767 value_resize (&value, mv_width, width);
768 mv_add_value (&miss, &value);
769 value_destroy (&value, width);
772 var_set_missing_values (v, &miss);
778 read_string (r, label);
779 var_set_label (v, label); /* XXX */
783 if (weight_name != NULL)
785 struct variable *weight_var = dict_lookup_var (dict, weight_name);
786 if (weight_var == NULL)
787 error (r, _("Weighting variable %s not present in dictionary."),
790 dict_set_weight (dict, weight_var);
794 /* Parse a value with width WIDTH into value V. */
796 parse_value (struct pfm_reader *r, int width, union value *v)
798 value_init (v, width);
802 size_t n_bytes = read_bytes (r, buf);
803 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
806 v->f = read_float (r);
809 /* Parse a value label record. */
811 read_value_label (struct pfm_reader *r, struct dictionary *dict)
823 v = pool_nalloc (r->pool, nv, sizeof *v);
824 for (i = 0; i < nv; i++)
827 read_string (r, name);
829 v[i] = dict_lookup_var (dict, name);
831 error (r, _("Unknown variable %s while parsing value labels."), name);
833 if (var_get_type (v[0]) != var_get_type (v[i]))
834 error (r, _("Cannot assign value labels to %s and %s, which "
835 "have different variable types."),
836 var_get_name (v[0]), var_get_name (v[i]));
839 n_labels = read_int (r);
840 for (i = 0; i < n_labels; i++)
846 parse_value (r, var_get_width (v[0]), &val);
847 read_string (r, label);
849 /* Assign the value label to each variable. */
850 for (j = 0; j < nv; j++)
851 var_replace_value_label (v[j], &val, label);
853 value_destroy (&val, var_get_width (v[0]));
857 /* Reads a set of documents from portable file R into DICT. */
859 read_documents (struct pfm_reader *r, struct dictionary *dict)
864 line_cnt = read_int (r);
865 for (i = 0; i < line_cnt; i++)
868 read_string (r, line);
869 dict_add_document_line (dict, line, false);
873 /* Reads and returns one case from portable file R. Returns a
874 null pointer on failure. */
875 static struct ccase *
876 por_file_casereader_read (struct casereader *reader, void *r_)
878 struct pfm_reader *r = r_;
879 struct ccase *volatile c;
882 c = case_create (r->proto);
883 setjmp (r->bail_out);
886 casereader_force_error (reader);
891 /* Check for end of file. */
898 for (i = 0; i < r->var_cnt; i++)
900 int width = caseproto_get_width (r->proto, i);
903 case_data_rw_idx (c, i)->f = read_float (r);
907 size_t n_bytes = read_bytes (r, buf);
908 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
915 /* Detects whether FILE is an SPSS portable file. Returns 1 if so, 0 if not,
916 and a negative errno value if there is an error reading FILE. */
918 pfm_detect (FILE *file)
920 unsigned char header[464];
922 int cooked_cnt, raw_cnt, line_len;
925 cooked_cnt = raw_cnt = 0;
927 while (cooked_cnt < sizeof header)
930 if (c == EOF || raw_cnt++ > 512)
931 return ferror (file) ? -errno : 0;
934 while (line_len < 80 && cooked_cnt < sizeof header)
936 header[cooked_cnt++] = ' ';
943 header[cooked_cnt++] = c;
948 memset (trans, 0, 256);
949 for (i = 64; i < 256; i++)
951 unsigned char c = header[i + 200];
953 trans[c] = portable_to_local[i];
956 for (i = 0; i < 8; i++)
957 if (trans[header[i + 456]] != "SPSSPORT"[i])
963 static const struct casereader_class por_file_casereader_class =
965 por_file_casereader_read,
966 por_file_casereader_destroy,
971 const struct any_reader_class por_file_reader_class =
973 N_("SPSS Portable File"),
978 NULL, /* get_strings */