1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
28 #include "data/any-reader.h"
29 #include "data/casereader-provider.h"
30 #include "data/casereader.h"
31 #include "data/dictionary.h"
32 #include "data/file-handle-def.h"
33 #include "data/file-name.h"
34 #include "data/format.h"
35 #include "data/missing-values.h"
36 #include "data/short-names.h"
37 #include "data/value-labels.h"
38 #include "data/variable.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/intprops.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49 #include "gl/xmemdup0.h"
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) (msgid)
55 /* portable_to_local[PORTABLE] translates the given portable
56 character into the local character set. */
57 static const char portable_to_local[256] =
60 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
61 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
65 /* Portable file reader.
   The reader is allocated from a pool created in pfm_open() and is torn
   down by pfm_close(). */
68 struct any_reader any_reader; /* Embedded base object; pfm_reader_cast() maps it back to this struct. */
69 struct pool *pool; /* All the portable file state. */
71 jmp_buf bail_out; /* longjmp() target for error handling. */
73 struct dictionary *dict; /* Dictionary; created in pfm_open(), destroyed in pfm_close(). */
74 struct any_read_info info; /* File metadata; filled in by read_version_data(). */
75 struct file_handle *fh; /* File handle. */
76 struct fh_lock *lock; /* Read lock for file. */
77 FILE *file; /* File stream. */
78 int line_length; /* Number of characters so far on this line. */
79 char cc; /* Current character. */
80 char *trans; /* 256-byte character set translation table. */
81 int var_cnt; /* Number of variables. */
82 int weight_index; /* 0-based index of weight variable, or -1. */
83 struct caseproto *proto; /* Format of output cases. */
84 bool ok; /* Set false on I/O error. */
/* Casereader class for portable files; it is defined further down in
   this file. */
87 static const struct casereader_class por_file_casereader_class;
89 static struct pfm_reader *
90 pfm_reader_cast (const struct any_reader *r_)
92 assert (r_->klass == &por_file_reader_class);
93 return UP_CAST (r_, struct pfm_reader, any_reader);
97 error (struct pfm_reader *r, const char *msg,...)
101 /* Displays MSG as an error message and aborts reading the
102 portable file via longjmp().
   MSG is a format string for the arguments that follow it; it is
   expanded with ds_put_vformat().  The message text is prefixed with
   the file name and the current file offset.  This function does not
   return to its caller: it jumps to the setjmp() that most recently
   filled in R->bail_out. */
104 error (struct pfm_reader *r, const char *msg, ...)
110 ds_init_empty (&text);
/* Prefix: which file, and where in it, the problem was found. */
111 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
112 fh_get_file_name (r->fh), (long long int) ftello (r->file));
113 va_start (args, msg);
114 ds_put_vformat (&text, msg, args);
117 m.category = MSG_C_GENERAL;
118 m.severity = MSG_S_ERROR;
124 m.text = ds_cstr (&text);
/* Unwind to the setjmp() in pfm_open() or por_file_casereader_read(). */
130 longjmp (r->bail_out, 1);
133 /* Displays MSG as a warning for the current position in
134 portable file reader R.
   MSG is a format string for the arguments that follow it; it is
   expanded with ds_put_vformat().  The message text is prefixed with
   the file name and the current file offset.  Unlike error(), no
   longjmp() appears among the visible lines, so reading presumably
   continues after the warning -- confirm. */
136 warning (struct pfm_reader *r, const char *msg, ...)
142 ds_init_empty (&text);
/* Prefix: which file, and where in it, the condition was noticed. */
143 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
144 fh_get_file_name (r->fh), (long long int) ftello (r->file));
145 va_start (args, msg);
146 ds_put_vformat (&text, msg, args);
149 m.category = MSG_C_GENERAL;
150 m.severity = MSG_S_WARNING;
156 m.text = ds_cstr (&text);
161 /* Close and destroy R.
162 Returns false if an error was detected on R, true otherwise.
   R_ must be the any_reader embedded in a portable file reader;
   pfm_reader_cast() asserts this.  The dictionary, the file metadata,
   the file stream and the reader's pool are all released. */
164 pfm_close (struct any_reader *r_)
166 struct pfm_reader *r = pfm_reader_cast (r_);
169 dict_destroy (r->dict);
170 any_read_info_destroy (&r->info);
/* A failure to close the stream is reported with the errno text. */
173 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
175 msg (ME, _("Error closing portable file `%s': %s."),
176 fh_get_file_name (r->fh), strerror (errno));
/* NOTE(review): pfm_open() allocates R itself from a pool; presumably
   that is R->pool, in which case R must not be touched after this
   call -- confirm. */
186 pool_destroy (r->pool);
191 /* Closes portable file reader R, after we're done with it. */
193 por_file_casereader_destroy (struct casereader *reader, void *r_)
195 struct pfm_reader *r = r_;
196 if (!pfm_close (&r->any_reader))
197 casereader_force_error (reader);
200 /* Reads the next character of the portable file into the reader's
   current character, R->cc (the store itself is on a line not visible
   in this excerpt; callers such as read_float() examine R->cc right
   after calling advance()).  End of file is reported through error(),
   which does not return. */
202 advance (struct pfm_reader *r)
206 /* Read the next character from the file.
207 Ignore carriage returns entirely.
208 Mostly ignore new-lines, but if a new-line occurs before the
209 line has reached 80 bytes in length, then treat the
210 "missing" bytes as spaces. */
/* Carriage returns are discarded wherever they occur. */
213 while ((c = getc (r->file)) == '\r')
/* Fewer than 80 characters so far: the line is shorter than a full one. */
218 if (r->line_length < 80)
/* Push the new-line back so that the next call reads it again.
   NOTE(review): the statements that substitute and count the padding
   space are not visible in this excerpt -- confirm. */
221 ungetc ('\n', r->file);
227 error (r, _("unexpected end of file"));
/* Apply the character set translation table when there is one.
   NOTE(review): R->trans is presumably null until read_header() has
   built the table -- confirm. */
229 if (r->trans != NULL)
235 /* Skip a single character if present, and return whether it was
238 match (struct pfm_reader *r, int c)
/* Forward declarations of the parsers for the dictionary part of the
   file, which are defined later in this file.  pfm_open() calls
   read_version_data(), read_variables(), read_value_label() and
   read_documents(). */
249 static void read_header (struct pfm_reader *);
250 static void read_version_data (struct pfm_reader *, struct any_read_info *);
251 static void read_variables (struct pfm_reader *, struct dictionary *);
252 static void read_value_label (struct pfm_reader *, struct dictionary *);
253 static void read_documents (struct pfm_reader *, struct dictionary *);
255 /* Opens the portable file with handle FH, reads its dictionary records
256 (version data, variables, value labels, documents) into a new reader,
257 and returns the reader's embedded any_reader.
   The dictionary is kept in the reader as R->dict; it may later be
   modified in order to rename, reorder, and delete variables, etc.
   Parse errors raised through error() longjmp() back to the setjmp()
   below; the error path at the end of the function closes the reader
   with pfm_close().
   NOTE(review): the value returned on failure is on a line not visible
   in this excerpt -- presumably a null pointer; confirm. */
259 pfm_open (struct file_handle *fh)
/* NOTE(review): presumably volatile so that these locals keep reliable
   values after a longjmp() back into this function -- confirm. */
261 struct pool *volatile pool = NULL;
262 struct pfm_reader *volatile r = NULL;
264 /* Create and initialize reader. */
265 pool = pool_create ();
/* The reader itself lives in the pool it owns. */
266 r = pool_alloc (pool, sizeof *r);
267 r->any_reader.klass = &por_file_reader_class;
268 r->dict = dict_create (get_default_encoding ());
269 memset (&r->info, 0, sizeof r->info);
275 r->weight_index = -1;
/* error() longjmp()s to this point, so a nonzero result means that
   reading the file failed somewhere below. */
280 if (setjmp (r->bail_out))
284 /* TRANSLATORS: this fragment will be interpolated into
285 messages in fh_lock() that identify types of files. */
286 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
291 r->file = fn_open (fh_get_file_name (r->fh), "rb");
294 msg (ME, _("An error occurred while opening `%s' for reading "
295 "as a portable file: %s."),
296 fh_get_file_name (r->fh), strerror (errno));
300 /* Read header, version, date info, product id, variables. */
302 read_version_data (r, &r->info);
303 read_variables (r, r->dict);
305 /* Read value labels. */
306 while (match (r, 'D'))
307 read_value_label (r, r->dict);
309 /* Read documents. */
311 read_documents (r, r->dict);
313 /* Check that we've made it to the data. */
315 error (r, _("Data record expected."));
/* The case prototype is derived from the dictionary that was just read
   and is registered with the reader's pool. */
317 r->proto = caseproto_ref_pool (dict_get_proto (r->dict), r->pool);
318 return &r->any_reader;
/* Failure path: release everything acquired so far. */
321 pfm_close (&r->any_reader);
/* Creates and returns a sequential casereader that reads cases from the
   portable file reader R_ through por_file_casereader_class, with the
   reader as the casereader's auxiliary data.  ENCODING is unused.
   NOTE(review): the lines that fill in *DICTP and *INFO are not visible
   in this excerpt; R->info is zeroed afterward, presumably because its
   contents were handed over to the caller -- confirm. */
326 pfm_decode (struct any_reader *r_, const char *encoding UNUSED,
327 struct dictionary **dictp, struct any_read_info *info)
329 struct pfm_reader *r = pfm_reader_cast (r_);
337 memset (&r->info, 0, sizeof r->info);
/* CASENUMBER_MAX is passed as the case count argument. */
340 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
341 &por_file_casereader_class, r);
/* Returns the value of base-30 digit C (`0'...`9' are 0...9 and
   `A'...`T' are 10...29), or -1 if C is not a base-30 digit.

   A null byte is never a digit.  It has to be rejected explicitly
   because strchr() treats the terminating null byte of the digit
   string as part of the string, so a search for 0 would otherwise
   succeed and yield the bogus value 30. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
354 /* Read a floating point value and return its value.
   The number is written in base 30 (digits as accepted by
   base_30_value()): optional leading spaces, an optional `-', digits
   with at most one `.', an optional exponent introduced by `+' or `-'
   and written in base-30 digits, and a terminating `/'.  A `*' plus
   one further character stands for the system-missing value.
   Malformed input is reported through error(), which does not
   return. */
356 read_float (struct pfm_reader *r)
360 bool got_dot = false; /* Seen a decimal point? */
361 bool got_digit = false; /* Seen any digits? */
362 bool negative = false; /* Number is negative? */
364 /* Skip leading spaces. */
365 while (match (r, ' '))
368 /* `*' indicates system-missing. */
371 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
375 negative = match (r, '-');
378 int digit = base_30_value (r->cc);
383 /* Make sure that multiplication by 30 will not overflow. */
384 if (num > DBL_MAX * (1. / 30.))
385 /* The value of the digit doesn't matter, since we have already
386 gotten as many digits as can be represented in a `double'.
387 This doesn't necessarily mean the result will overflow.
388 The exponent may reduce it to within range.
390 We just need to record that there was another
391 digit so that we can multiply by 30 later. */
394 num = (num * 30.0) + digit;
396 /* Keep track of the number of digits after the decimal point.
397 If we just divided by 30 here, we would lose precision. */
401 else if (!got_dot && r->cc == '.')
402 /* Record that we have found the decimal point. */
405 /* Any other character terminates the number. */
411 /* Check that we had some digits. */
413 error (r, _("Number expected."));
415 /* Get exponent if any. */
416 if (r->cc == '+' || r->cc == '-')
419 bool negative_exponent = r->cc == '-';
422 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
424 if (exp > LONG_MAX / 30)
429 exp = exp * 30 + digit;
432 /* We don't check whether there were actually any digits, but we
434 if (negative_exponent)
439 /* Numbers must end with `/'. */
441 error (r, _("Missing numeric terminator."));
443 /* Multiply `num' by 30 to the `exponent' power, checking for
446 num *= pow (30.0, (double) exponent);
447 else if (exponent > 0)
449 if (num > DBL_MAX * pow (30.0, (double) -exponent))
452 num *= pow (30.0, (double) exponent);
455 return negative ? -num : num;
458 /* Read an integer and return its value.
   The number is read with read_float() and must have no fractional
   part and lie strictly between INT_MIN and INT_MAX (the two bounds
   themselves are rejected); otherwise "Invalid integer." is reported
   through error(), which does not return. */
460 read_int (struct pfm_reader *r)
462 double f = read_float (r);
/* floor (f) != f is true for any value with a fractional part. */
463 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
464 error (r, _("Invalid integer."));
468 /* Reads a string into BUF, which must have room for 256
471 read_string (struct pfm_reader *r, char *buf)
473 int n = read_int (r);
474 if (n < 0 || n > 255)
475 error (r, _("Bad string length %d."), n);
486 /* Reads a string into BUF, which must have room for 256
488 Returns the number of bytes read.
491 read_bytes (struct pfm_reader *r, uint8_t *buf)
493 int n = read_int (r);
494 if (n < 0 || n > 255)
495 error (r, _("Bad string length %d."), n);
507 /* Reads a string and returns a copy of it allocated from R's
510 read_pool_string (struct pfm_reader *r)
513 read_string (r, string);
514 return pool_strdup (r->pool, string);
517 /* Reads the 464-byte file header.
   The header consists of 200 bytes of splash strings, a 256-byte
   character set translation table and the 8-byte signature `SPSSPORT'
   (200 + 256 + 8 = 464).  If the signature does not match, a message
   is issued and control leaves through longjmp() on R->bail_out. */
519 read_header (struct pfm_reader *r)
524 /* Read and ignore vanity splash strings. */
525 for (i = 0; i < 200; i++)
528 /* Skip the first 64 characters of the translation table.
529 We don't care about these. They are probably all set to
530 '0', marking them as untranslatable, and that would screw
531 up our actual translation of the real '0'. */
532 for (i = 0; i < 64; i++)
535 /* Read the rest of the translation table. */
536 trans = pool_malloc (r->pool, 256);
/* File characters that no table entry claims translate to 0. */
537 memset (trans, 0, 256);
/* Map file character C to the local character for portable code I. */
546 trans[c] = portable_to_local[i];
549 /* Set up the translation table, then read the first
550 translated character. */
554 /* Skip and verify signature. */
555 for (i = 0; i < 8; i++)
556 if (!match (r, "SPSSPORT"[i]))
/* NOTE(review): this reports through msg() rather than error(),
   presumably so that the text does not claim a corrupt portable file
   at some offset -- confirm. */
558 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
559 longjmp (r->bail_out, 1);
563 /* Reads the version and date info record, as well as product and
564 subproduct identification records if present.
   The results are stored in *INFO, which is zeroed first.  The date
   string must be exactly 8 characters long and the time string exactly
   6; anything else is reported through error(), which does not return.
   The strings stored in *INFO are allocated with xmalloc() and
   xstrdup(), not from R's pool. */
566 read_version_data (struct pfm_reader *r, struct any_read_info *info)
/* Stands in for the product and subproduct when their records are absent. */
568 static const char empty_string[] = "";
570 const char *product, *subproduct;
575 error (r, _("Unrecognized version code `%c'."), r->cc);
576 date = read_pool_string (r);
577 time = read_pool_string (r);
/* Record `1' carries the product name. */
578 product = match (r, '1') ? read_pool_string (r) : empty_string;
581 /* Skip "author" field. */
582 read_pool_string (r);
/* Record `3' carries the subproduct name. */
584 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
587 if (strlen (date) != 8)
588 error (r, _("Bad date string length %zu."), strlen (date));
589 if (strlen (time) != 6)
590 error (r, _("Bad time string length %zu."), strlen (time));
592 /* Save file info. */
595 memset (info, 0, sizeof *info);
597 info->float_format = FLOAT_NATIVE_DOUBLE;
598 info->integer_format = INTEGER_NATIVE;
599 info->compression = ANY_COMP_NONE;
/* Rearrange the 8 date characters into a 10-character string:
   characters 0-3 go to positions 6-9, 4-5 to positions 3-4 and 6-7 to
   positions 0-1, with spaces at positions 2 and 5.  Presumably the
   input is yyyymmdd, which makes the result `dd mm yyyy'. */
603 info->creation_date = xmalloc (11);
604 for (i = 0; i < 8; i++)
606 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
607 info->creation_date[map[i]] = date[i];
609 info->creation_date[2] = info->creation_date[5] = ' ';
610 info->creation_date[10] = '\0';
/* Copy the 6 time characters in order as three pairs separated by
   spaces, e.g. `hhmmss' becomes `hh mm ss'. */
613 info->creation_time = xmalloc (9);
614 for (i = 0; i < 6; i++)
616 static const int map[] = {0, 1, 3, 4, 6, 7};
617 info->creation_time[map[i]] = time[i];
619 info->creation_time[2] = info->creation_time[5] = ' ';
620 info->creation_time[8] = 0;
623 info->product = xstrdup (product);
624 info->product_ext = xstrdup (subproduct);
628 /* Translates a format specification read from portable file R as
629 the three integers PORTABLE_FORMAT (type code, width, decimals) into
630 a format specifier, which is returned, checking that the format is
   appropriate for variable V.
   If the type code is unknown, or the format is not a valid output
   format compatible with V's width, a warning is issued,
   *REPORT_ERROR is set to false and the default format for V's width
   is returned instead.
   NOTE(review): the lines that consult *REPORT_ERROR before warning
   are not visible in this excerpt; presumably the flag suppresses
   repeated warnings for the same variable -- confirm. */
631 static struct fmt_spec
632 convert_format (struct pfm_reader *r, const int portable_format[3],
633 struct variable *v, bool *report_error)
635 struct fmt_spec format;
/* portable_format[0] is the format type as encoded in the file. */
638 if (!fmt_from_io (portable_format[0], &format.type))
641 warning (r, _("%s: Bad format specifier byte (%d). Variable "
642 "will be assigned a default format."),
643 var_get_name (v), portable_format[0]);
647 format.w = portable_format[1];
648 format.d = portable_format[2];
/* The format must be usable for output and fit V's width. */
651 ok = (fmt_check_output (&format)
652 && fmt_check_width_compat (&format, var_get_width (v)));
659 char fmt_string[FMT_STRING_LEN_MAX + 1];
660 fmt_to_string (&format, fmt_string);
661 if (var_is_numeric (v))
662 warning (r, _("Numeric variable %s has invalid format "
664 var_get_name (v), fmt_string);
666 warning (r, _("String variable %s with width %d has "
667 "invalid format specifier %s."),
668 var_get_name (v), var_get_width (v), fmt_string);
/* Fallback: clear the caller's flag and use the default format. */
676 *report_error = false;
677 return fmt_default_for_width (var_get_width (v));
680 static void parse_value (struct pfm_reader *, int width, union value *);
682 /* Read information on all the variables.
   Reads the variable count into R->var_cnt, an optional weight
   variable name, and then one variable record per variable: width,
   name, six format integers (three for the print format followed by
   three for the write format), missing values and a label.  Each
   variable is created in DICT.  Malformed input is reported through
   error(), which does not return. */
684 read_variables (struct pfm_reader *r, struct dictionary *dict)
686 char *weight_name = NULL;
690 error (r, _("Expected variable count record."));
692 r->var_cnt = read_int (r);
694 error (r, _("Invalid number of variables %d."), r->var_cnt);
/* The weight variable is only looked up once all variables exist; see
   the end of this function. */
701 weight_name = read_pool_string (r);
702 if (strlen (weight_name) > SHORT_NAME_LEN)
703 error (r, _("Weight variable name (%s) truncated."), weight_name);
706 for (i = 0; i < r->var_cnt; i++)
712 struct missing_values miss;
713 struct fmt_spec print, write;
714 bool report_error = true;
718 error (r, _("Expected variable record."));
720 width = read_int (r);
722 error (r, _("Invalid variable width %d."), width);
724 read_string (r, name);
725 for (j = 0; j < 6; j++)
726 fmt[j] = read_int (r);
/* Names starting with `#' or `$' are rejected in addition to whatever
   dict_id_is_valid() rejects. */
728 if (!dict_id_is_valid (dict, name, false)
729 || *name == '#' || *name == '$')
730 error (r, _("Invalid variable name `%s' in position %d."), name, i);
731 str_uppercase (name);
733 if (width < 0 || width > 255)
734 error (r, _("Bad width %d for variable %s."), width, name);
736 v = dict_create_var (dict, name, width);
/* NOTE(review): try_name reserves only 8 bytes for NAME, but none of
   the visible checks obviously limits NAME to 8 bytes.  If NAME can be
   longer, the sprintf() below writes past the end of try_name.
   Confirm the bound, or format into a dynamically sized buffer.
   Also note that `i' is printed with %lu here but with %d in the
   messages above and below; confirm which declaration of `i' is in
   scope at each use. */
742 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
743 sprintf (try_name, "%s_%lu", name, i);
744 v = dict_create_var (dict, try_name, width);
748 warning (r, _("Duplicate variable name %s in position %d renamed "
749 "to %s."), name, i, var_get_name (v));
/* fmt[0..2] describe the print format and fmt[3..5] the write format. */
752 print = convert_format (r, &fmt[0], v, &report_error);
753 write = convert_format (r, &fmt[3], v, &report_error);
754 var_set_print_format (v, &print);
755 var_set_write_format (v, &write);
757 /* Range missing values. */
758 mv_init (&miss, width);
761 double x = read_float (r);
762 double y = read_float (r);
763 mv_add_range (&miss, x, y);
/* Record `A': range from the given value up to HIGHEST. */
765 else if (match (r, 'A'))
766 mv_add_range (&miss, read_float (r), HIGHEST);
/* Record `9': range from LOWEST up to the given value. */
767 else if (match (r, '9'))
768 mv_add_range (&miss, LOWEST, read_float (r));
770 /* Single missing values. */
771 while (match (r, '8'))
/* A missing value is parsed with a width of at most 8 and then resized
   to the variable's full width. */
773 int mv_width = MIN (width, 8);
776 parse_value (r, mv_width, &value);
777 value_resize (&value, mv_width, width);
778 mv_add_value (&miss, &value);
779 value_destroy (&value, width);
782 var_set_missing_values (v, &miss);
788 read_string (r, label);
789 var_set_label (v, label); /* XXX */
/* Now that every variable exists, resolve the weight variable by name. */
793 if (weight_name != NULL)
795 struct variable *weight_var = dict_lookup_var (dict, weight_name);
796 if (weight_var == NULL)
797 error (r, _("Weighting variable %s not present in dictionary."),
800 dict_set_weight (dict, weight_var);
804 /* Parse a value of width WIDTH into value V.
   V is initialized here with value_init(), so the caller is
   responsible for destroying it afterward (the callers in this file
   use value_destroy()).  A string value is read with read_bytes() and
   padded on the right with spaces to WIDTH; a numeric value is read
   with read_float().
   NOTE(review): the condition that chooses between the two cases is
   not visible in this excerpt; presumably a nonzero WIDTH means a
   string -- confirm. */
806 parse_value (struct pfm_reader *r, int width, union value *v)
808 value_init (v, width);
812 size_t n_bytes = read_bytes (r, buf);
813 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
816 v->f = read_float (r);
819 /* Parse a value label record.
   The record lists variable names and then a number of (value, label)
   pairs; every label is assigned to every listed variable.  Nothing is
   returned (the function is declared `static void' in the prototypes
   earlier in this file); problems are reported through error(), which
   does not return. */
821 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* The variable array comes from R's pool, so it is not freed here. */
833 v = pool_nalloc (r->pool, nv, sizeof *v);
834 for (i = 0; i < nv; i++)
837 read_string (r, name);
839 v[i] = dict_lookup_var (dict, name);
841 error (r, _("Unknown variable %s while parsing value labels."), name);
/* All variables in one record must share the type of the first one. */
843 if (var_get_type (v[0]) != var_get_type (v[i]))
844 error (r, _("Cannot assign value labels to %s and %s, which "
845 "have different variable types."),
846 var_get_name (v[0]), var_get_name (v[i]));
849 n_labels = read_int (r);
850 for (i = 0; i < n_labels; i++)
/* NOTE(review): only the variables' types are compared above, while
   values are parsed and destroyed using the width of v[0]; confirm
   that this is right when string variables of different widths share
   one record. */
856 parse_value (r, var_get_width (v[0]), &val);
857 read_string (r, label);
859 /* Assign the value label to each variable. */
860 for (j = 0; j < nv; j++)
861 var_replace_value_label (v[j], &val, label);
863 value_destroy (&val, var_get_width (v[0]));
867 /* Reads a set of documents from portable file R into DICT.
   The record holds a line count followed by that many strings; each
   string is appended to DICT with dict_add_document_line(). */
869 read_documents (struct pfm_reader *r, struct dictionary *dict)
874 line_cnt = read_int (r);
875 for (i = 0; i < line_cnt; i++)
878 read_string (r, line);
879 dict_add_document_line (dict, line, false);
883 /* Reads and returns one case from portable file R. Returns a
884 null pointer on failure.
   R_ is the struct pfm_reader that pfm_decode() passed as auxiliary
   data when it created the casereader.  One value is read per
   variable in R->proto: numbers with read_float(), strings with
   read_bytes() followed by padding on the right with spaces to the
   variable's width. */
885 static struct ccase *
886 por_file_casereader_read (struct casereader *reader, void *r_)
888 struct pfm_reader *r = r_;
889 struct ccase *volatile c;
892 c = case_create (r->proto);
/* Re-arm R->bail_out for this call: an error() raised by read_float()
   or read_bytes() below longjmp()s back to this point.  The result of
   setjmp() is not tested on this line.
   NOTE(review): presumably the following check, which is not visible
   in this excerpt, uses R->ok to tell the error case apart --
   confirm. */
893 setjmp (r->bail_out);
896 casereader_force_error (reader);
901 /* Check for end of file. */
908 for (i = 0; i < r->var_cnt; i++)
910 int width = caseproto_get_width (r->proto, i);
913 case_data_rw_idx (c, i)->f = read_float (r);
917 size_t n_bytes = read_bytes (r, buf);
918 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
925 /* Returns true if FILE is an SPSS portable file,
928 pfm_detect (FILE *file)
930 unsigned char header[464];
932 int cooked_cnt, raw_cnt, line_len;
935 cooked_cnt = raw_cnt = 0;
937 while (cooked_cnt < sizeof header)
940 if (c == EOF || raw_cnt++ > 512)
944 while (line_len < 80 && cooked_cnt < sizeof header)
946 header[cooked_cnt++] = ' ';
953 header[cooked_cnt++] = c;
958 memset (trans, 0, 256);
959 for (i = 64; i < 256; i++)
961 unsigned char c = header[i + 200];
963 trans[c] = portable_to_local[i];
966 for (i = 0; i < 8; i++)
967 if (trans[header[i + 456]] != "SPSSPORT"[i])
973 static const struct casereader_class por_file_casereader_class =
975 por_file_casereader_read,
976 por_file_casereader_destroy,
981 const struct any_reader_class por_file_reader_class =
983 N_("SPSS Portable File"),
988 NULL, /* get_strings */