1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
28 #include "data/any-reader.h"
29 #include "data/casereader-provider.h"
30 #include "data/casereader.h"
31 #include "data/dictionary.h"
32 #include "data/file-handle-def.h"
33 #include "data/file-name.h"
34 #include "data/format.h"
35 #include "data/missing-values.h"
36 #include "data/short-names.h"
37 #include "data/value-labels.h"
38 #include "data/variable.h"
39 #include "libpspp/compiler.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
48 #include "gl/xmemdup0.h"
51 #define _(msgid) gettext (msgid)
52 #define N_(msgid) (msgid)
54 /* portable_to_local[PORTABLE] translates the given portable
55 character into the local character set. */
56 static const char portable_to_local[256] =
59 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
60 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
64 /* Portable file reader. */
67 struct any_reader any_reader;
68 struct pool *pool; /* All the portable file state. */
70 jmp_buf bail_out; /* longjmp() target for error handling. */
72 struct dictionary *dict;
73 struct any_read_info info;
74 struct file_handle *fh; /* File handle. */
75 struct fh_lock *lock; /* Read lock for file. */
76 FILE *file; /* File stream. */
77 int line_length; /* Number of characters so far on this line. */
78 char cc; /* Current character. */
79 char *trans; /* 256-byte character set translation table. */
80 int var_cnt; /* Number of variables. */
81 int weight_index; /* 0-based index of weight variable, or -1. */
82 struct caseproto *proto; /* Format of output cases. */
83 bool ok; /* Set false on I/O error. */
86 static const struct casereader_class por_file_casereader_class;
88 static struct pfm_reader *
89 pfm_reader_cast (const struct any_reader *r_)
91 assert (r_->klass == &por_file_reader_class);
92 return UP_CAST (r_, struct pfm_reader, any_reader);
96 error (struct pfm_reader *r, const char *msg,...)
100 /* Displays MSG as an error message and aborts reading the
101 portable file via longjmp(). */
103 error (struct pfm_reader *r, const char *msg, ...)
108 ds_init_empty (&text);
109 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
110 fh_get_file_name (r->fh), (long long int) ftello (r->file));
111 va_start (args, msg);
112 ds_put_vformat (&text, msg, args);
116 .category = MSG_C_GENERAL,
117 .severity = MSG_S_ERROR,
118 .text = ds_cstr (&text),
124 longjmp (r->bail_out, 1);
127 /* Displays MSG as a warning for the current position in
128 portable file reader R. */
130 warning (struct pfm_reader *r, const char *msg, ...)
135 ds_init_empty (&text);
136 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
137 fh_get_file_name (r->fh), (long long int) ftello (r->file));
138 va_start (args, msg);
139 ds_put_vformat (&text, msg, args);
143 .category = MSG_C_GENERAL,
144 .severity = MSG_S_WARNING,
145 .text = ds_cstr (&text),
150 /* Close and destroy R.
151 Returns false if an error was detected on R, true otherwise. */
153 pfm_close (struct any_reader *r_)
155 struct pfm_reader *r = pfm_reader_cast (r_);
158 dict_unref (r->dict);
159 any_read_info_destroy (&r->info);
162 if (fn_close (r->fh, r->file) == EOF)
164 msg (ME, _("Error closing portable file `%s': %s."),
165 fh_get_file_name (r->fh), strerror (errno));
175 pool_destroy (r->pool);
180 /* Closes portable file reader R, after we're done with it. */
182 por_file_casereader_destroy (struct casereader *reader, void *r_)
184 struct pfm_reader *r = r_;
185 if (!pfm_close (&r->any_reader))
186 casereader_force_error (reader);
189 /* Read a single character into R->cc. */
191 advance (struct pfm_reader *r)
195 /* Read the next character from the file.
196 Ignore carriage returns entirely.
197 Mostly ignore new-lines, but if a new-line occurs before the
198 line has reached 80 bytes in length, then treat the
199 "missing" bytes as spaces. */
202 while ((c = getc (r->file)) == '\r')
207 if (r->line_length < 80)
210 ungetc ('\n', r->file);
216 error (r, _("unexpected end of file"));
218 if (r->trans != NULL)
224 /* Skip a single character if present, and return whether it was
227 match (struct pfm_reader *r, int c)
238 static void read_header (struct pfm_reader *);
239 static void read_version_data (struct pfm_reader *, struct any_read_info *);
240 static void read_variables (struct pfm_reader *, struct dictionary *);
241 static void read_value_label (struct pfm_reader *, struct dictionary *);
242 static void read_documents (struct pfm_reader *, struct dictionary *);
244 /* Opens the portable file with handle FH and reads its dictionary, which
245 is kept in the returned reader. This dictionary may be modified in order to
246 rename, reorder, and delete variables, etc. */
247 static struct any_reader *
248 pfm_open (struct file_handle *fh)
250 struct pool *volatile pool = NULL;
251 struct pfm_reader *volatile r = NULL;
253 /* Create and initialize reader. */
254 pool = pool_create ();
255 r = pool_alloc (pool, sizeof *r);
256 r->any_reader.klass = &por_file_reader_class;
257 r->dict = dict_create (get_default_encoding ());
258 memset (&r->info, 0, sizeof r->info);
264 r->weight_index = -1;
269 if (setjmp (r->bail_out))
273 /* TRANSLATORS: this fragment will be interpolated into
274 messages in fh_lock() that identify types of files. */
275 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
280 r->file = fn_open (r->fh, "rb");
283 msg (ME, _("An error occurred while opening `%s' for reading "
284 "as a portable file: %s."),
285 fh_get_file_name (r->fh), strerror (errno));
289 /* Read header, version, date info, product id, variables. */
291 read_version_data (r, &r->info);
292 read_variables (r, r->dict);
294 /* Read value labels. */
295 while (match (r, 'D'))
296 read_value_label (r, r->dict);
298 /* Read documents. */
300 read_documents (r, r->dict);
302 /* Check that we've made it to the data. */
304 error (r, _("Data record expected."));
306 r->proto = caseproto_ref_pool (dict_get_proto (r->dict), r->pool);
307 return &r->any_reader;
310 pfm_close (&r->any_reader);
314 static struct casereader *
315 pfm_decode (struct any_reader *r_, const char *encoding UNUSED,
316 struct dictionary **dictp, struct any_read_info *info)
318 struct pfm_reader *r = pfm_reader_cast (r_);
326 memset (&r->info, 0, sizeof r->info);
329 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
330 &por_file_casereader_class, r);
333 /* Returns the value of base-30 digit C,
334 or -1 if C is not a base-30 digit. */
336 base_30_value (unsigned char c)
338 static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
339 const char *p = strchr (base_30_digits, c);
340 return p != NULL ? p - base_30_digits : -1;
343 /* Read a floating point value and return its value. */
345 read_float (struct pfm_reader *r)
349 bool got_dot = false; /* Seen a decimal point? */
350 bool got_digit = false; /* Seen any digits? */
351 bool negative = false; /* Number is negative? */
353 /* Skip leading spaces. */
354 while (match (r, ' '))
357 /* `*' indicates system-missing. */
360 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
364 negative = match (r, '-');
367 int digit = base_30_value (r->cc);
372 /* Make sure that multiplication by 30 will not overflow. */
373 if (num > DBL_MAX * (1. / 30.))
374 /* The value of the digit doesn't matter, since we have already
375 gotten as many digits as can be represented in a `double'.
376 This doesn't necessarily mean the result will overflow.
377 The exponent may reduce it to within range.
379 We just need to record that there was another
380 digit so that we can multiply by 30 later. */
383 num = (num * 30.0) + digit;
385 /* Keep track of the number of digits after the decimal point.
386 If we just divided by 30 here, we would lose precision. */
390 else if (!got_dot && r->cc == '.')
391 /* Record that we have found the decimal point. */
394 /* Any other character terminates the number. */
400 /* Check that we had some digits. */
402 error (r, _("Number expected."));
404 /* Get exponent if any. */
405 if (r->cc == '+' || r->cc == '-')
408 bool negative_exponent = r->cc == '-';
411 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
413 if (exp > LONG_MAX / 30)
418 exp = exp * 30 + digit;
421 /* We don't check whether there were actually any digits, but we
423 if (negative_exponent)
428 /* Numbers must end with `/'. */
430 error (r, _("Missing numeric terminator."));
432 /* Multiply `num' by 30 to the `exponent' power, checking for
435 num *= pow (30.0, (double) exponent);
436 else if (exponent > 0)
438 if (num > DBL_MAX * pow (30.0, (double) -exponent))
441 num *= pow (30.0, (double) exponent);
444 return negative ? -num : num;
447 /* Read an integer and return its value. */
449 read_int (struct pfm_reader *r)
451 double f = read_float (r);
452 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
453 error (r, _("Invalid integer."));
457 /* Reads a string into BUF, which must have room for 256
460 read_string (struct pfm_reader *r, char *buf)
462 int n = read_int (r);
463 if (n < 0 || n > 255)
464 error (r, _("Bad string length %d."), n);
475 /* Reads a string into BUF, which must have room for 256
477 Returns the number of bytes read.
480 read_bytes (struct pfm_reader *r, uint8_t *buf)
482 int n = read_int (r);
483 if (n < 0 || n > 255)
484 error (r, _("Bad string length %d."), n);
496 /* Reads a string and returns a copy of it allocated from R's
499 read_pool_string (struct pfm_reader *r)
502 read_string (r, string);
503 return pool_strdup (r->pool, string);
506 /* Reads the 464-byte file header. */
508 read_header (struct pfm_reader *r)
513 /* Read and ignore vanity splash strings. */
514 for (i = 0; i < 200; i++)
517 /* Skip the first 64 characters of the translation table.
518 We don't care about these. They are probably all set to
519 '0', marking them as untranslatable, and that would screw
520 up our actual translation of the real '0'. */
521 for (i = 0; i < 64; i++)
524 /* Read the rest of the translation table. */
525 trans = pool_malloc (r->pool, 256);
526 memset (trans, 0, 256);
535 trans[c] = portable_to_local[i];
538 /* Set up the translation table, then read the first
539 translated character. */
543 /* Skip and verify signature. */
544 for (i = 0; i < 8; i++)
545 if (!match (r, "SPSSPORT"[i]))
547 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
548 longjmp (r->bail_out, 1);
552 /* Reads the version and date info record, as well as product and
553 subproduct identification records if present. */
555 read_version_data (struct pfm_reader *r, struct any_read_info *info)
557 static const char empty_string[] = "";
559 const char *product, *subproduct;
564 error (r, _("Unrecognized version code `%c'."), r->cc);
565 date = read_pool_string (r);
566 time = read_pool_string (r);
567 product = match (r, '1') ? read_pool_string (r) : empty_string;
570 /* Skip "author" field. */
571 read_pool_string (r);
573 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
576 if (strlen (date) != 8)
577 error (r, _("Bad date string length %zu."), strlen (date));
578 if (strlen (time) != 6)
579 error (r, _("Bad time string length %zu."), strlen (time));
581 /* Save file info. */
584 memset (info, 0, sizeof *info);
586 info->float_format = FLOAT_NATIVE_DOUBLE;
587 info->integer_format = INTEGER_NATIVE;
588 info->compression = ANY_COMP_NONE;
592 info->creation_date = xmalloc (11);
593 for (i = 0; i < 8; i++)
595 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
596 info->creation_date[map[i]] = date[i];
598 info->creation_date[2] = info->creation_date[5] = ' ';
599 info->creation_date[10] = '\0';
602 info->creation_time = xmalloc (9);
603 for (i = 0; i < 6; i++)
605 static const int map[] = {0, 1, 3, 4, 6, 7};
606 info->creation_time[map[i]] = time[i];
608 info->creation_time[2] = info->creation_time[5] = ' ';
609 info->creation_time[8] = 0;
612 info->product = xstrdup (product);
613 info->product_ext = xstrdup (subproduct);
617 /* Translates a format specification read from portable file R as
618 the three integers PORTABLE_FORMAT into a normal format specifier, which
619 is returned, checking that the format is appropriate for variable V. */
620 static struct fmt_spec
621 convert_format (struct pfm_reader *r, const int portable_format[3],
622 struct variable *v, bool *report_error)
624 struct fmt_spec format;
627 if (!fmt_from_io (portable_format[0], &format.type))
630 warning (r, _("%s: Bad format specifier byte (%d). Variable "
631 "will be assigned a default format."),
632 var_get_name (v), portable_format[0]);
636 format.w = portable_format[1];
637 format.d = portable_format[2];
640 ok = (fmt_check_output (&format)
641 && fmt_check_width_compat (&format, var_get_width (v)));
648 char fmt_string[FMT_STRING_LEN_MAX + 1];
649 fmt_to_string (&format, fmt_string);
650 if (var_is_numeric (v))
651 warning (r, _("Numeric variable %s has invalid format "
653 var_get_name (v), fmt_string);
655 warning (r, _("String variable %s with width %d has "
656 "invalid format specifier %s."),
657 var_get_name (v), var_get_width (v), fmt_string);
665 *report_error = false;
666 return fmt_default_for_width (var_get_width (v));
669 static void parse_value (struct pfm_reader *, int width, union value *);
671 /* Read information on all the variables. */
673 read_variables (struct pfm_reader *r, struct dictionary *dict)
675 char *weight_name = NULL;
679 error (r, _("Expected variable count record."));
681 r->var_cnt = read_int (r);
683 error (r, _("Invalid number of variables %d."), r->var_cnt);
690 weight_name = read_pool_string (r);
691 if (strlen (weight_name) > SHORT_NAME_LEN)
692 error (r, _("Weight variable name (%s) truncated."), weight_name);
695 for (i = 0; i < r->var_cnt; i++)
701 struct missing_values miss;
702 struct fmt_spec print, write;
703 bool report_error = true;
707 error (r, _("Expected variable record."));
709 width = read_int (r);
711 error (r, _("Invalid variable width %d."), width);
713 read_string (r, name);
714 for (j = 0; j < 6; j++)
715 fmt[j] = read_int (r);
717 if (!dict_id_is_valid (dict, name, false)
718 || *name == '#' || *name == '$')
719 error (r, _("Invalid variable name `%s' in position %d."), name, i);
720 str_uppercase (name);
722 if (width < 0 || width > 255)
723 error (r, _("Bad width %d for variable %s."), width, name);
725 v = dict_create_var (dict, name, width);
731 char *try_name = xasprintf ("%s_%lu", name, i);
732 v = dict_create_var (dict, try_name, width);
737 warning (r, _("Duplicate variable name %s in position %d renamed "
738 "to %s."), name, i, var_get_name (v));
741 print = convert_format (r, &fmt[0], v, &report_error);
742 write = convert_format (r, &fmt[3], v, &report_error);
743 var_set_print_format (v, &print);
744 var_set_write_format (v, &write);
746 /* Range missing values. */
747 mv_init (&miss, width);
750 double x = read_float (r);
751 double y = read_float (r);
752 mv_add_range (&miss, x, y);
754 else if (match (r, 'A'))
755 mv_add_range (&miss, read_float (r), HIGHEST);
756 else if (match (r, '9'))
757 mv_add_range (&miss, LOWEST, read_float (r));
759 /* Single missing values. */
760 while (match (r, '8'))
762 int mv_width = MIN (width, 8);
765 parse_value (r, mv_width, &value);
766 value_resize (&value, mv_width, width);
767 mv_add_value (&miss, &value);
768 value_destroy (&value, width);
771 var_set_missing_values (v, &miss);
777 read_string (r, label);
778 var_set_label (v, label); /* XXX */
782 if (weight_name != NULL)
784 struct variable *weight_var = dict_lookup_var (dict, weight_name);
785 if (weight_var == NULL)
786 error (r, _("Weighting variable %s not present in dictionary."),
789 dict_set_weight (dict, weight_var);
793 /* Parse a value of width WIDTH into value V. */
795 parse_value (struct pfm_reader *r, int width, union value *v)
797 value_init (v, width);
801 size_t n_bytes = read_bytes (r, buf);
802 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
805 v->f = read_float (r);
808 /* Parse a value label record and assign its labels to the named variables in DICT. */
810 read_value_label (struct pfm_reader *r, struct dictionary *dict)
822 v = pool_nalloc (r->pool, nv, sizeof *v);
823 for (i = 0; i < nv; i++)
826 read_string (r, name);
828 v[i] = dict_lookup_var (dict, name);
830 error (r, _("Unknown variable %s while parsing value labels."), name);
832 if (var_get_type (v[0]) != var_get_type (v[i]))
833 error (r, _("Cannot assign value labels to %s and %s, which "
834 "have different variable types."),
835 var_get_name (v[0]), var_get_name (v[i]));
838 n_labels = read_int (r);
839 for (i = 0; i < n_labels; i++)
845 parse_value (r, var_get_width (v[0]), &val);
846 read_string (r, label);
848 /* Assign the value label to each variable. */
849 for (j = 0; j < nv; j++)
850 var_replace_value_label (v[j], &val, label);
852 value_destroy (&val, var_get_width (v[0]));
856 /* Reads a set of documents from portable file R into DICT. */
858 read_documents (struct pfm_reader *r, struct dictionary *dict)
863 line_cnt = read_int (r);
864 for (i = 0; i < line_cnt; i++)
867 read_string (r, line);
868 dict_add_document_line (dict, line, false);
872 /* Reads and returns one case from portable file R. Returns a
873 null pointer on failure. */
874 static struct ccase *
875 por_file_casereader_read (struct casereader *reader, void *r_)
877 struct pfm_reader *r = r_;
878 struct ccase *volatile c;
881 c = case_create (r->proto);
882 setjmp (r->bail_out);
885 casereader_force_error (reader);
890 /* Check for end of file. */
897 for (i = 0; i < r->var_cnt; i++)
899 int width = caseproto_get_width (r->proto, i);
902 case_data_rw_idx (c, i)->f = read_float (r);
906 size_t n_bytes = read_bytes (r, buf);
907 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
914 /* Detects whether FILE is an SPSS portable file. Returns 1 if so, 0 if not,
915 and a negative errno value if there is an error reading FILE. */
917 pfm_detect (FILE *file)
919 unsigned char header[464];
921 int cooked_cnt, raw_cnt, line_len;
924 cooked_cnt = raw_cnt = 0;
926 while (cooked_cnt < sizeof header)
929 if (c == EOF || raw_cnt++ > 512)
930 return ferror (file) ? -errno : 0;
933 while (line_len < 80 && cooked_cnt < sizeof header)
935 header[cooked_cnt++] = ' ';
942 header[cooked_cnt++] = c;
947 memset (trans, 0, 256);
948 for (i = 64; i < 256; i++)
950 unsigned char c = header[i + 200];
952 trans[c] = portable_to_local[i];
955 for (i = 0; i < 8; i++)
956 if (trans[header[i + 456]] != "SPSSPORT"[i])
962 static const struct casereader_class por_file_casereader_class =
964 por_file_casereader_read,
965 por_file_casereader_destroy,
970 const struct any_reader_class por_file_reader_class =
972 N_("SPSS Portable File"),
977 NULL, /* get_strings */