1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/por-file-reader.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/short-names.h"
38 #include "data/value-labels.h"
39 #include "data/variable.h"
40 #include "libpspp/compiler.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/intprops.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
51 #define _(msgid) gettext (msgid)
52 #define N_(msgid) (msgid)
54 /* portable_to_local[PORTABLE] translates the given portable
55 character into the local character set. */
56 static const char portable_to_local[256] =
59 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
60 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
64 /* Portable file reader. */
67 struct pool *pool; /* All the portable file state. */
69 jmp_buf bail_out; /* longjmp() target for error handling. */
71 struct file_handle *fh; /* File handle. */
72 struct fh_lock *lock; /* Read lock for file. */
73 FILE *file; /* File stream. */
74 int line_length; /* Number of characters so far on this line. */
75 char cc; /* Current character. */
76 char *trans; /* 256-byte character set translation table. */
77 int var_cnt; /* Number of variables. */
78 int weight_index; /* 0-based index of weight variable, or -1. */
79 struct caseproto *proto; /* Format of output cases. */
80 bool ok; /* Set false on I/O error. */
83 static const struct casereader_class por_file_casereader_class;
86 error (struct pfm_reader *r, const char *msg,...)
90 /* Displays MSG as an error message and aborts reading the
91 portable file via longjmp(). */
93 error (struct pfm_reader *r, const char *msg, ...)
99 ds_init_empty (&text);
100 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
101 fh_get_file_name (r->fh), (long long int) ftello (r->file));
102 va_start (args, msg);
103 ds_put_vformat (&text, msg, args);
106 m.category = MSG_C_GENERAL;
107 m.severity = MSG_S_ERROR;
113 m.text = ds_cstr (&text);
119 longjmp (r->bail_out, 1);
122 /* Displays MSG as a warning for the current position in
123 portable file reader R. */
125 warning (struct pfm_reader *r, const char *msg, ...)
131 ds_init_empty (&text);
132 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
133 fh_get_file_name (r->fh), (long long int) ftello (r->file));
134 va_start (args, msg);
135 ds_put_vformat (&text, msg, args);
138 m.category = MSG_C_GENERAL;
139 m.severity = MSG_S_WARNING;
145 m.text = ds_cstr (&text);
150 /* Close and destroy R.
151 Returns false if an error was detected on R, true otherwise. */
153 close_reader (struct pfm_reader *r)
161 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
163 msg (ME, _("Error closing portable file `%s': %s."),
164 fh_get_file_name (r->fh), strerror (errno));
174 pool_destroy (r->pool);
/* Casereader "destroy" callback for portable files: shuts down the
   reader state R_ once READER is finished with it.  If closing
   reports a failure, READER is marked as having had an error. */
static void
por_file_casereader_destroy (struct casereader *reader, void *r_)
{
  struct pfm_reader *pfm = r_;
  bool closed_cleanly = close_reader (pfm);

  if (closed_cleanly)
    return;

  casereader_force_error (reader);
}
188 /* Reads the next character from the file into R->cc. */
190 advance (struct pfm_reader *r)
194 /* Read the next character from the file.
195 Ignore carriage returns entirely.
196 Mostly ignore new-lines, but if a new-line occurs before the
197 line has reached 80 bytes in length, then treat the
198 "missing" bytes as spaces. */
201 while ((c = getc (r->file)) == '\r')
206 if (r->line_length < 80)
209 ungetc ('\n', r->file);
215 error (r, _("unexpected end of file"));
217 if (r->trans != NULL)
223 /* Skip a single character if present, and return whether it was
226 match (struct pfm_reader *r, int c)
237 static void read_header (struct pfm_reader *);
238 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
239 static void read_variables (struct pfm_reader *, struct dictionary *);
240 static void read_value_label (struct pfm_reader *, struct dictionary *);
241 static void read_documents (struct pfm_reader *, struct dictionary *);
243 /* Reads the dictionary from the file with handle FH, stores it in *DICT,
244 and returns a casereader for the file. This dictionary may be modified in order to
245 rename, reorder, and delete variables, etc. */
247 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
248 struct pfm_read_info *info)
250 struct pool *volatile pool = NULL;
251 struct pfm_reader *volatile r = NULL;
253 *dict = dict_create ();
255 /* Create and initialize reader. */
256 pool = pool_create ();
257 r = pool_alloc (pool, sizeof *r);
263 r->weight_index = -1;
268 if (setjmp (r->bail_out))
272 /* TRANSLATORS: this fragment will be interpolated into
273 messages in fh_lock() that identify types of files. */
274 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
279 r->file = fn_open (fh_get_file_name (r->fh), "rb");
282 msg (ME, _("An error occurred while opening `%s' for reading "
283 "as a portable file: %s."),
284 fh_get_file_name (r->fh), strerror (errno));
288 /* Read header, version, date info, product id, variables. */
290 read_version_data (r, info);
291 read_variables (r, *dict);
293 /* Read value labels. */
294 while (match (r, 'D'))
295 read_value_label (r, *dict);
297 /* Read documents. */
299 read_documents (r, *dict);
301 /* Check that we've made it to the data. */
303 error (r, _("Data record expected."));
305 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
306 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
307 &por_file_casereader_class, r);
311 dict_destroy (*dict);
/* Returns the value of base-30 digit C (0...29 for `0'...`9',
   `A'...`T'), or -1 if C is not a base-30 digit.

   A null byte is never a digit.  It must be rejected explicitly,
   because strchr() considers the terminating null byte to be part
   of the string and would otherwise report it as digit 30. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* Untranslatable input characters show up here as 0. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
326 /* Read a floating point value and return its value. */
328 read_float (struct pfm_reader *r)
332 bool got_dot = false; /* Seen a decimal point? */
333 bool got_digit = false; /* Seen any digits? */
334 bool negative = false; /* Number is negative? */
336 /* Skip leading spaces. */
337 while (match (r, ' '))
340 /* `*' indicates system-missing. */
343 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
347 negative = match (r, '-');
350 int digit = base_30_value (r->cc);
355 /* Make sure that multiplication by 30 will not overflow. */
356 if (num > DBL_MAX * (1. / 30.))
357 /* The value of the digit doesn't matter, since we have already
358 gotten as many digits as can be represented in a `double'.
359 This doesn't necessarily mean the result will overflow.
360 The exponent may reduce it to within range.
362 We just need to record that there was another
363 digit so that we can multiply by 30 later. */
366 num = (num * 30.0) + digit;
368 /* Keep track of the number of digits after the decimal point.
369 If we just divided by 30 here, we would lose precision. */
373 else if (!got_dot && r->cc == '.')
374 /* Record that we have found the decimal point. */
377 /* Any other character terminates the number. */
383 /* Check that we had some digits. */
385 error (r, _("Number expected."));
387 /* Get exponent if any. */
388 if (r->cc == '+' || r->cc == '-')
391 bool negative_exponent = r->cc == '-';
394 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
396 if (exp > LONG_MAX / 30)
401 exp = exp * 30 + digit;
404 /* We don't check whether there were actually any digits, but we
406 if (negative_exponent)
411 /* Numbers must end with `/'. */
413 error (r, _("Missing numeric terminator."));
415 /* Multiply `num' by 30 to the `exponent' power, checking for
418 num *= pow (30.0, (double) exponent);
419 else if (exponent > 0)
421 if (num > DBL_MAX * pow (30.0, (double) -exponent))
424 num *= pow (30.0, (double) exponent);
427 return negative ? -num : num;
430 /* Read an integer and return its value. */
432 read_int (struct pfm_reader *r)
434 double f = read_float (r);
435 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
436 error (r, _("Invalid integer."));
440 /* Reads a string into BUF, which must have room for 256
443 read_string (struct pfm_reader *r, char *buf)
445 int n = read_int (r);
446 if (n < 0 || n > 255)
447 error (r, _("Bad string length %d."), n);
458 /* Reads a string into BUF, which must have room for 256
460 Returns the number of bytes read.
463 read_bytes (struct pfm_reader *r, uint8_t *buf)
465 int n = read_int (r);
466 if (n < 0 || n > 255)
467 error (r, _("Bad string length %d."), n);
479 /* Reads a string and returns a copy of it allocated from R's
482 read_pool_string (struct pfm_reader *r)
485 read_string (r, string);
486 return pool_strdup (r->pool, string);
489 /* Reads the 464-byte file header. */
491 read_header (struct pfm_reader *r)
496 /* Read and ignore vanity splash strings. */
497 for (i = 0; i < 200; i++)
500 /* Skip the first 64 characters of the translation table.
501 We don't care about these. They are probably all set to
502 '0', marking them as untranslatable, and that would screw
503 up our actual translation of the real '0'. */
504 for (i = 0; i < 64; i++)
507 /* Read the rest of the translation table. */
508 trans = pool_malloc (r->pool, 256);
509 memset (trans, 0, 256);
518 trans[c] = portable_to_local[i];
521 /* Set up the translation table, then read the first
522 translated character. */
526 /* Skip and verify signature. */
527 for (i = 0; i < 8; i++)
528 if (!match (r, "SPSSPORT"[i]))
530 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
531 longjmp (r->bail_out, 1);
535 /* Reads the version and date info record, as well as product and
536 subproduct identification records if present. */
538 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
540 static const char empty_string[] = "";
542 const char *product, *author, *subproduct;
547 error (r, _("Unrecognized version code `%c'."), r->cc);
548 date = read_pool_string (r);
549 time = read_pool_string (r);
550 product = match (r, '1') ? read_pool_string (r) : empty_string;
551 author = match (r, '2') ? read_pool_string (r) : empty_string;
552 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
555 if (strlen (date) != 8)
556 error (r, _("Bad date string length %zu."), strlen (date));
557 if (strlen (time) != 6)
558 error (r, _("Bad time string length %zu."), strlen (time));
560 /* Save file info. */
564 for (i = 0; i < 8; i++)
566 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
567 info->creation_date[map[i]] = date[i];
569 info->creation_date[2] = info->creation_date[5] = ' ';
570 info->creation_date[10] = 0;
573 for (i = 0; i < 6; i++)
575 static const int map[] = {0, 1, 3, 4, 6, 7};
576 info->creation_time[map[i]] = time[i];
578 info->creation_time[2] = info->creation_time[5] = ' ';
579 info->creation_time[8] = 0;
582 str_copy_trunc (info->product, sizeof info->product, product);
583 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
587 /* Translates a format specification read from portable file R as
588 the three integers PORTABLE_FORMAT into a normal format specifier, which
589 is returned, checking that the format is appropriate for variable V. */
590 static struct fmt_spec
591 convert_format (struct pfm_reader *r, const int portable_format[3],
592 struct variable *v, bool *report_error)
594 struct fmt_spec format;
597 if (!fmt_from_io (portable_format[0], &format.type))
600 warning (r, _("%s: Bad format specifier byte (%d). Variable "
601 "will be assigned a default format."),
602 var_get_name (v), portable_format[0]);
606 format.w = portable_format[1];
607 format.d = portable_format[2];
610 ok = (fmt_check_output (&format)
611 && fmt_check_width_compat (&format, var_get_width (v)));
618 char fmt_string[FMT_STRING_LEN_MAX + 1];
619 fmt_to_string (&format, fmt_string);
620 if (var_is_numeric (v))
621 warning (r, _("Numeric variable %s has invalid format "
623 var_get_name (v), fmt_string);
625 warning (r, _("String variable %s with width %d has "
626 "invalid format specifier %s."),
627 var_get_name (v), var_get_width (v), fmt_string);
635 *report_error = false;
636 return fmt_default_for_width (var_get_width (v));
639 static void parse_value (struct pfm_reader *, int width, union value *);
641 /* Read information on all the variables. */
643 read_variables (struct pfm_reader *r, struct dictionary *dict)
645 char *weight_name = NULL;
649 error (r, _("Expected variable count record."));
651 r->var_cnt = read_int (r);
653 error (r, _("Invalid number of variables %d."), r->var_cnt);
655 /* Purpose of this value is unknown. It is typically 161. */
660 weight_name = read_pool_string (r);
661 if (strlen (weight_name) > SHORT_NAME_LEN)
662 error (r, _("Weight variable name (%s) truncated."), weight_name);
665 for (i = 0; i < r->var_cnt; i++)
671 struct missing_values miss;
672 struct fmt_spec print, write;
673 bool report_error = true;
677 error (r, _("Expected variable record."));
679 width = read_int (r);
681 error (r, _("Invalid variable width %d."), width);
683 read_string (r, name);
684 for (j = 0; j < 6; j++)
685 fmt[j] = read_int (r);
687 if (!dict_id_is_valid (dict, name, false)
688 || *name == '#' || *name == '$')
689 error (r, _("Invalid variable name `%s' in position %d."), name, i);
690 str_uppercase (name);
692 if (width < 0 || width > 255)
693 error (r, _("Bad width %d for variable %s."), width, name);
695 v = dict_create_var (dict, name, width);
701 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
702 sprintf (try_name, "%s_%lu", name, i);
703 v = dict_create_var (dict, try_name, width);
707 warning (r, _("Duplicate variable name %s in position %d renamed "
708 "to %s."), name, i, var_get_name (v));
711 print = convert_format (r, &fmt[0], v, &report_error);
712 write = convert_format (r, &fmt[3], v, &report_error);
713 var_set_print_format (v, &print);
714 var_set_write_format (v, &write);
716 /* Range missing values. */
717 mv_init (&miss, width);
720 double x = read_float (r);
721 double y = read_float (r);
722 mv_add_range (&miss, x, y);
724 else if (match (r, 'A'))
725 mv_add_range (&miss, read_float (r), HIGHEST);
726 else if (match (r, '9'))
727 mv_add_range (&miss, LOWEST, read_float (r));
729 /* Single missing values. */
730 while (match (r, '8'))
732 int mv_width = MIN (width, 8);
735 parse_value (r, mv_width, &value);
736 value_resize (&value, mv_width, width);
737 mv_add_value (&miss, &value);
738 value_destroy (&value, width);
741 var_set_missing_values (v, &miss);
747 read_string (r, label);
748 var_set_label (v, label, NULL, false); /* XXX */
752 if (weight_name != NULL)
754 struct variable *weight_var = dict_lookup_var (dict, weight_name);
755 if (weight_var == NULL)
756 error (r, _("Weighting variable %s not present in dictionary."),
759 dict_set_weight (dict, weight_var);
763 /* Parse a value of width WIDTH into value V. */
765 parse_value (struct pfm_reader *r, int width, union value *v)
767 value_init (v, width);
771 size_t n_bytes = read_bytes (r, buf);
772 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
775 v->f = read_float (r);
778 /* Parse a value label record, bailing out via error() if it is malformed. */
780 read_value_label (struct pfm_reader *r, struct dictionary *dict)
792 v = pool_nalloc (r->pool, nv, sizeof *v);
793 for (i = 0; i < nv; i++)
796 read_string (r, name);
798 v[i] = dict_lookup_var (dict, name);
800 error (r, _("Unknown variable %s while parsing value labels."), name);
802 if (var_get_type (v[0]) != var_get_type (v[i]))
803 error (r, _("Cannot assign value labels to %s and %s, which "
804 "have different variable types."),
805 var_get_name (v[0]), var_get_name (v[i]));
808 n_labels = read_int (r);
809 for (i = 0; i < n_labels; i++)
815 parse_value (r, var_get_width (v[0]), &val);
816 read_string (r, label);
818 /* Assign the value label to each variable. */
819 for (j = 0; j < nv; j++)
820 var_replace_value_label (v[j], &val, label);
822 value_destroy (&val, var_get_width (v[0]));
826 /* Reads a set of documents from portable file R into DICT. */
828 read_documents (struct pfm_reader *r, struct dictionary *dict)
833 line_cnt = read_int (r);
834 for (i = 0; i < line_cnt; i++)
837 read_string (r, line);
838 dict_add_document_line (dict, line, false);
842 /* Reads and returns one case from portable file R. Returns a
843 null pointer on failure. */
844 static struct ccase *
845 por_file_casereader_read (struct casereader *reader, void *r_)
847 struct pfm_reader *r = r_;
848 struct ccase *volatile c;
851 c = case_create (r->proto);
852 setjmp (r->bail_out);
855 casereader_force_error (reader);
860 /* Check for end of file. */
867 for (i = 0; i < r->var_cnt; i++)
869 int width = caseproto_get_width (r->proto, i);
872 case_data_rw_idx (c, i)->f = read_float (r);
876 size_t n_bytes = read_bytes (r, buf);
877 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
884 /* Returns true if FILE is an SPSS portable file,
887 pfm_detect (FILE *file)
889 unsigned char header[464];
891 int cooked_cnt, raw_cnt, line_len;
894 cooked_cnt = raw_cnt = 0;
896 while (cooked_cnt < sizeof header)
899 if (c == EOF || raw_cnt++ > 512)
903 while (line_len < 80 && cooked_cnt < sizeof header)
905 header[cooked_cnt++] = ' ';
912 header[cooked_cnt++] = c;
917 memset (trans, 0, 256);
918 for (i = 64; i < 256; i++)
920 unsigned char c = header[i + 200];
922 trans[c] = portable_to_local[i];
925 for (i = 0; i < 8; i++)
926 if (trans[header[i + 456]] != "SPSSPORT"[i])
932 static const struct casereader_class por_file_casereader_class =
934 por_file_casereader_read,
935 por_file_casereader_destroy,