1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/por-file-reader.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/short-names.h"
38 #include "data/value-labels.h"
39 #include "data/variable.h"
40 #include "libpspp/compiler.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/intprops.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
51 #define _(msgid) gettext (msgid)
52 #define N_(msgid) (msgid)
54 /* portable_to_local[PORTABLE] translates the given portable
55 character into the local character set. */
56 static const char portable_to_local[256] =
59 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
60 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
64 /* Portable file reader. */
67 struct pool *pool; /* All the portable file state. */
69 jmp_buf bail_out; /* longjmp() target for error handling. */
71 struct file_handle *fh; /* File handle. */
72 struct fh_lock *lock; /* Read lock for file. */
73 FILE *file; /* File stream. */
74 int line_length; /* Number of characters so far on this line. */
75 char cc; /* Current character. */
76 char *trans; /* 256-byte character set translation table. */
77 int var_cnt; /* Number of variables. */
78 int weight_index; /* 0-based index of weight variable, or -1. */
79 struct caseproto *proto; /* Format of output cases. */
80 bool ok; /* Set false on I/O error. */
83 static const struct casereader_class por_file_casereader_class;
86 error (struct pfm_reader *r, const char *msg,...)
90 /* Displays MSG as an error message and aborts reading the
91 portable file via longjmp(). */
93 error (struct pfm_reader *r, const char *msg, ...)
99 ds_init_empty (&text);
100 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
101 fh_get_file_name (r->fh), (long long int) ftello (r->file));
102 va_start (args, msg);
103 ds_put_vformat (&text, msg, args);
106 m.category = MSG_C_GENERAL;
107 m.severity = MSG_S_ERROR;
108 m.where.file_name = NULL;
109 m.where.line_number = 0;
110 m.where.first_column = 0;
111 m.where.last_column = 0;
112 m.text = ds_cstr (&text);
118 longjmp (r->bail_out, 1);
121 /* Displays MSG as a warning for the current position in
122 portable file reader R. */
124 warning (struct pfm_reader *r, const char *msg, ...)
130 ds_init_empty (&text);
131 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
132 fh_get_file_name (r->fh), (long long int) ftello (r->file));
133 va_start (args, msg);
134 ds_put_vformat (&text, msg, args);
137 m.category = MSG_C_GENERAL;
138 m.severity = MSG_S_WARNING;
139 m.where.file_name = NULL;
140 m.where.line_number = 0;
141 m.where.first_column = 0;
142 m.where.last_column = 0;
143 m.text = ds_cstr (&text);
148 /* Close and destroy R.
149 Returns false if an error was detected on R, true otherwise. */
151 close_reader (struct pfm_reader *r)
159 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
161 msg (ME, _("Error closing portable file `%s': %s."),
162 fh_get_file_name (r->fh), strerror (errno));
172 pool_destroy (r->pool);
/* Shuts down portable file reader R_ once casereader READER is
   done with it.  If closing R_ reports a problem, the error is
   recorded on READER. */
static void
por_file_casereader_destroy (struct casereader *reader, void *r_)
{
  struct pfm_reader *pfm = r_;

  if (close_reader (pfm))
    return;

  /* close_reader() returned false: propagate the failure. */
  casereader_force_error (reader);
}
186 /* Read a single character into r->cc. */
188 advance (struct pfm_reader *r)
192 /* Read the next character from the file.
193 Ignore carriage returns entirely.
194 Mostly ignore new-lines, but if a new-line occurs before the
195 line has reached 80 bytes in length, then treat the
196 "missing" bytes as spaces. */
199 while ((c = getc (r->file)) == '\r')
204 if (r->line_length < 80)
207 ungetc ('\n', r->file);
213 error (r, _("unexpected end of file"));
215 if (r->trans != NULL)
221 /* Skip a single character if present, and return whether it was
224 match (struct pfm_reader *r, int c)
235 static void read_header (struct pfm_reader *);
236 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
237 static void read_variables (struct pfm_reader *, struct dictionary *);
238 static void read_value_label (struct pfm_reader *, struct dictionary *);
239 static void read_documents (struct pfm_reader *, struct dictionary *);
241 /* Reads the dictionary from file with handle FH, and returns it in
242 *DICT. This dictionary may be modified in order to
243 rename, reorder, and delete variables, etc. */
245 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
246 struct pfm_read_info *info)
248 struct pool *volatile pool = NULL;
249 struct pfm_reader *volatile r = NULL;
251 *dict = dict_create ();
253 /* Create and initialize reader. */
254 pool = pool_create ();
255 r = pool_alloc (pool, sizeof *r);
261 r->weight_index = -1;
266 if (setjmp (r->bail_out))
270 /* TRANSLATORS: this fragment will be interpolated into
271 messages in fh_lock() that identify types of files. */
272 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
277 r->file = fn_open (fh_get_file_name (r->fh), "rb");
280 msg (ME, _("An error occurred while opening `%s' for reading "
281 "as a portable file: %s."),
282 fh_get_file_name (r->fh), strerror (errno));
286 /* Read header, version, date info, product id, variables. */
288 read_version_data (r, info);
289 read_variables (r, *dict);
291 /* Read value labels. */
292 while (match (r, 'D'))
293 read_value_label (r, *dict);
295 /* Read documents. */
297 read_documents (r, *dict);
299 /* Check that we've made it to the data. */
301 error (r, _("Data record expected."));
303 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
304 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
305 &por_file_casereader_class, r);
309 dict_destroy (*dict);
/* Returns the value of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are `0'...`9' for 0...9 and `A'...`T' for 10...29
   (upper case only). */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() treats the terminating null byte as part of the
     string, so a null C would otherwise "match" at index 30 and
     be reported as a digit. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
324 /* Read a floating point value and return its value. */
326 read_float (struct pfm_reader *r)
330 bool got_dot = false; /* Seen a decimal point? */
331 bool got_digit = false; /* Seen any digits? */
332 bool negative = false; /* Number is negative? */
334 /* Skip leading spaces. */
335 while (match (r, ' '))
338 /* `*' indicates system-missing. */
341 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
345 negative = match (r, '-');
348 int digit = base_30_value (r->cc);
353 /* Make sure that multiplication by 30 will not overflow. */
354 if (num > DBL_MAX * (1. / 30.))
355 /* The value of the digit doesn't matter, since we have already
356 gotten as many digits as can be represented in a `double'.
357 This doesn't necessarily mean the result will overflow.
358 The exponent may reduce it to within range.
360 We just need to record that there was another
361 digit so that we can multiply by 30 later. */
364 num = (num * 30.0) + digit;
366 /* Keep track of the number of digits after the decimal point.
367 If we just divided by 30 here, we would lose precision. */
371 else if (!got_dot && r->cc == '.')
372 /* Record that we have found the decimal point. */
375 /* Any other character terminates the number. */
381 /* Check that we had some digits. */
383 error (r, _("Number expected."));
385 /* Get exponent if any. */
386 if (r->cc == '+' || r->cc == '-')
389 bool negative_exponent = r->cc == '-';
392 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
394 if (exp > LONG_MAX / 30)
399 exp = exp * 30 + digit;
402 /* We don't check whether there were actually any digits, but we
404 if (negative_exponent)
409 /* Numbers must end with `/'. */
411 error (r, _("Missing numeric terminator."));
413 /* Multiply `num' by 30 to the `exponent' power, checking for
416 num *= pow (30.0, (double) exponent);
417 else if (exponent > 0)
419 if (num > DBL_MAX * pow (30.0, (double) -exponent))
422 num *= pow (30.0, (double) exponent);
425 return negative ? -num : num;
/* Reads a number from R and returns it as an `int'.

   The number is read with read_float().  If it has a fractional
   part, or does not lie strictly between INT_MIN and INT_MAX,
   the file is reported as corrupt via error(). */
static int
read_int (struct pfm_reader *r)
{
  double value = read_float (r);
  bool is_integral = floor (value) == value;
  bool in_range = value > INT_MIN && value < INT_MAX;

  if (!is_integral || !in_range)
    error (r, _("Invalid integer."));
  return value;
}
438 /* Reads a string into BUF, which must have room for 256
441 read_string (struct pfm_reader *r, char *buf)
443 int n = read_int (r);
444 if (n < 0 || n > 255)
445 error (r, _("Bad string length %d."), n);
456 /* Reads a string into BUF, which must have room for 256
458 Returns the number of bytes read.
461 read_bytes (struct pfm_reader *r, uint8_t *buf)
463 int n = read_int (r);
464 if (n < 0 || n > 255)
465 error (r, _("Bad string length %d."), n);
477 /* Reads a string and returns a copy of it allocated from R's
480 read_pool_string (struct pfm_reader *r)
483 read_string (r, string);
484 return pool_strdup (r->pool, string);
487 /* Reads the 464-byte file header. */
489 read_header (struct pfm_reader *r)
494 /* Read and ignore vanity splash strings. */
495 for (i = 0; i < 200; i++)
498 /* Skip the first 64 characters of the translation table.
499 We don't care about these. They are probably all set to
500 '0', marking them as untranslatable, and that would screw
501 up our actual translation of the real '0'. */
502 for (i = 0; i < 64; i++)
505 /* Read the rest of the translation table. */
506 trans = pool_malloc (r->pool, 256);
507 memset (trans, 0, 256);
516 trans[c] = portable_to_local[i];
519 /* Set up the translation table, then read the first
520 translated character. */
524 /* Skip and verify signature. */
525 for (i = 0; i < 8; i++)
526 if (!match (r, "SPSSPORT"[i]))
528 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
529 longjmp (r->bail_out, 1);
533 /* Reads the version and date info record, as well as product and
534 subproduct identification records if present. */
536 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
538 static const char empty_string[] = "";
540 const char *product, *author, *subproduct;
545 error (r, _("Unrecognized version code `%c'."), r->cc);
546 date = read_pool_string (r);
547 time = read_pool_string (r);
548 product = match (r, '1') ? read_pool_string (r) : empty_string;
549 author = match (r, '2') ? read_pool_string (r) : empty_string;
550 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
553 if (strlen (date) != 8)
554 error (r, _("Bad date string length %zu."), strlen (date));
555 if (strlen (time) != 6)
556 error (r, _("Bad time string length %zu."), strlen (time));
558 /* Save file info. */
562 for (i = 0; i < 8; i++)
564 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
565 info->creation_date[map[i]] = date[i];
567 info->creation_date[2] = info->creation_date[5] = ' ';
568 info->creation_date[10] = 0;
571 for (i = 0; i < 6; i++)
573 static const int map[] = {0, 1, 3, 4, 6, 7};
574 info->creation_time[map[i]] = time[i];
576 info->creation_time[2] = info->creation_time[5] = ' ';
577 info->creation_time[8] = 0;
580 str_copy_trunc (info->product, sizeof info->product, product);
581 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
585 /* Translates a format specification read from portable file R as
586 the three integers PORTABLE_FORMAT into a normal format specifier,
587 which is returned, checking that the format is appropriate for variable V. */
588 static struct fmt_spec
589 convert_format (struct pfm_reader *r, const int portable_format[3],
590 struct variable *v, bool *report_error)
592 struct fmt_spec format;
595 if (!fmt_from_io (portable_format[0], &format.type))
598 warning (r, _("%s: Bad format specifier byte (%d). Variable "
599 "will be assigned a default format."),
600 var_get_name (v), portable_format[0]);
604 format.w = portable_format[1];
605 format.d = portable_format[2];
608 ok = (fmt_check_output (&format)
609 && fmt_check_width_compat (&format, var_get_width (v)));
616 char fmt_string[FMT_STRING_LEN_MAX + 1];
617 fmt_to_string (&format, fmt_string);
618 if (var_is_numeric (v))
619 warning (r, _("Numeric variable %s has invalid format "
621 var_get_name (v), fmt_string);
623 warning (r, _("String variable %s with width %d has "
624 "invalid format specifier %s."),
625 var_get_name (v), var_get_width (v), fmt_string);
633 *report_error = false;
634 return fmt_default_for_width (var_get_width (v));
637 static void parse_value (struct pfm_reader *, int width, union value *);
639 /* Read information on all the variables. */
641 read_variables (struct pfm_reader *r, struct dictionary *dict)
643 char *weight_name = NULL;
647 error (r, _("Expected variable count record."));
649 r->var_cnt = read_int (r);
651 error (r, _("Invalid number of variables %d."), r->var_cnt);
653 /* Purpose of this value is unknown. It is typically 161. */
658 weight_name = read_pool_string (r);
659 if (strlen (weight_name) > SHORT_NAME_LEN)
660 error (r, _("Weight variable name (%s) truncated."), weight_name);
663 for (i = 0; i < r->var_cnt; i++)
669 struct missing_values miss;
670 struct fmt_spec print, write;
671 bool report_error = true;
675 error (r, _("Expected variable record."));
677 width = read_int (r);
679 error (r, _("Invalid variable width %d."), width);
681 read_string (r, name);
682 for (j = 0; j < 6; j++)
683 fmt[j] = read_int (r);
685 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
686 error (r, _("Invalid variable name `%s' in position %d."), name, i);
687 str_uppercase (name);
689 if (width < 0 || width > 255)
690 error (r, _("Bad width %d for variable %s."), width, name);
692 v = dict_create_var (dict, name, width);
698 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
699 sprintf (try_name, "%s_%lu", name, i);
700 v = dict_create_var (dict, try_name, width);
704 warning (r, _("Duplicate variable name %s in position %d renamed "
705 "to %s."), name, i, var_get_name (v));
708 print = convert_format (r, &fmt[0], v, &report_error);
709 write = convert_format (r, &fmt[3], v, &report_error);
710 var_set_print_format (v, &print);
711 var_set_write_format (v, &write);
713 /* Range missing values. */
714 mv_init (&miss, width);
717 double x = read_float (r);
718 double y = read_float (r);
719 mv_add_range (&miss, x, y);
721 else if (match (r, 'A'))
722 mv_add_range (&miss, read_float (r), HIGHEST);
723 else if (match (r, '9'))
724 mv_add_range (&miss, LOWEST, read_float (r));
726 /* Single missing values. */
727 while (match (r, '8'))
729 int mv_width = MIN (width, 8);
732 parse_value (r, mv_width, &value);
733 value_resize (&value, mv_width, width);
734 mv_add_value (&miss, &value);
735 value_destroy (&value, width);
738 var_set_missing_values (v, &miss);
744 read_string (r, label);
745 var_set_label (v, label);
749 if (weight_name != NULL)
751 struct variable *weight_var = dict_lookup_var (dict, weight_name);
752 if (weight_var == NULL)
753 error (r, _("Weighting variable %s not present in dictionary."),
756 dict_set_weight (dict, weight_var);
760 /* Parse a value of with WIDTH into value V. */
762 parse_value (struct pfm_reader *r, int width, union value *v)
764 value_init (v, width);
768 size_t n_bytes = read_bytes (r, buf);
769 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
772 v->f = read_float (r);
775 /* Parse a value label record. */
777 read_value_label (struct pfm_reader *r, struct dictionary *dict)
789 v = pool_nalloc (r->pool, nv, sizeof *v);
790 for (i = 0; i < nv; i++)
793 read_string (r, name);
795 v[i] = dict_lookup_var (dict, name);
797 error (r, _("Unknown variable %s while parsing value labels."), name);
799 if (var_get_type (v[0]) != var_get_type (v[i]))
800 error (r, _("Cannot assign value labels to %s and %s, which "
801 "have different variable types."),
802 var_get_name (v[0]), var_get_name (v[i]));
805 n_labels = read_int (r);
806 for (i = 0; i < n_labels; i++)
812 parse_value (r, var_get_width (v[0]), &val);
813 read_string (r, label);
815 /* Assign the value label to each variable. */
816 for (j = 0; j < nv; j++)
817 var_replace_value_label (v[j], &val, label);
819 value_destroy (&val, var_get_width (v[0]));
/* Reads a set of documents from portable file R into DICT.

   The record consists of a line count followed by that many
   strings, each of which is appended to DICT as one document
   line. */
static void
read_documents (struct pfm_reader *r, struct dictionary *dict)
{
  int remaining;

  for (remaining = read_int (r); remaining > 0; remaining--)
    {
      char text[256];           /* Size required by read_string(). */

      read_string (r, text);
      dict_add_document_line (dict, text);
    }
}
839 /* Reads and returns one case from portable file R. Returns a
840 null pointer on failure. */
841 static struct ccase *
842 por_file_casereader_read (struct casereader *reader, void *r_)
844 struct pfm_reader *r = r_;
845 struct ccase *volatile c;
848 c = case_create (r->proto);
849 setjmp (r->bail_out);
852 casereader_force_error (reader);
857 /* Check for end of file. */
864 for (i = 0; i < r->var_cnt; i++)
866 int width = caseproto_get_width (r->proto, i);
869 case_data_rw_idx (c, i)->f = read_float (r);
873 size_t n_bytes = read_bytes (r, buf);
874 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
881 /* Returns true if FILE is an SPSS portable file,
884 pfm_detect (FILE *file)
886 unsigned char header[464];
888 int cooked_cnt, raw_cnt, line_len;
891 cooked_cnt = raw_cnt = 0;
893 while (cooked_cnt < sizeof header)
896 if (c == EOF || raw_cnt++ > 512)
900 while (line_len < 80 && cooked_cnt < sizeof header)
902 header[cooked_cnt++] = ' ';
909 header[cooked_cnt++] = c;
914 memset (trans, 0, 256);
915 for (i = 64; i < 256; i++)
917 unsigned char c = header[i + 200];
919 trans[c] = portable_to_local[i];
922 for (i = 0; i < 8; i++)
923 if (trans[header[i + 456]] != "SPSSPORT"[i])
929 static const struct casereader_class por_file_casereader_class =
931 por_file_casereader_read,
932 por_file_casereader_destroy,