1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/short-names.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <libpspp/misc.h>
42 #include <libpspp/pool.h>
43 #include <libpspp/str.h>
49 #define _(msgid) gettext (msgid)
50 #define N_(msgid) (msgid)
52 /* portable_to_local[PORTABLE] translates the given portable
53 character into the local character set. */
54 static const char portable_to_local[256] =
57 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
58 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
/* State kept for one open portable file. */
struct pfm_reader
  {
    struct pool *pool;          /* Pool holding all of the reader's state. */

    jmp_buf bail_out;           /* Where longjmp() lands on a read error. */

    struct file_handle *fh;     /* Handle of the file being read. */
    struct fh_lock *lock;       /* Read lock held on the file. */
    FILE *file;                 /* Underlying stdio stream. */
    int line_length;            /* Characters consumed so far on this line. */
    char cc;                    /* Most recently read character. */
    char *trans;                /* Character translation table (256 bytes). */
    int var_cnt;                /* How many variables the file has. */
    int weight_index;           /* Weight variable's 0-based index, or -1. */
    struct caseproto *proto;    /* Layout of the cases produced. */
    bool ok;                    /* Cleared when an I/O error occurs. */
  };
81 static const struct casereader_class por_file_casereader_class;
84 error (struct pfm_reader *r, const char *msg,...)
88 /* Reports that portable file R is corrupt and aborts reading it.
89 MSG is a printf-style format string describing the problem,
   followed by its arguments.  The message text is prefixed with
   the file name and the current file offset.  Control then
   transfers to R->bail_out via longjmp(), so this function never
   returns to its caller. */
91 error (struct pfm_reader *r, const char *msg, ...)
/* Begin with a prefix that locates the problem within the file. */
97 ds_init_empty (&text);
98 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
99 fh_get_file_name (r->fh), (long long int) ftello (r->file));
/* Append the caller's formatted details. */
100 va_start (args, msg);
101 ds_put_vformat (&text, msg, args);
/* Describe the message: an error that has no source-file
   location of its own (the offset is already in the text). */
104 m.category = MSG_C_GENERAL;
105 m.severity = MSG_S_ERROR;
106 m.where.file_name = NULL;
107 m.where.line_number = 0;
108 m.text = ds_cstr (&text);
/* Unwind to the most recent setjmp() on R->bail_out. */
114 longjmp (r->bail_out, 1);
117 /* Displays MSG as a warning for the current position in
118 portable file reader R.  MSG is a printf-style format string
   followed by its arguments.  The message text is prefixed with
   the file name and the current file offset. */
120 warning (struct pfm_reader *r, const char *msg, ...)
/* Begin with a prefix that locates the problem within the file. */
126 ds_init_empty (&text);
127 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
128 fh_get_file_name (r->fh), (long long int) ftello (r->file));
/* Append the caller's formatted details. */
129 va_start (args, msg);
130 ds_put_vformat (&text, msg, args);
/* Describe the message: a warning that has no source-file
   location of its own (the offset is already in the text). */
133 m.category = MSG_C_GENERAL;
134 m.severity = MSG_S_WARNING;
135 m.where.file_name = NULL;
136 m.where.line_number = 0;
137 m.text = ds_cstr (&text);
142 /* Closes and destroys R.
143 Returns false if an error was detected on R, true otherwise.
   R must not be used after this call: its pool is destroyed here,
   and pfm_open_reader() allocates R itself from a pool. */
145 close_reader (struct pfm_reader *r)
/* A failed close is reported to the user along with the system's
   description of errno. */
153 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
155 msg (ME, _("Error closing portable file \"%s\": %s."),
156 fh_get_file_name (r->fh), strerror (errno));
/* Release everything that was allocated from R's pool. */
166 pool_destroy (r->pool);
/* Casereader "destroy" callback: shuts down portable file reader
   R_ once READER is finished with it.  If closing R_ reports a
   problem, the error is propagated to READER. */
static void
por_file_casereader_destroy (struct casereader *reader, void *r_)
{
  struct pfm_reader *r = r_;
  bool closed_cleanly = close_reader (r);

  if (closed_cleanly)
    return;
  casereader_force_error (reader);
}
180 /* Reads the next character from R's file, making it the reader's
   current character (presumably R->cc; the assignment is not
   visible in this view).  Reports an error, which does not
   return, if end of file is reached. */
182 advance (struct pfm_reader *r)
186 /* Read the next character from the file.
187 Ignore carriage returns entirely.
188 Mostly ignore new-lines, but if a new-line occurs before the
189 line has reached 80 bytes in length, then treat the
190 "missing" bytes as spaces. */
193 while ((c = getc (r->file)) == '\r')
198 if (r->line_length < 80)
/* Push the new-line back so that the next read sees it again. */
201 ungetc ('\n', r->file);
207 error (r, _("unexpected end of file"));
/* Until a translation table has been installed, R->trans is
   null and this test fails. */
209 if (r->trans != NULL)
215 /* Skip a single character if present, and return whether it was
   skipped.  C is the character to look for. */
218 match (struct pfm_reader *r, int c)
229 static void read_header (struct pfm_reader *);
230 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
231 static void read_variables (struct pfm_reader *, struct dictionary *);
232 static void read_value_label (struct pfm_reader *, struct dictionary *);
233 static void read_documents (struct pfm_reader *, struct dictionary *);
235 /* Reads the dictionary from the file with handle FH, and stores it in
236 *DICT as a new dictionary structure. This dictionary may be modified in order to
237 rename, reorder, and delete variables, etc.
   INFO is passed to read_version_data() to be filled in.
   On success, returns a sequential casereader that reads the
   file's cases.  On the failure path visible here, the new
   dictionary is destroyed again. */
239 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
240 struct pfm_read_info *info)
/* Declared volatile, presumably so that their values stay
   well-defined if error() longjmp()s back into this function. */
242 struct pool *volatile pool = NULL;
243 struct pfm_reader *volatile r = NULL;
245 *dict = dict_create ();
247 /* Create and initialize reader.  The reader itself is allocated
   from the pool. */
248 pool = pool_create ();
249 r = pool_alloc (pool, sizeof *r);
255 r->weight_index = -1;
/* Every error() call made while parsing below returns control
   to this point with a nonzero value. */
260 if (setjmp (r->bail_out))
264 /* TRANSLATORS: this fragment will be interpolated into
265 messages in fh_lock() that identify types of files. */
266 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
271 r->file = fn_open (fh_get_file_name (r->fh), "rb");
274 msg (ME, _("An error occurred while opening \"%s\" for reading "
275 "as a portable file: %s."),
276 fh_get_file_name (r->fh), strerror (errno));
280 /* Read header, version, date info, product id, variables. */
282 read_version_data (r, info);
283 read_variables (r, *dict);
285 /* Read value labels.  Each such record is introduced by `D'. */
286 while (match (r, 'D'))
287 read_value_label (r, *dict);
289 /* Read documents. */
291 read_documents (r, *dict);
293 /* Check that we've made it to the data. */
295 error (r, _("Data record expected."));
/* The case prototype is derived from the finished dictionary
   and tied to the reader's pool. */
297 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
298 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
299 &por_file_casereader_class, r);
303 dict_destroy (*dict);
/* Returns the value of base-30 digit C (0...9 for '0'...'9',
   10...29 for 'A'...'T'), or -1 if C is not a base-30 digit.

   The null character needs its own check: strchr() treats the
   string's terminator as part of the string, so looking up '\0'
   would otherwise "find" it at index 30 and report it as a
   digit.  Characters that the translation table cannot map come
   through as '\0', so this case does occur on real input. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
318 /* Reads a floating point value from R and returns it.
   As parsed below, a number consists of optional leading spaces,
   an optional `-', base-30 digits with at most one `.', an
   optional exponent introduced by `+' or `-' and written in
   base-30 digits, and a terminating `/'.  A leading `*' denotes
   the system-missing value.  Malformed numbers are reported
   through error(), which does not return. */
320 read_float (struct pfm_reader *r)
324 bool got_dot = false; /* Seen a decimal point? */
325 bool got_digit = false; /* Seen any digits? */
326 bool negative = false; /* Number is negative? */
328 /* Skip leading spaces. */
329 while (match (r, ' '))
332 /* `*' indicates system-missing. */
335 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
339 negative = match (r, '-');
342 int digit = base_30_value (r->cc);
347 /* Make sure that multiplication by 30 will not overflow. */
348 if (num > DBL_MAX * (1. / 30.))
349 /* The value of the digit doesn't matter, since we have already
350 gotten as many digits as can be represented in a `double'.
351 This doesn't necessarily mean the result will overflow.
352 The exponent may reduce it to within range.
354 We just need to record that there was another
355 digit so that we can multiply by 30 later. */
358 num = (num * 30.0) + digit;
360 /* Keep track of the number of digits after the decimal point.
361 If we just divided by 30 here, we would lose precision. */
365 else if (!got_dot && r->cc == '.')
366 /* Record that we have found the decimal point. */
369 /* Any other character terminates the number. */
375 /* Check that we had some digits. */
377 error (r, _("Number expected."));
379 /* Get exponent if any. */
380 if (r->cc == '+' || r->cc == '-')
383 bool negative_exponent = r->cc == '-';
386 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
/* Guard the multiplication by 30 below against overflow. */
388 if (exp > LONG_MAX / 30)
393 exp = exp * 30 + digit;
396 /* We don't check whether there were actually any digits. */
398 if (negative_exponent)
403 /* Numbers must end with `/'. */
405 error (r, _("Missing numeric terminator."));
407 /* Multiply `num' by 30 to the `exponent' power, checking for
   overflow. */
410 num *= pow (30.0, (double) exponent);
411 else if (exponent > 0)
/* Equivalent to testing num * 30**exponent > DBL_MAX, but
   without performing the multiplication that might overflow. */
413 if (num > DBL_MAX * pow (30.0, (double) -exponent))
416 num *= pow (30.0, (double) exponent);
419 return negative ? -num : num;
/* Reads a number from R and returns it as an int.  The number
   must have no fractional part and must lie strictly between
   INT_MIN and INT_MAX; anything else is reported as an error,
   which does not return. */
static int
read_int (struct pfm_reader *r)
{
  const double value = read_float (r);
  const bool is_whole = floor (value) == value;
  const bool in_range = value < INT_MAX && value > INT_MIN;

  if (!(is_whole && in_range))
    error (r, _("Invalid integer."));
  return value;
}
432 /* Reads a string into BUF, which must have room for 256
   characters.  The string is preceded in the file by its length,
   which is read with read_int() and must be in the range 0...255;
   any other length is reported as an error, which does not
   return. */
435 read_string (struct pfm_reader *r, char *buf)
437 int n = read_int (r);
438 if (n < 0 || n > 255)
439 error (r, _("Bad string length %d."), n);
450 /* Reads a string into BUF, which must have room for 256
   bytes.
452 Returns the number of bytes read.
   The string is preceded in the file by its length, which is
   read with read_int() and must be in the range 0...255; any
   other length is reported as an error, which does not return. */
455 read_bytes (struct pfm_reader *r, uint8_t *buf)
457 int n = read_int (r);
458 if (n < 0 || n > 255)
459 error (r, _("Bad string length %d."), n);
471 /* Reads a string and returns a copy of it allocated from R's
474 read_pool_string (struct pfm_reader *r)
477 read_string (r, string);
478 return pool_strdup (r->pool, string);
481 /* Reads the 464-byte file header: 200 bytes of splash strings,
   a 256-byte character translation table, and the 8-byte
   signature "SPSSPORT".  Bails out via longjmp() if the signature
   does not match. */
483 read_header (struct pfm_reader *r)
488 /* Read and ignore vanity splash strings. */
489 for (i = 0; i < 200; i++)
492 /* Skip the first 64 characters of the translation table.
493 We don't care about these. They are probably all set to
494 '0', marking them as untranslatable, and that would screw
495 up our actual translation of the real '0'. */
496 for (i = 0; i < 64; i++)
499 /* Read the rest of the translation table.  Entries that are
   never assigned stay zero. */
500 trans = pool_malloc (r->pool, 256);
501 memset (trans, 0, 256);
/* The file's character C stands for portable character I. */
510 trans[c] = portable_to_local[i];
513 /* Set up the translation table, then read the first
514 translated character. */
518 /* Skip and verify signature. */
519 for (i = 0; i < 8; i++)
520 if (!match (r, "SPSSPORT"[i]))
/* Reported with msg() rather than error(): the message says
   that this is not a portable file at all, instead of calling
   it corrupt. */
522 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
523 longjmp (r->bail_out, 1);
527 /* Reads the version and date info record, as well as product and
528 subproduct identification records if present.  Stores the
   reformatted creation date and time, the product, and the
   subproduct into INFO. */
530 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
532 static const char empty_string[] = "";
534 const char *product, *author, *subproduct;
539 error (r, _("Unrecognized version code `%c'."), r->cc);
540 date = read_pool_string (r);
541 time = read_pool_string (r);
/* Each of these records is optional and is introduced by its own
   tag character. */
542 product = match (r, '1') ? read_pool_string (r) : empty_string;
543 author = match (r, '2') ? read_pool_string (r) : empty_string;
544 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
/* The fixed-size index maps below rely on these exact lengths. */
547 if (strlen (date) != 8)
548 error (r, _("Bad date string length %zu."), strlen (date));
549 if (strlen (time) != 6)
550 error (r, _("Bad time string length %zu."), strlen (time));
552 /* Save file info. */
/* Rearrange the date: characters 0-3 go to positions 6-9,
   characters 4-5 to positions 3-4, and characters 6-7 to
   positions 0-1, with spaces at positions 2 and 5.  Assuming the
   file stores YYYYMMDD (TODO confirm), the result reads
   "DD MM YYYY". */
556 for (i = 0; i < 8; i++)
558 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
559 info->creation_date[map[i]] = date[i];
561 info->creation_date[2] = info->creation_date[5] = ' ';
562 info->creation_date[10] = 0;
/* Split the time into three two-character groups separated by
   spaces. */
565 for (i = 0; i < 6; i++)
567 static const int map[] = {0, 1, 3, 4, 6, 7};
568 info->creation_time[map[i]] = time[i];
570 info->creation_time[2] = info->creation_time[5] = ' ';
571 info->creation_time[8] = 0;
574 str_copy_trunc (info->product, sizeof info->product, product);
575 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
579 /* Translates a format specification read from portable file R as
580 the three integers PORTABLE_FORMAT (type code, width, decimals)
581 into a normal format specifier, which is returned,
   checking that the format is appropriate for variable V.
   On the path visible at the end of this function, *REPORT_ERROR
   is set to false and the default format for V's width is
   returned instead. */
582 static struct fmt_spec
583 convert_format (struct pfm_reader *r, const int portable_format[3],
584 struct variable *v, bool *report_error)
586 struct fmt_spec format;
/* The first integer is the portable file's code for the format
   type. */
589 if (!fmt_from_io (portable_format[0], &format.type))
592 warning (r, _("%s: Bad format specifier byte (%d). Variable "
593 "will be assigned a default format."),
594 var_get_name (v), portable_format[0]);
598 format.w = portable_format[1];
599 format.d = portable_format[2];
/* The format must be usable for output and must be compatible
   with V's width. */
602 ok = (fmt_check_output (&format)
603 && fmt_check_width_compat (&format, var_get_width (v)));
610 char fmt_string[FMT_STRING_LEN_MAX + 1];
611 fmt_to_string (&format, fmt_string);
612 if (var_is_numeric (v))
613 warning (r, _("Numeric variable %s has invalid format "
615 var_get_name (v), fmt_string);
617 warning (r, _("String variable %s with width %d has "
618 "invalid format specifier %s."),
619 var_get_name (v), var_get_width (v), fmt_string);
627 *report_error = false;
628 return fmt_default_for_width (var_get_width (v));
631 static void parse_value (struct pfm_reader *, int width, union value *);
633 /* Reads information on all the variables from R and creates them
   in DICT: name, width, print and write formats, missing values,
   and label.  Also sets DICT's weighting variable if the file
   names one.  Problems in the file are reported through error(),
   which does not return. */
635 read_variables (struct pfm_reader *r, struct dictionary *dict)
637 char *weight_name = NULL;
641 error (r, _("Expected variable count record."));
643 r->var_cnt = read_int (r);
645 error (r, _("Invalid number of variables %d."), r->var_cnt);
647 /* Purpose of this value is unknown. It is typically 161. */
652 weight_name = read_pool_string (r);
653 if (strlen (weight_name) > SHORT_NAME_LEN)
654 error (r, _("Weight variable name (%s) truncated."), weight_name);
657 for (i = 0; i < r->var_cnt; i++)
663 struct missing_values miss;
664 struct fmt_spec print, write;
665 bool report_error = true;
669 error (r, _("Expected variable record."));
671 width = read_int (r);
673 error (r, _("Invalid variable width %d."), width);
675 read_string (r, name);
/* Six integers follow: the first three describe the print format
   and the last three the write format. */
676 for (j = 0; j < 6; j++)
677 fmt[j] = read_int (r);
679 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
680 error (r, _("Invalid variable name `%s' in position %d."), name, i);
681 str_uppercase (name);
683 if (width < 0 || width > 255)
684 error (r, _("Bad width %d for variable %s."), width, name);
686 v = dict_create_var (dict, name, width);
/* Try alternative names of the form NAME_1, NAME_2, and so on.
   NOTE(review): this counter is named `i', like the index of the
   enclosing per-variable loop.  Unless a nested declaration (not
   visible in this view) shadows it, this loop clobbers the outer
   index; either way, the "position" printed by the two messages
   below would be this counter rather than the variable's
   position -- confirm. */
690 for (i = 1; i < 100000; i++)
692 char try_name[VAR_NAME_LEN + 1];
/* The base name is cut to VAR_NAME_LEN - 6 bytes so that `_'
   plus at most five digits still fits in TRY_NAME. */
693 sprintf (try_name, "%.*s_%d", VAR_NAME_LEN - 6, name, i);
694 v = dict_create_var (dict, try_name, width);
699 error (r, _("Duplicate variable name %s in position %d."), name, i);
700 warning (r, _("Duplicate variable name %s in position %d renamed "
701 "to %s."), name, i, var_get_name (v));
704 print = convert_format (r, &fmt[0], v, &report_error);
705 write = convert_format (r, &fmt[3], v, &report_error);
706 var_set_print_format (v, &print);
707 var_set_write_format (v, &write);
709 /* Range missing values. */
710 mv_init (&miss, width);
713 double x = read_float (r);
714 double y = read_float (r);
715 mv_add_range (&miss, x, y);
/* `A' introduces a range that is open at the high end. */
717 else if (match (r, 'A'))
718 mv_add_range (&miss, read_float (r), HIGHEST);
/* `9' introduces a range that is open at the low end. */
719 else if (match (r, '9'))
720 mv_add_range (&miss, LOWEST, read_float (r));
722 /* Single missing values.  Each is introduced by `8'. */
723 while (match (r, '8'))
/* The value is parsed at a width of at most 8 and then resized
   to the variable's full width before it is added. */
725 int mv_width = MIN (width, 8);
728 parse_value (r, mv_width, &value);
729 value_resize (&value, mv_width, width);
730 mv_add_value (&miss, &value);
731 value_destroy (&value, width);
734 var_set_missing_values (v, &miss);
740 read_string (r, label);
741 var_set_label (v, label);
/* The weighting variable can only be looked up now that all of
   the variables have been created. */
745 if (weight_name != NULL)
747 struct variable *weight_var = dict_lookup_var (dict, weight_name);
748 if (weight_var == NULL)
749 error (r, _("Weighting variable %s not present in dictionary."),
752 dict_set_weight (dict, weight_var);
756 /* Parses a value of width WIDTH from R into value V.
   V is initialized here for WIDTH.  String data is read with
   read_bytes() and padded on the right with spaces to WIDTH;
   numeric data is read with read_float(). */
758 parse_value (struct pfm_reader *r, int width, union value *v)
760 value_init (v, width);
764 size_t n_bytes = read_bytes (r, buf);
765 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
768 v->f = read_float (r);
771 /* Parses a value label record from R and applies its labels to
   the variables it names, which are looked up in DICT.  Problems
   are reported through error(), which does not return. */
773 read_value_label (struct pfm_reader *r, struct dictionary *dict)
/* First comes the list of variables that the labels apply to. */
785 v = pool_nalloc (r->pool, nv, sizeof *v);
786 for (i = 0; i < nv; i++)
789 read_string (r, name);
791 v[i] = dict_lookup_var (dict, name);
793 error (r, _("Unknown variable %s while parsing value labels."), name);
/* NOTE(review): only the variable types are compared here, yet
   each value below is parsed at the width of v[0] and then
   applied to every v[j].  String variables of different widths
   pass this check -- confirm that var_replace_value_label()
   copes with a width mismatch. */
795 if (var_get_type (v[0]) != var_get_type (v[i]))
796 error (r, _("Cannot assign value labels to %s and %s, which "
797 "have different variable types."),
798 var_get_name (v[0]), var_get_name (v[i]));
/* Then come the value/label pairs themselves. */
801 n_labels = read_int (r);
802 for (i = 0; i < n_labels; i++)
808 parse_value (r, var_get_width (v[0]), &val);
809 read_string (r, label);
811 /* Assign the value label to each variable. */
812 for (j = 0; j < nv; j++)
813 var_replace_value_label (v[j], &val, label);
815 value_destroy (&val, var_get_width (v[0]));
/* Reads a document record from portable file R: a line count,
   then that many strings, each of which is appended to DICT's
   documents in order. */
static void
read_documents (struct pfm_reader *r, struct dictionary *dict)
{
  int remaining;

  for (remaining = read_int (r); remaining > 0; remaining--)
    {
      char text[256];

      read_string (r, text);
      dict_add_document_line (dict, text);
    }
}
835 /* Reads and returns one case from portable file R_, on behalf of
   casereader READER. Returns a
836 null pointer on failure. */
837 static struct ccase *
838 por_file_casereader_read (struct casereader *reader, void *r_)
840 struct pfm_reader *r = r_;
841 struct ccase *volatile c;
844 c = case_create (r->proto);
/* Re-arm the bail-out target for this call: an error() raised by
   read_float() or read_bytes() below longjmp()s back to here.
   The return value of setjmp() is ignored.
   NOTE(review): presumably the code that follows tells the two
   cases apart through reader state such as R->ok; that test is
   not visible in this view -- confirm. */
845 setjmp (r->bail_out);
848 casereader_force_error (reader);
853 /* Check for end of file. */
/* One value per variable, in case prototype order. */
860 for (i = 0; i < r->var_cnt; i++)
862 int width = caseproto_get_width (r->proto, i);
865 case_data_rw_idx (c, i)->f = read_float (r);
/* String data is padded on the right with spaces to the
   variable's width. */
869 size_t n_bytes = read_bytes (r, buf);
870 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
877 /* Returns true if FILE is an SPSS portable file,
   false otherwise.  Works on the first 464 bytes of the file
   after line padding: it builds the character translation table
   from them and checks for the translated signature "SPSSPORT". */
880 pfm_detect (FILE *file)
882 unsigned char header[464];
884 int cooked_cnt, raw_cnt, line_len;
/* COOKED_CNT counts header bytes stored so far; RAW_CNT counts
   characters actually read from FILE. */
887 cooked_cnt = raw_cnt = 0;
889 while (cooked_cnt < sizeof header)
/* Give up at end of file, or once more than 512 raw characters
   have been read. */
892 if (c == EOF || raw_cnt++ > 512)
/* Pad a short line with spaces up to 80 characters. */
896 while (line_len < 80 && cooked_cnt < sizeof header)
898 header[cooked_cnt++] = ' ';
905 header[cooked_cnt++] = c;
/* The translation table occupies header bytes 200...455.  As in
   read_header(), its first 64 entries are not used. */
910 memset (trans, 0, 256);
911 for (i = 64; i < 256; i++)
913 unsigned char c = header[i + 200];
915 trans[c] = portable_to_local[i];
/* The signature occupies the last 8 header bytes, 456...463. */
918 for (i = 0; i < 8; i++)
919 if (trans[header[i + 456]] != "SPSSPORT"[i])
925 static const struct casereader_class por_file_casereader_class =
927 por_file_casereader_read,
928 por_file_casereader_destroy,