1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/short-names.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/pool.h>
44 #include <libpspp/str.h>
50 #define _(msgid) gettext (msgid)
51 #define N_(msgid) (msgid)
53 /* portable_to_local[PORTABLE] translates the given portable
54 character into the local character set. */
55 static const char portable_to_local[256] =
58 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
59 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
63 /* Portable file reader. */
66 struct pool *pool; /* All the portable file state. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
70 struct file_handle *fh; /* File handle. */
71 struct fh_lock *lock; /* Read lock for file. */
72 FILE *file; /* File stream. */
73 int line_length; /* Number of characters so far on this line. */
74 char cc; /* Current character. */
75 char *trans; /* 256-byte character set translation table. */
76 int var_cnt; /* Number of variables. */
77 int weight_index; /* 0-based index of weight variable, or -1. */
78 struct caseproto *proto; /* Format of output cases. */
79 bool ok; /* Set false on I/O error. */
82 static const struct casereader_class por_file_casereader_class;
85 error (struct pfm_reader *r, const char *msg,...)
89 /* Displays MSG as an error message and aborts reading the
90 portable file via longjmp(). */
92 error (struct pfm_reader *r, const char *msg, ...)
98 ds_init_empty (&text);
99 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
100 fh_get_file_name (r->fh), ftell (r->file));
101 va_start (args, msg);
102 ds_put_vformat (&text, msg, args);
105 m.category = MSG_GENERAL;
106 m.severity = MSG_ERROR;
107 m.where.file_name = NULL;
108 m.where.line_number = 0;
109 m.text = ds_cstr (&text);
115 longjmp (r->bail_out, 1);
118 /* Displays MSG as a warning for the current position in
119 portable file reader R. */
121 warning (struct pfm_reader *r, const char *msg, ...)
127 ds_init_empty (&text);
128 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
129 fh_get_file_name (r->fh), ftell (r->file));
130 va_start (args, msg);
131 ds_put_vformat (&text, msg, args);
134 m.category = MSG_GENERAL;
135 m.severity = MSG_WARNING;
136 m.where.file_name = NULL;
137 m.where.line_number = 0;
138 m.text = ds_cstr (&text);
143 /* Close and destroy R.
144 Returns false if an error was detected on R, true otherwise. */
146 close_reader (struct pfm_reader *r)
154 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
156 msg (ME, _("Error closing portable file \"%s\": %s."),
157 fh_get_file_name (r->fh), strerror (errno));
167 pool_destroy (r->pool);
/* Casereader "destroy" callback for portable file readers.
   R_ is the struct pfm_reader to close and free.  If closing it
   reports that an error was detected, READER is forced into its
   error state. */
static void
por_file_casereader_destroy (struct casereader *reader, void *r_)
{
  struct pfm_reader *pfm = r_;

  if (close_reader (pfm))
    return;
  casereader_force_error (reader);
}
181 /* Read a single character into R's current character, cc. */
183 advance (struct pfm_reader *r)
187 /* Read the next character from the file.
188 Ignore carriage returns entirely.
189 Mostly ignore new-lines, but if a new-line occurs before the
190 line has reached 80 bytes in length, then treat the
191 "missing" bytes as spaces. */
194 while ((c = getc (r->file)) == '\r')
199 if (r->line_length < 80)
202 ungetc ('\n', r->file);
208 error (r, _("unexpected end of file"));
210 if (r->trans != NULL)
216 /* Skip a single character if present, and return whether it was
219 match (struct pfm_reader *r, int c)
230 static void read_header (struct pfm_reader *);
231 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
232 static void read_variables (struct pfm_reader *, struct dictionary *);
233 static void read_value_label (struct pfm_reader *, struct dictionary *);
234 static void read_documents (struct pfm_reader *, struct dictionary *);
236 /* Reads the dictionary from file with handle FH, and returns it in a
237 dictionary structure. This dictionary may be modified in order to
238 rename, reorder, and delete variables, etc. */
240 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
241 struct pfm_read_info *info)
243 struct pool *volatile pool = NULL;
244 struct pfm_reader *volatile r = NULL;
246 *dict = dict_create ();
248 /* Create and initialize reader. */
249 pool = pool_create ();
250 r = pool_alloc (pool, sizeof *r);
256 r->weight_index = -1;
261 if (setjmp (r->bail_out))
265 /* TRANSLATORS: this fragment will be interpolated into
266 messages in fh_lock() that identify types of files. */
267 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
272 r->file = fn_open (fh_get_file_name (r->fh), "rb");
275 msg (ME, _("An error occurred while opening \"%s\" for reading "
276 "as a portable file: %s."),
277 fh_get_file_name (r->fh), strerror (errno));
281 /* Read header, version, date info, product id, variables. */
283 read_version_data (r, info);
284 read_variables (r, *dict);
286 /* Read value labels. */
287 while (match (r, 'D'))
288 read_value_label (r, *dict);
290 /* Read documents. */
292 read_documents (r, *dict);
294 /* Check that we've made it to the data. */
296 error (r, _("Data record expected."));
298 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
299 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
300 &por_file_casereader_class, r);
304 dict_destroy (*dict);
/* Returns the value of base-30 digit C (0...29, using the digits
   `0'...`9' followed by `A'...`T'),
   or -1 if C is not a base-30 digit. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() considers the terminating null byte part of the
     string, so without this check a null C would "match" at
     offset 30 and be reported as a digit. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
319 /* Read a floating point value and return its value. */
321 read_float (struct pfm_reader *r)
325 bool got_dot = false; /* Seen a decimal point? */
326 bool got_digit = false; /* Seen any digits? */
327 bool negative = false; /* Number is negative? */
329 /* Skip leading spaces. */
330 while (match (r, ' '))
333 /* `*' indicates system-missing. */
336 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
340 negative = match (r, '-');
343 int digit = base_30_value (r->cc);
348 /* Make sure that multiplication by 30 will not overflow. */
349 if (num > DBL_MAX * (1. / 30.))
350 /* The value of the digit doesn't matter, since we have already
351 gotten as many digits as can be represented in a `double'.
352 This doesn't necessarily mean the result will overflow.
353 The exponent may reduce it to within range.
355 We just need to record that there was another
356 digit so that we can multiply by 30 later. */
359 num = (num * 30.0) + digit;
361 /* Keep track of the number of digits after the decimal point.
362 If we just divided by 30 here, we would lose precision. */
366 else if (!got_dot && r->cc == '.')
367 /* Record that we have found the decimal point. */
370 /* Any other character terminates the number. */
376 /* Check that we had some digits. */
378 error (r, _("Number expected."));
380 /* Get exponent if any. */
381 if (r->cc == '+' || r->cc == '-')
384 bool negative_exponent = r->cc == '-';
387 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
389 if (exp > LONG_MAX / 30)
394 exp = exp * 30 + digit;
397 /* We don't check whether there were actually any digits, but we
399 if (negative_exponent)
404 /* Numbers must end with `/'. */
406 error (r, _("Missing numeric terminator."));
408 /* Multiply `num' by 30 to the `exponent' power, checking for
411 num *= pow (30.0, (double) exponent);
412 else if (exponent > 0)
414 if (num > DBL_MAX * pow (30.0, (double) -exponent))
417 num *= pow (30.0, (double) exponent);
420 return negative ? -num : num;
423 /* Read an integer and return its value. */
425 read_int (struct pfm_reader *r)
427 double f = read_float (r);
428 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
429 error (r, _("Invalid integer."));
433 /* Reads a string into BUF, which must have room for 256
436 read_string (struct pfm_reader *r, char *buf)
438 int n = read_int (r);
439 if (n < 0 || n > 255)
440 error (r, _("Bad string length %d."), n);
451 /* Reads a string into BUF, which must have room for 256
453 Returns the number of bytes read.
456 read_bytes (struct pfm_reader *r, uint8_t *buf)
458 int n = read_int (r);
459 if (n < 0 || n > 255)
460 error (r, _("Bad string length %d."), n);
472 /* Reads a string and returns a copy of it allocated from R's
475 read_pool_string (struct pfm_reader *r)
478 read_string (r, string);
479 return pool_strdup (r->pool, string);
482 /* Reads the 464-byte file header. */
484 read_header (struct pfm_reader *r)
489 /* Read and ignore vanity splash strings. */
490 for (i = 0; i < 200; i++)
493 /* Skip the first 64 characters of the translation table.
494 We don't care about these. They are probably all set to
495 '0', marking them as untranslatable, and that would screw
496 up our actual translation of the real '0'. */
497 for (i = 0; i < 64; i++)
500 /* Read the rest of the translation table. */
501 trans = pool_malloc (r->pool, 256);
502 memset (trans, 0, 256);
511 trans[c] = portable_to_local[i];
514 /* Set up the translation table, then read the first
515 translated character. */
519 /* Skip and verify signature. */
520 for (i = 0; i < 8; i++)
521 if (!match (r, "SPSSPORT"[i]))
523 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
524 longjmp (r->bail_out, 1);
528 /* Reads the version and date info record, as well as product and
529 subproduct identification records if present. */
531 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
533 static const char empty_string[] = "";
535 const char *product, *author, *subproduct;
540 error (r, _("Unrecognized version code `%c'."), r->cc);
541 date = read_pool_string (r);
542 time = read_pool_string (r);
543 product = match (r, '1') ? read_pool_string (r) : empty_string;
544 author = match (r, '2') ? read_pool_string (r) : empty_string;
545 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
548 if (strlen (date) != 8)
549 error (r, _("Bad date string length %zu."), strlen (date));
550 if (strlen (time) != 6)
551 error (r, _("Bad time string length %zu."), strlen (time));
553 /* Save file info. */
557 for (i = 0; i < 8; i++)
559 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
560 info->creation_date[map[i]] = date[i];
562 info->creation_date[2] = info->creation_date[5] = ' ';
563 info->creation_date[10] = 0;
566 for (i = 0; i < 6; i++)
568 static const int map[] = {0, 1, 3, 4, 6, 7};
569 info->creation_time[map[i]] = time[i];
571 info->creation_time[2] = info->creation_time[5] = ' ';
572 info->creation_time[8] = 0;
575 str_copy_trunc (info->product, sizeof info->product, product);
576 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
580 /* Translates a format specification read from portable file R as
581 the three integers PORTABLE_FORMAT into a normal format specifier,
582 which is returned, checking that the format is appropriate for variable V. */
583 static struct fmt_spec
584 convert_format (struct pfm_reader *r, const int portable_format[3],
585 struct variable *v, bool *report_error)
587 struct fmt_spec format;
590 if (!fmt_from_io (portable_format[0], &format.type))
593 warning (r, _("%s: Bad format specifier byte (%d). Variable "
594 "will be assigned a default format."),
595 var_get_name (v), portable_format[0]);
599 format.w = portable_format[1];
600 format.d = portable_format[2];
603 ok = (fmt_check_output (&format)
604 && fmt_check_width_compat (&format, var_get_width (v)));
611 char fmt_string[FMT_STRING_LEN_MAX + 1];
612 fmt_to_string (&format, fmt_string);
613 if (var_is_numeric (v))
614 warning (r, _("Numeric variable %s has invalid format "
616 var_get_name (v), fmt_string);
618 warning (r, _("String variable %s with width %d has "
619 "invalid format specifier %s."),
620 var_get_name (v), var_get_width (v), fmt_string);
628 *report_error = false;
629 return fmt_default_for_width (var_get_width (v));
632 static void parse_value (struct pfm_reader *, int width, union value *);
634 /* Read information on all the variables. */
636 read_variables (struct pfm_reader *r, struct dictionary *dict)
638 char *weight_name = NULL;
642 error (r, _("Expected variable count record."));
644 r->var_cnt = read_int (r);
646 error (r, _("Invalid number of variables %d."), r->var_cnt);
648 /* Purpose of this value is unknown. It is typically 161. */
653 weight_name = read_pool_string (r);
654 if (strlen (weight_name) > SHORT_NAME_LEN)
655 error (r, _("Weight variable name (%s) truncated."), weight_name);
658 for (i = 0; i < r->var_cnt; i++)
664 struct missing_values miss;
665 struct fmt_spec print, write;
666 bool report_error = true;
670 error (r, _("Expected variable record."));
672 width = read_int (r);
674 error (r, _("Invalid variable width %d."), width);
676 read_string (r, name);
677 for (j = 0; j < 6; j++)
678 fmt[j] = read_int (r);
680 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
681 error (r, _("Invalid variable name `%s' in position %d."), name, i);
682 str_uppercase (name);
684 if (width < 0 || width > 255)
685 error (r, _("Bad width %d for variable %s."), width, name);
687 v = dict_create_var (dict, name, width);
691 for (i = 1; i < 100000; i++)
693 char try_name[VAR_NAME_LEN + 1];
694 sprintf (try_name, "%.*s_%d", VAR_NAME_LEN - 6, name, i);
695 v = dict_create_var (dict, try_name, width);
700 error (r, _("Duplicate variable name %s in position %d."), name, i);
701 warning (r, _("Duplicate variable name %s in position %d renamed "
702 "to %s."), name, i, var_get_name (v));
705 print = convert_format (r, &fmt[0], v, &report_error);
706 write = convert_format (r, &fmt[3], v, &report_error);
707 var_set_print_format (v, &print);
708 var_set_write_format (v, &write);
710 /* Range missing values. */
711 mv_init (&miss, width);
714 double x = read_float (r);
715 double y = read_float (r);
716 mv_add_range (&miss, x, y);
718 else if (match (r, 'A'))
719 mv_add_range (&miss, read_float (r), HIGHEST);
720 else if (match (r, '9'))
721 mv_add_range (&miss, LOWEST, read_float (r));
723 /* Single missing values. */
724 while (match (r, '8'))
726 int mv_width = MIN (width, 8);
729 parse_value (r, mv_width, &value);
730 value_resize (&value, mv_width, width);
731 mv_add_value (&miss, &value);
732 value_destroy (&value, width);
735 var_set_missing_values (v, &miss);
741 read_string (r, label);
742 var_set_label (v, label);
746 if (weight_name != NULL)
748 struct variable *weight_var = dict_lookup_var (dict, weight_name);
749 if (weight_var == NULL)
750 error (r, _("Weighting variable %s not present in dictionary."),
753 dict_set_weight (dict, weight_var);
757 /* Parse a value of width WIDTH into value V. */
759 parse_value (struct pfm_reader *r, int width, union value *v)
761 value_init (v, width);
765 size_t n_bytes = read_bytes (r, buf);
766 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
769 v->f = read_float (r);
772 /* Parse a value label record and apply its labels to the named variables in DICT. */
774 read_value_label (struct pfm_reader *r, struct dictionary *dict)
786 v = pool_nalloc (r->pool, nv, sizeof *v);
787 for (i = 0; i < nv; i++)
790 read_string (r, name);
792 v[i] = dict_lookup_var (dict, name);
794 error (r, _("Unknown variable %s while parsing value labels."), name);
796 if (var_get_type (v[0]) != var_get_type (v[i]))
797 error (r, _("Cannot assign value labels to %s and %s, which "
798 "have different variable types."),
799 var_get_name (v[0]), var_get_name (v[i]));
802 n_labels = read_int (r);
803 for (i = 0; i < n_labels; i++)
809 parse_value (r, var_get_width (v[0]), &val);
810 read_string (r, label);
812 /* Assign the value label to each variable. */
813 for (j = 0; j < nv; j++)
814 var_replace_value_label (v[j], &val, label);
816 value_destroy (&val, var_get_width (v[0]));
820 /* Reads a set of documents from portable file R into DICT. */
822 read_documents (struct pfm_reader *r, struct dictionary *dict)
827 line_cnt = read_int (r);
828 for (i = 0; i < line_cnt; i++)
831 read_string (r, line);
832 dict_add_document_line (dict, line);
836 /* Reads and returns one case from portable file R. Returns a
837 null pointer on failure. */
838 static struct ccase *
839 por_file_casereader_read (struct casereader *reader, void *r_)
841 struct pfm_reader *r = r_;
842 struct ccase *volatile c;
845 c = case_create (r->proto);
846 setjmp (r->bail_out);
849 casereader_force_error (reader);
854 /* Check for end of file. */
861 for (i = 0; i < r->var_cnt; i++)
863 int width = caseproto_get_width (r->proto, i);
866 case_data_rw_idx (c, i)->f = read_float (r);
870 size_t n_bytes = read_bytes (r, buf);
871 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
878 /* Returns true if FILE is an SPSS portable file,
881 pfm_detect (FILE *file)
883 unsigned char header[464];
885 int cooked_cnt, raw_cnt, line_len;
888 cooked_cnt = raw_cnt = 0;
890 while (cooked_cnt < sizeof header)
893 if (c == EOF || raw_cnt++ > 512)
897 while (line_len < 80 && cooked_cnt < sizeof header)
899 header[cooked_cnt++] = ' ';
906 header[cooked_cnt++] = c;
911 memset (trans, 0, 256);
912 for (i = 64; i < 256; i++)
914 unsigned char c = header[i + 200];
916 trans[c] = portable_to_local[i];
919 for (i = 0; i < 8; i++)
920 if (trans[header[i + 456]] != "SPSSPORT"[i])
926 static const struct casereader_class por_file_casereader_class =
928 por_file_casereader_read,
929 por_file_casereader_destroy,