1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/short-names.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <libpspp/misc.h>
42 #include <libpspp/pool.h>
43 #include <libpspp/str.h>
45 #include "gl/intprops.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
50 #define _(msgid) gettext (msgid)
51 #define N_(msgid) (msgid)
53 /* portable_to_local[PORTABLE] translates the given portable
54 character into the local character set. */
55 static const char portable_to_local[256] =
58 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
59 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
63 /* Portable file reader. */
66 struct pool *pool; /* All the portable file state. */
68 jmp_buf bail_out; /* longjmp() target for error handling. */
70 struct file_handle *fh; /* File handle. */
71 struct fh_lock *lock; /* Read lock for file. */
72 FILE *file; /* File stream. */
73 int line_length; /* Number of characters so far on this line. */
74 char cc; /* Current character. */
75 char *trans; /* 256-byte character set translation table. */
76 int var_cnt; /* Number of variables. */
77 int weight_index; /* 0-based index of weight variable, or -1. */
78 struct caseproto *proto; /* Format of output cases. */
79 bool ok; /* Set false on I/O error. */
82 static const struct casereader_class por_file_casereader_class;
85 error (struct pfm_reader *r, const char *msg,...)
89 /* Displays MSG as an error message and aborts reading the
90 portable file via longjmp(). */
92 error (struct pfm_reader *r, const char *msg, ...)
98 ds_init_empty (&text);
99 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
100 fh_get_file_name (r->fh), (long long int) ftello (r->file));
101 va_start (args, msg);
102 ds_put_vformat (&text, msg, args);
105 m.category = MSG_C_GENERAL;
106 m.severity = MSG_S_ERROR;
107 m.where.file_name = NULL;
108 m.where.line_number = 0;
109 m.where.first_column = 0;
110 m.where.last_column = 0;
111 m.text = ds_cstr (&text);
117 longjmp (r->bail_out, 1);
120 /* Displays MSG as a warning for the current position in
121 portable file reader R. */
123 warning (struct pfm_reader *r, const char *msg, ...)
129 ds_init_empty (&text);
130 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
131 fh_get_file_name (r->fh), (long long int) ftello (r->file));
132 va_start (args, msg);
133 ds_put_vformat (&text, msg, args);
136 m.category = MSG_C_GENERAL;
137 m.severity = MSG_S_WARNING;
138 m.where.file_name = NULL;
139 m.where.line_number = 0;
140 m.where.first_column = 0;
141 m.where.last_column = 0;
142 m.text = ds_cstr (&text);
147 /* Close and destroy R.
148 Returns false if an error was detected on R, true otherwise. */
150 close_reader (struct pfm_reader *r)
158 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
160 msg (ME, _("Error closing portable file `%s': %s."),
161 fh_get_file_name (r->fh), strerror (errno));
171 pool_destroy (r->pool);
176 /* Closes portable file reader R, after we're done with it. */
178 por_file_casereader_destroy (struct casereader *reader, void *r_)
180 struct pfm_reader *r = r_;
181 if (!close_reader (r))
182 casereader_force_error (reader);
185 /* Read a single character into R's current character (the `cc' member). */
187 advance (struct pfm_reader *r)
191 /* Read the next character from the file.
192 Ignore carriage returns entirely.
193 Mostly ignore new-lines, but if a new-line occurs before the
194 line has reached 80 bytes in length, then treat the
195 "missing" bytes as spaces. */
198 while ((c = getc (r->file)) == '\r')
203 if (r->line_length < 80)
206 ungetc ('\n', r->file);
212 error (r, _("unexpected end of file"));
214 if (r->trans != NULL)
220 /* Skip a single character if present, and return whether it was
223 match (struct pfm_reader *r, int c)
234 static void read_header (struct pfm_reader *);
235 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
236 static void read_variables (struct pfm_reader *, struct dictionary *);
237 static void read_value_label (struct pfm_reader *, struct dictionary *);
238 static void read_documents (struct pfm_reader *, struct dictionary *);
240 /* Reads the dictionary from the file with handle FH, and returns it in
241 *DICT, a newly created dictionary structure. This dictionary may be
242 modified in order to rename, reorder, and delete variables, etc. */
244 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
245 struct pfm_read_info *info)
247 struct pool *volatile pool = NULL;
248 struct pfm_reader *volatile r = NULL;
250 *dict = dict_create ();
252 /* Create and initialize reader. */
253 pool = pool_create ();
254 r = pool_alloc (pool, sizeof *r);
260 r->weight_index = -1;
265 if (setjmp (r->bail_out))
269 /* TRANSLATORS: this fragment will be interpolated into
270 messages in fh_lock() that identify types of files. */
271 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
276 r->file = fn_open (fh_get_file_name (r->fh), "rb");
279 msg (ME, _("An error occurred while opening `%s' for reading "
280 "as a portable file: %s."),
281 fh_get_file_name (r->fh), strerror (errno));
285 /* Read header, version, date info, product id, variables. */
287 read_version_data (r, info);
288 read_variables (r, *dict);
290 /* Read value labels. */
291 while (match (r, 'D'))
292 read_value_label (r, *dict);
294 /* Read documents. */
296 read_documents (r, *dict);
298 /* Check that we've made it to the data. */
300 error (r, _("Data record expected."));
302 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
303 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
304 &por_file_casereader_class, r);
308 dict_destroy (*dict);
/* Returns the value of base-30 digit C (0...9 for `0'...`9',
   10...29 for `A'...`T'), or -1 if C is not a base-30 digit. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() considers the terminating null byte to be part of
     the string, so searching for 0 would "find" the terminator
     and report a bogus digit value of 30.  Reject it up front. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
323 /* Read a floating point value and return its value. */
325 read_float (struct pfm_reader *r)
329 bool got_dot = false; /* Seen a decimal point? */
330 bool got_digit = false; /* Seen any digits? */
331 bool negative = false; /* Number is negative? */
333 /* Skip leading spaces. */
334 while (match (r, ' '))
337 /* `*' indicates system-missing. */
340 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
344 negative = match (r, '-');
347 int digit = base_30_value (r->cc);
352 /* Make sure that multiplication by 30 will not overflow. */
353 if (num > DBL_MAX * (1. / 30.))
354 /* The value of the digit doesn't matter, since we have already
355 gotten as many digits as can be represented in a `double'.
356 This doesn't necessarily mean the result will overflow.
357 The exponent may reduce it to within range.
359 We just need to record that there was another
360 digit so that we can multiply by 30 later. */
363 num = (num * 30.0) + digit;
365 /* Keep track of the number of digits after the decimal point.
366 If we just divided by 30 here, we would lose precision. */
370 else if (!got_dot && r->cc == '.')
371 /* Record that we have found the decimal point. */
374 /* Any other character terminates the number. */
380 /* Check that we had some digits. */
382 error (r, _("Number expected."));
384 /* Get exponent if any. */
385 if (r->cc == '+' || r->cc == '-')
388 bool negative_exponent = r->cc == '-';
391 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
393 if (exp > LONG_MAX / 30)
398 exp = exp * 30 + digit;
401 /* We don't check whether there were actually any digits, but we
403 if (negative_exponent)
408 /* Numbers must end with `/'. */
410 error (r, _("Missing numeric terminator."));
412 /* Multiply `num' by 30 to the `exponent' power, checking for
415 num *= pow (30.0, (double) exponent);
416 else if (exponent > 0)
418 if (num > DBL_MAX * pow (30.0, (double) -exponent))
421 num *= pow (30.0, (double) exponent);
424 return negative ? -num : num;
427 /* Read an integer and return its value. */
429 read_int (struct pfm_reader *r)
431 double f = read_float (r);
432 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
433 error (r, _("Invalid integer."));
437 /* Reads a string into BUF, which must have room for 256
440 read_string (struct pfm_reader *r, char *buf)
442 int n = read_int (r);
443 if (n < 0 || n > 255)
444 error (r, _("Bad string length %d."), n);
455 /* Reads a string into BUF, which must have room for 256
457 Returns the number of bytes read.
460 read_bytes (struct pfm_reader *r, uint8_t *buf)
462 int n = read_int (r);
463 if (n < 0 || n > 255)
464 error (r, _("Bad string length %d."), n);
476 /* Reads a string and returns a copy of it allocated from R's
479 read_pool_string (struct pfm_reader *r)
482 read_string (r, string);
483 return pool_strdup (r->pool, string);
486 /* Reads the 464-byte file header. */
488 read_header (struct pfm_reader *r)
493 /* Read and ignore vanity splash strings. */
494 for (i = 0; i < 200; i++)
497 /* Skip the first 64 characters of the translation table.
498 We don't care about these. They are probably all set to
499 '0', marking them as untranslatable, and that would screw
500 up our actual translation of the real '0'. */
501 for (i = 0; i < 64; i++)
504 /* Read the rest of the translation table. */
505 trans = pool_malloc (r->pool, 256);
506 memset (trans, 0, 256);
515 trans[c] = portable_to_local[i];
518 /* Set up the translation table, then read the first
519 translated character. */
523 /* Skip and verify signature. */
524 for (i = 0; i < 8; i++)
525 if (!match (r, "SPSSPORT"[i]))
527 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
528 longjmp (r->bail_out, 1);
532 /* Reads the version and date info record, as well as product and
533 subproduct identification records if present. */
535 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
537 static const char empty_string[] = "";
539 const char *product, *author, *subproduct;
544 error (r, _("Unrecognized version code `%c'."), r->cc);
545 date = read_pool_string (r);
546 time = read_pool_string (r);
547 product = match (r, '1') ? read_pool_string (r) : empty_string;
548 author = match (r, '2') ? read_pool_string (r) : empty_string;
549 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
552 if (strlen (date) != 8)
553 error (r, _("Bad date string length %zu."), strlen (date));
554 if (strlen (time) != 6)
555 error (r, _("Bad time string length %zu."), strlen (time));
557 /* Save file info. */
561 for (i = 0; i < 8; i++)
563 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
564 info->creation_date[map[i]] = date[i];
566 info->creation_date[2] = info->creation_date[5] = ' ';
567 info->creation_date[10] = 0;
570 for (i = 0; i < 6; i++)
572 static const int map[] = {0, 1, 3, 4, 6, 7};
573 info->creation_time[map[i]] = time[i];
575 info->creation_time[2] = info->creation_time[5] = ' ';
576 info->creation_time[8] = 0;
579 str_copy_trunc (info->product, sizeof info->product, product);
580 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
584 /* Translates a format specification read from portable file R as
585 the three integers PORTABLE_FORMAT into a normal format specifier,
586 which is returned, checking that the format is appropriate for variable V. */
587 static struct fmt_spec
588 convert_format (struct pfm_reader *r, const int portable_format[3],
589 struct variable *v, bool *report_error)
591 struct fmt_spec format;
594 if (!fmt_from_io (portable_format[0], &format.type))
597 warning (r, _("%s: Bad format specifier byte (%d). Variable "
598 "will be assigned a default format."),
599 var_get_name (v), portable_format[0]);
603 format.w = portable_format[1];
604 format.d = portable_format[2];
607 ok = (fmt_check_output (&format)
608 && fmt_check_width_compat (&format, var_get_width (v)));
615 char fmt_string[FMT_STRING_LEN_MAX + 1];
616 fmt_to_string (&format, fmt_string);
617 if (var_is_numeric (v))
618 warning (r, _("Numeric variable %s has invalid format "
620 var_get_name (v), fmt_string);
622 warning (r, _("String variable %s with width %d has "
623 "invalid format specifier %s."),
624 var_get_name (v), var_get_width (v), fmt_string);
632 *report_error = false;
633 return fmt_default_for_width (var_get_width (v));
636 static void parse_value (struct pfm_reader *, int width, union value *);
638 /* Read information on all the variables. */
640 read_variables (struct pfm_reader *r, struct dictionary *dict)
642 char *weight_name = NULL;
646 error (r, _("Expected variable count record."));
648 r->var_cnt = read_int (r);
650 error (r, _("Invalid number of variables %d."), r->var_cnt);
652 /* Purpose of this value is unknown. It is typically 161. */
657 weight_name = read_pool_string (r);
658 if (strlen (weight_name) > SHORT_NAME_LEN)
659 error (r, _("Weight variable name (%s) truncated."), weight_name);
662 for (i = 0; i < r->var_cnt; i++)
668 struct missing_values miss;
669 struct fmt_spec print, write;
670 bool report_error = true;
674 error (r, _("Expected variable record."));
676 width = read_int (r);
678 error (r, _("Invalid variable width %d."), width);
680 read_string (r, name);
681 for (j = 0; j < 6; j++)
682 fmt[j] = read_int (r);
684 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
685 error (r, _("Invalid variable name `%s' in position %d."), name, i);
686 str_uppercase (name);
688 if (width < 0 || width > 255)
689 error (r, _("Bad width %d for variable %s."), width, name);
691 v = dict_create_var (dict, name, width);
697 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
698 sprintf (try_name, "%s_%lu", name, i);
699 v = dict_create_var (dict, try_name, width);
703 warning (r, _("Duplicate variable name %s in position %d renamed "
704 "to %s."), name, i, var_get_name (v));
707 print = convert_format (r, &fmt[0], v, &report_error);
708 write = convert_format (r, &fmt[3], v, &report_error);
709 var_set_print_format (v, &print);
710 var_set_write_format (v, &write);
712 /* Range missing values. */
713 mv_init (&miss, width);
716 double x = read_float (r);
717 double y = read_float (r);
718 mv_add_range (&miss, x, y);
720 else if (match (r, 'A'))
721 mv_add_range (&miss, read_float (r), HIGHEST);
722 else if (match (r, '9'))
723 mv_add_range (&miss, LOWEST, read_float (r));
725 /* Single missing values. */
726 while (match (r, '8'))
728 int mv_width = MIN (width, 8);
731 parse_value (r, mv_width, &value);
732 value_resize (&value, mv_width, width);
733 mv_add_value (&miss, &value);
734 value_destroy (&value, width);
737 var_set_missing_values (v, &miss);
743 read_string (r, label);
744 var_set_label (v, label);
748 if (weight_name != NULL)
750 struct variable *weight_var = dict_lookup_var (dict, weight_name);
751 if (weight_var == NULL)
752 error (r, _("Weighting variable %s not present in dictionary."),
755 dict_set_weight (dict, weight_var);
759 /* Parse a value with width WIDTH into value V. */
761 parse_value (struct pfm_reader *r, int width, union value *v)
763 value_init (v, width);
767 size_t n_bytes = read_bytes (r, buf);
768 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
771 v->f = read_float (r);
774 /* Parse a value label record, assigning its labels to the variables it names. */
776 read_value_label (struct pfm_reader *r, struct dictionary *dict)
788 v = pool_nalloc (r->pool, nv, sizeof *v);
789 for (i = 0; i < nv; i++)
792 read_string (r, name);
794 v[i] = dict_lookup_var (dict, name);
796 error (r, _("Unknown variable %s while parsing value labels."), name);
798 if (var_get_type (v[0]) != var_get_type (v[i]))
799 error (r, _("Cannot assign value labels to %s and %s, which "
800 "have different variable types."),
801 var_get_name (v[0]), var_get_name (v[i]));
804 n_labels = read_int (r);
805 for (i = 0; i < n_labels; i++)
811 parse_value (r, var_get_width (v[0]), &val);
812 read_string (r, label);
814 /* Assign the value label to each variable. */
815 for (j = 0; j < nv; j++)
816 var_replace_value_label (v[j], &val, label);
818 value_destroy (&val, var_get_width (v[0]));
822 /* Reads a set of documents from portable file R into DICT. */
824 read_documents (struct pfm_reader *r, struct dictionary *dict)
829 line_cnt = read_int (r);
830 for (i = 0; i < line_cnt; i++)
833 read_string (r, line);
834 dict_add_document_line (dict, line);
838 /* Reads and returns one case from portable file R. Returns a
839 null pointer on failure. */
840 static struct ccase *
841 por_file_casereader_read (struct casereader *reader, void *r_)
843 struct pfm_reader *r = r_;
844 struct ccase *volatile c;
847 c = case_create (r->proto);
848 setjmp (r->bail_out);
851 casereader_force_error (reader);
856 /* Check for end of file. */
863 for (i = 0; i < r->var_cnt; i++)
865 int width = caseproto_get_width (r->proto, i);
868 case_data_rw_idx (c, i)->f = read_float (r);
872 size_t n_bytes = read_bytes (r, buf);
873 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
880 /* Returns true if FILE is an SPSS portable file,
883 pfm_detect (FILE *file)
885 unsigned char header[464];
887 int cooked_cnt, raw_cnt, line_len;
890 cooked_cnt = raw_cnt = 0;
892 while (cooked_cnt < sizeof header)
895 if (c == EOF || raw_cnt++ > 512)
899 while (line_len < 80 && cooked_cnt < sizeof header)
901 header[cooked_cnt++] = ' ';
908 header[cooked_cnt++] = c;
913 memset (trans, 0, 256);
914 for (i = 64; i < 256; i++)
916 unsigned char c = header[i + 200];
918 trans[c] = portable_to_local[i];
921 for (i = 0; i < 8; i++)
922 if (trans[header[i + 456]] != "SPSSPORT"[i])
928 static const struct casereader_class por_file_casereader_class =
930 por_file_casereader_read,
931 por_file_casereader_destroy,