1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/por-file-reader.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/short-names.h"
38 #include "data/value-labels.h"
39 #include "data/variable.h"
40 #include "libpspp/compiler.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/pool.h"
45 #include "libpspp/str.h"
47 #include "gl/intprops.h"
48 #include "gl/minmax.h"
49 #include "gl/xalloc.h"
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) (msgid)
55 /* portable_to_local[PORTABLE] translates the given portable
56 character into the local character set. */
/* NOTE(review): only part of the initializer is visible in this listing.
   pfm_detect() consults this table for indexes 64 through 255, and
   read_header() likewise skips the first 64 entries of the file's
   translation table, so the rows below presumably start at index 64 --
   TODO confirm against the complete initializer. */
57 static const char portable_to_local[256] =
60 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
61 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
65 /* Portable file reader. */
/* The members below are the ones the rest of this file reaches through
   `r->' on a `struct pfm_reader *'; the struct header line itself is not
   visible in this listing. */
68 struct pool *pool; /* All the portable file state. */
70 jmp_buf bail_out; /* longjmp() target for error handling. */
72 struct file_handle *fh; /* File handle. */
73 struct fh_lock *lock; /* Read lock for file. */
74 FILE *file; /* File stream. */
75 int line_length; /* Number of characters so far on this line. */
76 char cc; /* Current character. */
/* advance() tests `trans' against NULL before using it. */
77 char *trans; /* 256-byte character set translation table. */
78 int var_cnt; /* Number of variables. */
/* pfm_open_reader() initializes `weight_index' to -1. */
79 int weight_index; /* 0-based index of weight variable, or -1. */
80 struct caseproto *proto; /* Format of output cases. */
81 bool ok; /* Set false on I/O error. */
84 static const struct casereader_class por_file_casereader_class;
/* Forward declaration of error(); only this fragment of it is visible in
   this listing. */
87 error (struct pfm_reader *r, const char *msg,...)
91 /* Displays MSG as an error message and aborts reading the
92 portable file via longjmp().

   MSG is a printf-style format string that is expanded with the trailing
   arguments.  The text is prefixed with the name of R's file and the
   current ftello() offset in it.  This function does not return to its
   caller: it ends with longjmp() to R->bail_out, so a matching setjmp()
   must be active. */
94 error (struct pfm_reader *r, const char *msg, ...)
100 ds_init_empty (&text);
101 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
102 fh_get_file_name (r->fh), (long long int) ftello (r->file));
103 va_start (args, msg);
104 ds_put_vformat (&text, msg, args);
107 m.category = MSG_C_GENERAL;
108 m.severity = MSG_S_ERROR;
114 m.text = ds_cstr (&text);
/* Unwind to whichever setjmp() most recently armed R->bail_out. */
120 longjmp (r->bail_out, 1);
123 /* Displays MSG as a warning for the current position in
124 portable file reader R.

   MSG is a printf-style format string that is expanded with the trailing
   arguments.  The text is prefixed with the name of R's file and the
   current ftello() offset in it.  Unlike error(), the visible code here
   performs no longjmp(), so reading continues afterward. */
126 warning (struct pfm_reader *r, const char *msg, ...)
132 ds_init_empty (&text);
133 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
134 fh_get_file_name (r->fh), (long long int) ftello (r->file));
135 va_start (args, msg);
136 ds_put_vformat (&text, msg, args);
139 m.category = MSG_C_GENERAL;
140 m.severity = MSG_S_WARNING;
146 m.text = ds_cstr (&text);
151 /* Close and destroy R.
152 Returns false if an error was detected on R, true otherwise. */
154 close_reader (struct pfm_reader *r)
/* A failure to close the underlying file is reported to the user with
   the system's description of errno. */
162 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
164 msg (ME, _("Error closing portable file `%s': %s."),
165 fh_get_file_name (r->fh), strerror (errno));
/* NOTE(review): pfm_open_reader() allocates R itself from a pool;
   assuming that pool is R->pool, destroying it here also releases R, so
   R must not be touched after this call -- TODO confirm. */
175 pool_destroy (r->pool);
180 /* Closes portable file reader R, after we're done with it. */
/* R_ is the `struct pfm_reader *' for READER, passed as an untyped
   pointer.  If close_reader() reports that an error was detected, the
   error is recorded on READER with casereader_force_error(). */
182 por_file_casereader_destroy (struct casereader *reader, void *r_)
184 struct pfm_reader *r = r_;
185 if (!close_reader (r))
186 casereader_force_error (reader);
189 /* Read a single character into R's current character.
   (The reader's visible member for this is `cc'; no `cur_char' member
   appears in this file.)  Reaching end of file is reported through
   error(), which does not return. */
191 advance (struct pfm_reader *r)
195 /* Read the next character from the file.
196 Ignore carriage returns entirely.
197 Mostly ignore new-lines, but if a new-line occurs before the
198 line has reached 80 bytes in length, then treat the
199 "missing" bytes as spaces. */
202 while ((c = getc (r->file)) == '\r')
207 if (r->line_length < 80)
/* Push the new-line back while the line is still short -- presumably so
   that later calls see it again and keep supplying padding until the
   line reaches 80 characters; TODO confirm against the full body. */
210 ungetc ('\n', r->file);
216 error (r, _("unexpected end of file"));
/* The translation table is consulted only once one has been installed. */
218 if (r->trans != NULL)
224 /* Skip a single character if present, and return whether it was skipped. */
227 match (struct pfm_reader *r, int c)
238 static void read_header (struct pfm_reader *);
239 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
240 static void read_variables (struct pfm_reader *, struct dictionary *);
241 static void read_value_label (struct pfm_reader *, struct dictionary *);
242 static void read_documents (struct pfm_reader *, struct dictionary *);
244 /* Reads the dictionary from file with handle FH, and returns it in a
245 dictionary structure. This dictionary may be modified in order to
246 rename, reorder, and delete variables, etc.

   The new dictionary is stored in *DICT.  Version, date and product
   information is stored through INFO by read_version_data().  On success
   the return value is a sequential casereader, created with
   por_file_casereader_class, that yields the file's cases.  Parse errors
   reported through error() come back to the setjmp() in this function. */
248 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
249 struct pfm_read_info *info)
/* NOTE(review): these locals are volatile-qualified, presumably because
   they are needed on the path taken after a longjmp() back to the
   setjmp() below -- TODO confirm against the error path. */
251 struct pool *volatile pool = NULL;
252 struct pfm_reader *volatile r = NULL;
254 *dict = dict_create (get_default_encoding ());
256 /* Create and initialize reader. */
257 pool = pool_create ();
258 r = pool_alloc (pool, sizeof *r);
264 r->weight_index = -1;
/* Arm the target that error() jumps to. */
269 if (setjmp (r->bail_out))
273 /* TRANSLATORS: this fragment will be interpolated into
274 messages in fh_lock() that identify types of files. */
275 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
280 r->file = fn_open (fh_get_file_name (r->fh), "rb");
283 msg (ME, _("An error occurred while opening `%s' for reading "
284 "as a portable file: %s."),
285 fh_get_file_name (r->fh), strerror (errno));
289 /* Read header, version, date info, product id, variables. */
291 read_version_data (r, info);
292 read_variables (r, *dict);
294 /* Read value labels. */
295 while (match (r, 'D'))
296 read_value_label (r, *dict);
298 /* Read documents. */
300 read_documents (r, *dict);
302 /* Check that we've made it to the data. */
304 error (r, _("Data record expected."));
/* The case prototype is taken from the finished dictionary and
   registered with R's pool. */
306 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
307 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
308 &por_file_casereader_class, r);
/* Failure path: the dictionary created above is destroyed again. */
312 dict_destroy (*dict);
/* Returns the value of base-30 digit C (0...9 for '0'...'9', 10...29 for
   'A'...'T'), or -1 if C is not a base-30 digit.

   The null character is rejected explicitly: strchr() treats the string
   terminator as part of the string, so without this check a C of 0 would
   be reported as a digit with value 30. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
327 /* Read a floating point value and return its value.

   Digits, in both the mantissa and the optional exponent, are base 30
   (see base_30_value()).  Leading spaces are skipped, an asterisk marks
   the system-missing value, and a number must end with a slash.
   Malformed input is reported through error(), which does not return.

   NOTE(review): in this listing the two comments near the end of the
   function that begin "We don't check whether" and "Multiply `num' by 30"
   have lost their closing lines. */
329 read_float (struct pfm_reader *r)
333 bool got_dot = false; /* Seen a decimal point? */
334 bool got_digit = false; /* Seen any digits? */
335 bool negative = false; /* Number is negative? */
337 /* Skip leading spaces. */
338 while (match (r, ' '))
341 /* `*' indicates system-missing. */
344 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
348 negative = match (r, '-');
351 int digit = base_30_value (r->cc);
356 /* Make sure that multiplication by 30 will not overflow. */
357 if (num > DBL_MAX * (1. / 30.))
358 /* The value of the digit doesn't matter, since we have already
359 gotten as many digits as can be represented in a `double'.
360 This doesn't necessarily mean the result will overflow.
361 The exponent may reduce it to within range.
363 We just need to record that there was another
364 digit so that we can multiply by 30 later. */
367 num = (num * 30.0) + digit;
369 /* Keep track of the number of digits after the decimal point.
370 If we just divided by 30 here, we would lose precision. */
374 else if (!got_dot && r->cc == '.')
375 /* Record that we have found the decimal point. */
378 /* Any other character terminates the number. */
384 /* Check that we had some digits. */
386 error (r, _("Number expected."));
388 /* Get exponent if any. */
389 if (r->cc == '+' || r->cc == '-')
392 bool negative_exponent = r->cc == '-';
395 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
397 if (exp > LONG_MAX / 30)
402 exp = exp * 30 + digit;
405 /* We don't check whether there were actually any digits, but we
407 if (negative_exponent)
412 /* Numbers must end with `/'. */
414 error (r, _("Missing numeric terminator."));
416 /* Multiply `num' by 30 to the `exponent' power, checking for
419 num *= pow (30.0, (double) exponent);
420 else if (exponent > 0)
422 if (num > DBL_MAX * pow (30.0, (double) -exponent))
425 num *= pow (30.0, (double) exponent);
428 return negative ? -num : num;
431 /* Read an integer and return its value. */
433 read_int (struct pfm_reader *r)
/* Integers use the same on-disk syntax as other numbers, so parse one
   with read_float() and then validate the result. */
435 double f = read_float (r);
/* Reject anything with a fractional part, and anything at or beyond
   INT_MIN or INT_MAX (the two limits themselves are rejected too). */
436 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
437 error (r, _("Invalid integer."));
441 /* Reads a string into BUF, which must have room for 256 characters. */
444 read_string (struct pfm_reader *r, char *buf)
/* A string is stored as an integer length followed by its contents. */
446 int n = read_int (r);
/* Lengths outside 0...255 are reported through error(), which does not
   return. */
447 if (n < 0 || n > 255)
448 error (r, _("Bad string length %d."), n);
459 /* Reads a string into BUF, which must have room for 256 bytes.
461 Returns the number of bytes read. */
464 read_bytes (struct pfm_reader *r, uint8_t *buf)
/* The data is preceded by an integer byte count. */
466 int n = read_int (r);
/* Counts outside 0...255 are reported through error(), which does not
   return. */
467 if (n < 0 || n > 255)
468 error (r, _("Bad string length %d."), n);
480 /* Reads a string and returns a copy of it allocated from R's pool. */
483 read_pool_string (struct pfm_reader *r)
486 read_string (r, string);
/* The copy belongs to R->pool, which close_reader() destroys. */
487 return pool_strdup (r->pool, string);
490 /* Reads the 464-byte file header. */
/* As read below, the header is 200 bytes of splash strings, a 256-byte
   translation table, and the 8-byte signature "SPSSPORT"
   (200 + 256 + 8 = 464). */
492 read_header (struct pfm_reader *r)
497 /* Read and ignore vanity splash strings. */
498 for (i = 0; i < 200; i++)
501 /* Skip the first 64 characters of the translation table.
502 We don't care about these. They are probably all set to
503 '0', marking them as untranslatable, and that would screw
504 up our actual translation of the real '0'. */
505 for (i = 0; i < 64; i++)
508 /* Read the rest of the translation table. */
509 trans = pool_malloc (r->pool, 256);
/* Bytes that never appear in the file's table keep a translation of 0. */
510 memset (trans, 0, 256);
/* File byte C stands for portable character I, so map it to the local
   equivalent of I. */
519 trans[c] = portable_to_local[i];
522 /* Set up the translation table, then read the first
523 translated character. */
527 /* Skip and verify signature. */
528 for (i = 0; i < 8; i++)
529 if (!match (r, "SPSSPORT"[i]))
/* This bails out directly instead of calling error(), so the message is
   not given error()'s "corrupt at offset" prefix. */
531 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
532 longjmp (r->bail_out, 1);
536 /* Reads the version and date info record, as well as product and
537 subproduct identification records if present.

   The creation date and time, product and subproduct are stored into
   INFO.  Problems are reported through error(), which does not return. */
539 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
541 static const char empty_string[] = "";
543 const char *product, *author, *subproduct;
548 error (r, _("Unrecognized version code `%c'."), r->cc);
549 date = read_pool_string (r);
550 time = read_pool_string (r);
/* The records tagged '1', '2' and '3' are optional; an absent one is
   treated as an empty string.  NOTE(review): no visible line stores
   `author' anywhere. */
551 product = match (r, '1') ? read_pool_string (r) : empty_string;
552 author = match (r, '2') ? read_pool_string (r) : empty_string;
553 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
556 if (strlen (date) != 8)
557 error (r, _("Bad date string length %zu."), strlen (date));
558 if (strlen (time) != 6)
559 error (r, _("Bad time string length %zu."), strlen (time));
561 /* Save file info. */
/* Rearrange the 8 date characters: input positions 0-3 go to output
   positions 6-9, 4-5 go to 3-4, and 6-7 go to 0-1, with spaces at 2 and
   5 (presumably YYYYMMDD becoming "DD MM YYYY" -- TODO confirm). */
565 for (i = 0; i < 8; i++)
567 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
568 info->creation_date[map[i]] = date[i];
570 info->creation_date[2] = info->creation_date[5] = ' ';
571 info->creation_date[10] = 0;
/* Spread the 6 time characters into three 2-character groups separated
   by spaces at positions 2 and 5. */
574 for (i = 0; i < 6; i++)
576 static const int map[] = {0, 1, 3, 4, 6, 7};
577 info->creation_time[map[i]] = time[i];
579 info->creation_time[2] = info->creation_time[5] = ' ';
580 info->creation_time[8] = 0;
583 str_copy_trunc (info->product, sizeof info->product, product);
584 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
588 /* Translates a format specification read from portable file R as
589 the three integers PORTABLE_FORMAT into a normal format specifier,
590 checking that the format is appropriate for variable V.

   PORTABLE_FORMAT[0] is the format type code as stored in the file,
   PORTABLE_FORMAT[1] the width and PORTABLE_FORMAT[2] the number of
   decimals.  The converted format is returned.  On the final fallback
   path shown below, *REPORT_ERROR is cleared and the default format for
   V's width is returned instead.

   NOTE(review): read_variables() passes the same REPORT_ERROR flag for a
   variable's print and write formats, presumably so that only the first
   bad format of a variable produces a warning -- TODO confirm. */
591 static struct fmt_spec
592 convert_format (struct pfm_reader *r, const int portable_format[3],
593 struct variable *v, bool *report_error)
595 struct fmt_spec format;
598 if (!fmt_from_io (portable_format[0], &format.type))
601 warning (r, _("%s: Bad format specifier byte (%d). Variable "
602 "will be assigned a default format."),
603 var_get_name (v), portable_format[0]);
607 format.w = portable_format[1];
608 format.d = portable_format[2];
/* The format must be valid for output and compatible with V's width. */
611 ok = (fmt_check_output (&format)
612 && fmt_check_width_compat (&format, var_get_width (v)));
619 char fmt_string[FMT_STRING_LEN_MAX + 1];
620 fmt_to_string (&format, fmt_string);
621 if (var_is_numeric (v))
622 warning (r, _("Numeric variable %s has invalid format "
624 var_get_name (v), fmt_string);
626 warning (r, _("String variable %s with width %d has "
627 "invalid format specifier %s."),
628 var_get_name (v), var_get_width (v), fmt_string);
636 *report_error = false;
637 return fmt_default_for_width (var_get_width (v));
640 static void parse_value (struct pfm_reader *, int width, union value *);
642 /* Read information on all the variables.

   Reads the variable count, an optional weight variable name, and then
   one record per variable, creating each variable in DICT with its
   formats, missing values and label.  Problems are reported through
   error(), which does not return. */
644 read_variables (struct pfm_reader *r, struct dictionary *dict)
646 char *weight_name = NULL;
650 error (r, _("Expected variable count record."));
652 r->var_cnt = read_int (r);
654 error (r, _("Invalid number of variables %d."), r->var_cnt);
656 /* Purpose of this value is unknown. It is typically 161. */
661 weight_name = read_pool_string (r);
662 if (strlen (weight_name) > SHORT_NAME_LEN)
663 error (r, _("Weight variable name (%s) truncated."), weight_name);
666 for (i = 0; i < r->var_cnt; i++)
672 struct missing_values miss;
673 struct fmt_spec print, write;
674 bool report_error = true;
678 error (r, _("Expected variable record."));
680 width = read_int (r);
682 error (r, _("Invalid variable width %d."), width);
684 read_string (r, name);
/* Six integers follow the name: fmt[0..2] describe the print format and
   fmt[3..5] the write format (see the convert_format() calls below). */
685 for (j = 0; j < 6; j++)
686 fmt[j] = read_int (r);
/* Names starting with '#' or '$' are rejected in addition to whatever
   dict_id_is_valid() rejects. */
688 if (!dict_id_is_valid (dict, name, false)
689 || *name == '#' || *name == '$')
690 error (r, _("Invalid variable name `%s' in position %d."), name, i);
691 str_uppercase (name);
693 if (width < 0 || width > 255)
694 error (r, _("Bad width %d for variable %s."), width, name);
696 v = dict_create_var (dict, name, width);
/* NOTE(review): try_name is sized for a name of at most 8 characters,
   while NAME is filled by read_string(), which accepts up to 255.
   Whether an earlier check bounds the length of NAME is not visible
   here; if none does, this unbounded sprintf() can overflow -- TODO
   confirm. */
702 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
703 sprintf (try_name, "%s_%lu", name, i);
704 v = dict_create_var (dict, try_name, width);
708 warning (r, _("Duplicate variable name %s in position %d renamed "
709 "to %s."), name, i, var_get_name (v));
712 print = convert_format (r, &fmt[0], v, &report_error);
713 write = convert_format (r, &fmt[3], v, &report_error);
714 var_set_print_format (v, &print);
715 var_set_write_format (v, &write);
717 /* Range missing values. */
718 mv_init (&miss, width);
721 double x = read_float (r);
722 double y = read_float (r);
723 mv_add_range (&miss, x, y);
/* Tag 'A' gives a range from the value read up to HIGHEST; tag '9'
   gives a range from LOWEST up to the value read. */
725 else if (match (r, 'A'))
726 mv_add_range (&miss, read_float (r), HIGHEST);
727 else if (match (r, '9'))
728 mv_add_range (&miss, LOWEST, read_float (r));
730 /* Single missing values. */
731 while (match (r, '8'))
/* A missing value is parsed with a width of at most 8 and then resized
   to the variable's full width before it is added. */
733 int mv_width = MIN (width, 8);
736 parse_value (r, mv_width, &value);
737 value_resize (&value, mv_width, width);
738 mv_add_value (&miss, &value);
739 value_destroy (&value, width);
742 var_set_missing_values (v, &miss);
748 read_string (r, label);
749 var_set_label (v, label, false); /* XXX */
/* The weight variable can be resolved only now that every variable has
   been created. */
753 if (weight_name != NULL)
755 struct variable *weight_var = dict_lookup_var (dict, weight_name);
756 if (weight_var == NULL)
757 error (r, _("Weighting variable %s not present in dictionary."),
760 dict_set_weight (dict, weight_var);
764 /* Parse a value of width WIDTH into value V.

   V is initialized here with value_init(); the callers in this file
   release it afterward with value_destroy(). */
766 parse_value (struct pfm_reader *r, int width, union value *v)
768 value_init (v, width);
/* String data: the bytes read are copied into V and padded on the right
   with spaces up to WIDTH. */
772 size_t n_bytes = read_bytes (r, buf);
773 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
/* Numeric data: the number is stored directly. */
776 v->f = read_float (r);
779 /* Parse a value label record.

   The record names a list of variables and then gives a list of
   (value, label) pairs, each of which is applied to every variable in
   the list.  The prototype earlier in this file declares the function
   void; failures are reported through error(), which does not return. */
781 read_value_label (struct pfm_reader *r, struct dictionary *dict)
793 v = pool_nalloc (r->pool, nv, sizeof *v);
794 for (i = 0; i < nv; i++)
797 read_string (r, name);
799 v[i] = dict_lookup_var (dict, name);
801 error (r, _("Unknown variable %s while parsing value labels."), name);
/* Every variable in the list must have the same type as the first. */
803 if (var_get_type (v[0]) != var_get_type (v[i]))
804 error (r, _("Cannot assign value labels to %s and %s, which "
805 "have different variable types."),
806 var_get_name (v[0]), var_get_name (v[i]));
809 n_labels = read_int (r);
810 for (i = 0; i < n_labels; i++)
/* NOTE(review): values are parsed with the width of the first variable,
   but only the types of the variables are compared above, so string
   variables of different widths could share one record -- TODO confirm
   that this is handled. */
816 parse_value (r, var_get_width (v[0]), &val);
817 read_string (r, label);
819 /* Assign the value label to each variable. */
820 for (j = 0; j < nv; j++)
821 var_replace_value_label (v[j], &val, label);
823 value_destroy (&val, var_get_width (v[0]));
827 /* Reads a set of documents from portable file R into DICT.

   The record consists of a line count followed by that many strings;
   each string is appended to DICT as one document line. */
829 read_documents (struct pfm_reader *r, struct dictionary *dict)
834 line_cnt = read_int (r);
835 for (i = 0; i < line_cnt; i++)
838 read_string (r, line);
839 dict_add_document_line (dict, line, false);
843 /* Reads and returns one case from portable file R. Returns a
844 null pointer on failure.

   R_ is the `struct pfm_reader *' for READER, passed as an untyped
   pointer.  On the error path READER is marked with
   casereader_force_error(). */
845 static struct ccase *
846 por_file_casereader_read (struct casereader *reader, void *r_)
848 struct pfm_reader *r = r_;
849 struct ccase *volatile c;
852 c = case_create (r->proto);
/* NOTE(review): the result of this setjmp() call is discarded.  If a
   checked setjmp() follows on a line not visible in this listing, this
   call is redundant; if none does, a longjmp() from error() would resume
   here without any error handling -- TODO confirm. */
853 setjmp (r->bail_out);
856 casereader_force_error (reader);
861 /* Check for end of file. */
/* One value is read per variable, using the width recorded in the case
   prototype. */
868 for (i = 0; i < r->var_cnt; i++)
870 int width = caseproto_get_width (r->proto, i);
873 case_data_rw_idx (c, i)->f = read_float (r);
/* String data is padded on the right with spaces up to WIDTH. */
877 size_t n_bytes = read_bytes (r, buf);
878 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
885 /* Returns true if FILE is an SPSS portable file, false otherwise. */
888 pfm_detect (FILE *file)
/* Holds the 464-byte header: as indexed below, the translation table
   occupies bytes 200 through 455 and the signature bytes 456 through
   463. */
890 unsigned char header[464];
892 int cooked_cnt, raw_cnt, line_len;
895 cooked_cnt = raw_cnt = 0;
897 while (cooked_cnt < sizeof header)
/* Stop at end of file, or once the count of raw bytes examined passes
   the limit tested here. */
900 if (c == EOF || raw_cnt++ > 512)
/* Lines shorter than 80 characters are padded with spaces. */
904 while (line_len < 80 && cooked_cnt < sizeof header)
906 header[cooked_cnt++] = ' ';
913 header[cooked_cnt++] = c;
/* Build the translation for table entries 64 through 255. */
918 memset (trans, 0, 256);
919 for (i = 64; i < 256; i++)
921 unsigned char c = header[i + 200];
923 trans[c] = portable_to_local[i];
/* The translated signature must spell "SPSSPORT". */
926 for (i = 0; i < 8; i++)
927 if (trans[header[i + 456]] != "SPSSPORT"[i])
/* Casereader class for portable files.  pfm_open_reader() passes it to
   casereader_create_sequential().  The read and destroy callbacks are
   listed below; any further members are not visible in this listing. */
933 static const struct casereader_class por_file_casereader_class =
935 por_file_casereader_read,
936 por_file_casereader_destroy,