1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/por-file-reader.h"
30 #include "data/casereader-provider.h"
31 #include "data/casereader.h"
32 #include "data/dictionary.h"
33 #include "data/file-handle-def.h"
34 #include "data/file-name.h"
35 #include "data/format.h"
36 #include "data/missing-values.h"
37 #include "data/short-names.h"
38 #include "data/value-labels.h"
39 #include "data/variable.h"
40 #include "libpspp/compiler.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/pool.h"
45 #include "libpspp/str.h"
47 #include "gl/intprops.h"
48 #include "gl/minmax.h"
49 #include "gl/xalloc.h"
/* Message-translation shorthands.  _() passes MSGID through gettext();
   N_() expands to MSGID itself, for strings such as the one handed to
   fh_lock() in pfm_open_reader. */
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) (msgid)
55 /* portable_to_local[PORTABLE] translates the given portable
56 character into the local character set.
   read_header() and pfm_detect() index this table with positions 64
   through 255 when they build a file's translation table.
   NOTE(review): the initializer is a string literal, so every column,
   including each run of spaces, is significant; check the spacing
   against a pristine copy before editing these rows. */
57 static const char portable_to_local[256] =
60 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
61 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
65 /* Portable file reader. */
68 struct pool *pool; /* All the portable file state. */
70 jmp_buf bail_out; /* longjmp() target for error handling; armed with setjmp() in pfm_open_reader and por_file_casereader_read. */
72 struct file_handle *fh; /* File handle. */
73 struct fh_lock *lock; /* Read lock for file. */
74 FILE *file; /* File stream. */
75 int line_length; /* Number of characters so far on this line. */
76 char cc; /* Current character. */
77 char *trans; /* 256-byte character set translation table; advance() translates only when this is non-null. */
78 int var_cnt; /* Number of variables. */
79 int weight_index; /* 0-based index of weight variable, or -1. */
80 struct caseproto *proto; /* Format of output cases. */
81 bool ok; /* Set false on I/O error. */
/* Forward declarations: the casereader class is defined at the end of
   the file, and error() is defined just below. */
84 static const struct casereader_class por_file_casereader_class;
87 error (struct pfm_reader *r, const char *msg,...)
91 /* Displays MSG as an error message and aborts reading the
92 portable file via longjmp().
   MSG is a format string for the trailing arguments.  The message is
   prefixed with R's file name and the current offset in R->file.
   Control is transferred to R->bail_out, so callers may treat a call
   to this function as not returning. */
94 error (struct pfm_reader *r, const char *msg, ...)
100 ds_init_empty (&text);
/* The offset is cast to long long to match the %llx conversion. */
101 ds_put_format (&text, _("portable file %s corrupt at offset 0x%llx: "),
102 fh_get_file_name (r->fh), (long long int) ftello (r->file));
/* Append the caller's formatted message after the prefix. */
103 va_start (args, msg);
104 ds_put_vformat (&text, msg, args);
107 m.category = MSG_C_GENERAL;
108 m.severity = MSG_S_ERROR;
114 m.text = ds_cstr (&text);
/* Unwind to whichever setjmp() most recently armed R->bail_out. */
120 longjmp (r->bail_out, 1);
123 /* Displays MSG as a warning for the current position in
124 portable file reader R.
   MSG is a format string for the trailing arguments.  The message is
   prefixed with R's file name and the current offset in R->file and is
   given severity MSG_S_WARNING. */
126 warning (struct pfm_reader *r, const char *msg, ...)
132 ds_init_empty (&text);
/* The offset is cast to long long to match the %llx conversion. */
133 ds_put_format (&text, _("reading portable file %s at offset 0x%llx: "),
134 fh_get_file_name (r->fh), (long long int) ftello (r->file));
/* Append the caller's formatted message after the prefix. */
135 va_start (args, msg);
136 ds_put_vformat (&text, msg, args);
139 m.category = MSG_C_GENERAL;
140 m.severity = MSG_S_WARNING;
146 m.text = ds_cstr (&text);
151 /* Close and destroy R.
152 Returns false if an error was detected on R, true otherwise. */
154 close_reader (struct pfm_reader *r)
/* A failed close is reported with the file name and strerror (errno). */
162 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
164 msg (ME, _("Error closing portable file `%s': %s."),
165 fh_get_file_name (r->fh), strerror (errno));
/* NOTE(review): pfm_open_reader allocates R itself with pool_alloc(),
   so R is presumably released here along with the pool and must not be
   used afterward -- confirm that R->pool is that same pool. */
175 pool_destroy (r->pool);
/* Casereader "destroy" callback: shuts down the portable file reader
   passed as R_ once READER is finished with it.  If closing reports a
   problem, READER is put into the error state. */
static void
por_file_casereader_destroy (struct casereader *reader, void *r_)
{
  struct pfm_reader *pfm = r_;

  if (close_reader (pfm))
    return;

  /* Closing failed: make the failure visible on the casereader. */
  casereader_force_error (reader);
}
189 /* Read a single character into R's current character, R->cc.
   End of file is reported through error(), which does not return. */
191 advance (struct pfm_reader *r)
195 /* Read the next character from the file.
196 Ignore carriage returns entirely.
197 Mostly ignore new-lines, but if a new-line occurs before the
198 line has reached 80 bytes in length, then treat the
199 "missing" bytes as spaces. */
202 while ((c = getc (r->file)) == '\r')
207 if (r->line_length < 80)
/* Push the new-line back so that it is read again on the next call
   while the line is still shorter than 80 characters. */
210 ungetc ('\n', r->file);
216 error (r, _("unexpected end of file"));
/* Character set translation applies only once a table is installed. */
218 if (r->trans != NULL)
224 /* Skip a single character if present, and return whether it was
   skipped. */
227 match (struct pfm_reader *r, int c)
/* Forward declarations for the record readers that pfm_open_reader
   calls; each is defined later in this file. */
238 static void read_header (struct pfm_reader *);
239 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
240 static void read_variables (struct pfm_reader *, struct dictionary *);
241 static void read_value_label (struct pfm_reader *, struct dictionary *);
242 static void read_documents (struct pfm_reader *, struct dictionary *);
244 /* Reads the dictionary from file with handle FH, and returns it in a
245 dictionary structure stored in *DICT. This dictionary may be modified in order to
246 rename, reorder, and delete variables, etc.
   INFO is passed on to read_version_data(), which fills it in.
   On success, returns a sequential casereader that yields the file's
   cases.  The failure path destroys *DICT. */
248 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
249 struct pfm_read_info *info)
/* These locals are volatile because they are used on both sides of the
   setjmp() below. */
251 struct pool *volatile pool = NULL;
252 struct pfm_reader *volatile r = NULL;
254 *dict = dict_create (get_default_encoding ());
256 /* Create and initialize reader. */
257 pool = pool_create ();
258 r = pool_alloc (pool, sizeof *r);
264 r->weight_index = -1;
/* error() longjmp()s back to this point when the file is corrupt. */
269 if (setjmp (r->bail_out))
273 /* TRANSLATORS: this fragment will be interpolated into
274 messages in fh_lock() that identify types of files. */
275 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
280 r->file = fn_open (fh_get_file_name (r->fh), "rb");
283 msg (ME, _("An error occurred while opening `%s' for reading "
284 "as a portable file: %s."),
285 fh_get_file_name (r->fh), strerror (errno));
289 /* Read header, version, date info, product id, variables. */
291 read_version_data (r, info);
292 read_variables (r, *dict);
294 /* Read value labels. */
295 while (match (r, 'D'))
296 read_value_label (r, *dict);
298 /* Read documents. */
300 read_documents (r, *dict);
302 /* Check that we've made it to the data. */
304 error (r, _("Data record expected."));
/* The case prototype is taken from the finished dictionary and tied to
   the reader's pool. */
306 r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
307 return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX,
308 &por_file_casereader_class, r);
312 dict_destroy (*dict);
/* Returns the value of base-30 digit C,
   or -1 if C is not a base-30 digit.

   The digits are `0'...`9' (values 0...9) followed by `A'...`T'
   (values 10...29).  A null byte is never a digit. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() treats the terminating null byte as part of the string,
     so searching for 0 would "find" it at index 30.  The translation
     tables map unknown bytes to 0, so this case does occur. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
327 /* Read a floating point value and return its value.
   Numbers are written in base 30: optional leading spaces, an optional
   `-', digits with at most one `.', an optional `+' or `-' followed by
   base-30 exponent digits, and a terminating `/'.  `*' introduces the
   system-missing value.  Malformed input is reported through error(),
   which does not return. */
329 read_float (struct pfm_reader *r)
333 bool got_dot = false; /* Seen a decimal point? */
334 bool got_digit = false; /* Seen any digits? */
335 bool negative = false; /* Number is negative? */
337 /* Skip leading spaces. */
338 while (match (r, ' '))
341 /* `*' indicates system-missing. */
344 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
348 negative = match (r, '-');
351 int digit = base_30_value (r->cc);
356 /* Make sure that multiplication by 30 will not overflow. */
357 if (num > DBL_MAX * (1. / 30.))
358 /* The value of the digit doesn't matter, since we have already
359 gotten as many digits as can be represented in a `double'.
360 This doesn't necessarily mean the result will overflow.
361 The exponent may reduce it to within range.
363 We just need to record that there was another
364 digit so that we can multiply by 30 later. */
367 num = (num * 30.0) + digit;
369 /* Keep track of the number of digits after the decimal point.
370 If we just divided by 30 here, we would lose precision. */
374 else if (!got_dot && r->cc == '.')
375 /* Record that we have found the decimal point. */
378 /* Any other character terminates the number. */
384 /* Check that we had some digits. */
386 error (r, _("Number expected."));
388 /* Get exponent if any. */
389 if (r->cc == '+' || r->cc == '-')
392 bool negative_exponent = r->cc == '-';
395 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
/* Guard the multiplication by 30 below against long overflow. */
397 if (exp > LONG_MAX / 30)
402 exp = exp * 30 + digit;
405 /* We don't check whether there were actually any digits, but we
   probably should. */
407 if (negative_exponent)
412 /* Numbers must end with `/'. */
414 error (r, _("Missing numeric terminator."));
416 /* Multiply `num' by 30 to the `exponent' power, checking for
   overflow. */
419 num *= pow (30.0, (double) exponent);
420 else if (exponent > 0)
422 if (num > DBL_MAX * pow (30.0, (double) -exponent))
425 num *= pow (30.0, (double) exponent);
428 return negative ? -num : num;
/* Reads a number with read_float() and returns it as an int.
   Reports "Invalid integer." through error(), which does not return,
   unless the number is whole and lies strictly between INT_MIN and
   INT_MAX. */
static int
read_int (struct pfm_reader *r)
{
  const double value = read_float (r);

  if (value <= INT_MIN || value >= INT_MAX || floor (value) != value)
    error (r, _("Invalid integer."));
  return value;
}
441 /* Reads a string into BUF, which must have room for 256
   characters.
   The string's length comes first in the file and is read with
   read_int(); a length outside 0...255 is reported through error(),
   which does not return. */
444 read_string (struct pfm_reader *r, char *buf)
446 int n = read_int (r);
447 if (n < 0 || n > 255)
448 error (r, _("Bad string length %d."), n);
459 /* Reads a string into BUF, which must have room for 256
   characters.
461 Returns the number of bytes read.
   The length comes first in the file and is read with read_int(); a
   length outside 0...255 is reported through error(), which does not
   return. */
464 read_bytes (struct pfm_reader *r, uint8_t *buf)
466 int n = read_int (r);
467 if (n < 0 || n > 255)
468 error (r, _("Bad string length %d."), n);
480 /* Reads a string and returns a copy of it allocated from R's
483 read_pool_string (struct pfm_reader *r)
486 read_string (r, string);
487 return pool_strdup (r->pool, string);
490 /* Reads the 464-byte file header.
   The header consists of 200 bytes of splash strings, a 256-byte
   character set translation table, and the 8-byte signature
   "SPSSPORT".  A signature mismatch is reported and reading is
   abandoned via longjmp() to R->bail_out. */
492 read_header (struct pfm_reader *r)
497 /* Read and ignore vanity splash strings. */
498 for (i = 0; i < 200; i++)
501 /* Skip the first 64 characters of the translation table.
502 We don't care about these. They are probably all set to
503 '0', marking them as untranslatable, and that would screw
504 up our actual translation of the real '0'. */
505 for (i = 0; i < 64; i++)
508 /* Read the rest of the translation table. */
509 trans = pool_malloc (r->pool, 256);
/* Entries that are never assigned below stay 0. */
510 memset (trans, 0, 256);
519 trans[c] = portable_to_local[i];
522 /* Set up the translation table, then read the first
523 translated character. */
527 /* Skip and verify signature. */
528 for (i = 0; i < 8; i++)
529 if (!match (r, "SPSSPORT"[i]))
531 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
532 longjmp (r->bail_out, 1);
536 /* Reads the version and date info record, as well as product and
537 subproduct identification records if present.
   The date string must be 8 characters long and the time string 6;
   anything else is reported through error(), which does not return.
   The results are stored into INFO's creation_date, creation_time,
   product and subproduct members. */
539 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
/* Stands in for the product and subproduct when their records are
   absent. */
541 static const char empty_string[] = "";
543 const char *product, *subproduct;
548 error (r, _("Unrecognized version code `%c'."), r->cc);
549 date = read_pool_string (r);
550 time = read_pool_string (r);
551 product = match (r, '1') ? read_pool_string (r) : empty_string;
554 /* Skip "author" field. */
555 read_pool_string (r);
557 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
560 if (strlen (date) != 8)
561 error (r, _("Bad date string length %zu."), strlen (date));
562 if (strlen (time) != 6)
563 error (r, _("Bad time string length %zu."), strlen (time));
565 /* Save file info. */
/* The map moves date[0..3] to positions 6..9, date[4..5] to 3..4 and
   date[6..7] to 0..1, with spaces at positions 2 and 5 (presumably
   YYYYMMDD becoming "DD MM YYYY" -- TODO confirm the file's layout). */
569 for (i = 0; i < 8; i++)
571 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
572 info->creation_date[map[i]] = date[i];
574 info->creation_date[2] = info->creation_date[5] = ' ';
575 info->creation_date[10] = 0;
/* The time digits are copied in pairs, with spaces at positions 2
   and 5. */
578 for (i = 0; i < 6; i++)
580 static const int map[] = {0, 1, 3, 4, 6, 7};
581 info->creation_time[map[i]] = time[i];
583 info->creation_time[2] = info->creation_time[5] = ' ';
584 info->creation_time[8] = 0;
587 str_copy_trunc (info->product, sizeof info->product, product);
588 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
592 /* Translates a format specification read from portable file R as
593 the three integers PORTABLE_FORMAT (type, width, decimals) into a
594 normal format specifier, which is returned, checking that the format
   is appropriate for variable V.
   When the specification cannot be used, the default format for V's
   width is returned instead and *REPORT_ERROR is set to false. */
595 static struct fmt_spec
596 convert_format (struct pfm_reader *r, const int portable_format[3],
597 struct variable *v, bool *report_error)
599 struct fmt_spec format;
602 if (!fmt_from_io (portable_format[0], &format.type))
605 warning (r, _("%s: Bad format specifier byte (%d). Variable "
606 "will be assigned a default format."),
607 var_get_name (v), portable_format[0]);
611 format.w = portable_format[1];
612 format.d = portable_format[2];
/* The format must be valid for output and compatible with V's width. */
615 ok = (fmt_check_output (&format)
616 && fmt_check_width_compat (&format, var_get_width (v)));
623 char fmt_string[FMT_STRING_LEN_MAX + 1];
624 fmt_to_string (&format, fmt_string);
625 if (var_is_numeric (v))
626 warning (r, _("Numeric variable %s has invalid format "
628 var_get_name (v), fmt_string);
630 warning (r, _("String variable %s with width %d has "
631 "invalid format specifier %s."),
632 var_get_name (v), var_get_width (v), fmt_string);
640 *report_error = false;
641 return fmt_default_for_width (var_get_width (v));
644 static void parse_value (struct pfm_reader *, int width, union value *);
646 /* Read information on all the variables.
   Stores the variable count in R->var_cnt and, for each variable
   record, creates a variable in DICT with its print and write formats,
   missing values and label.  If a weight variable was named, it is
   looked up afterward and made DICT's weight.  Malformed records are
   reported through error(), which does not return. */
648 read_variables (struct pfm_reader *r, struct dictionary *dict)
650 char *weight_name = NULL;
654 error (r, _("Expected variable count record."));
656 r->var_cnt = read_int (r);
658 error (r, _("Invalid number of variables %d."), r->var_cnt);
665 weight_name = read_pool_string (r);
666 if (strlen (weight_name) > SHORT_NAME_LEN)
667 error (r, _("Weight variable name (%s) truncated."), weight_name);
670 for (i = 0; i < r->var_cnt; i++)
676 struct missing_values miss;
677 struct fmt_spec print, write;
678 bool report_error = true;
682 error (r, _("Expected variable record."));
684 width = read_int (r);
686 error (r, _("Invalid variable width %d."), width);
688 read_string (r, name);
/* Six integers: fmt[0..2] is the print format and fmt[3..5] the write
   format, each converted by convert_format() below. */
689 for (j = 0; j < 6; j++)
690 fmt[j] = read_int (r);
692 if (!dict_id_is_valid (dict, name, false)
693 || *name == '#' || *name == '$')
694 error (r, _("Invalid variable name `%s' in position %d."), name, i);
695 str_uppercase (name);
697 if (width < 0 || width > 255)
698 error (r, _("Bad width %d for variable %s."), width, name);
700 v = dict_create_var (dict, name, width);
/* NOTE(review): try_name reserves only 8 bytes for NAME, but NAME is
   filled by read_string(), which accepts up to 255 characters.  If
   dict_id_is_valid() admits names longer than 8 bytes, the sprintf()
   below overflows try_name; consider snprintf() or a buffer sized from
   strlen (name) -- TODO confirm the name length limit. */
706 char try_name[8 + 1 + INT_STRLEN_BOUND (i) + 1];
707 sprintf (try_name, "%s_%lu", name, i);
708 v = dict_create_var (dict, try_name, width);
712 warning (r, _("Duplicate variable name %s in position %d renamed "
713 "to %s."), name, i, var_get_name (v));
716 print = convert_format (r, &fmt[0], v, &report_error);
717 write = convert_format (r, &fmt[3], v, &report_error);
718 var_set_print_format (v, &print);
719 var_set_write_format (v, &write);
721 /* Range missing values. */
722 mv_init (&miss, width);
725 double x = read_float (r);
726 double y = read_float (r);
727 mv_add_range (&miss, x, y);
729 else if (match (r, 'A'))
730 mv_add_range (&miss, read_float (r), HIGHEST);
731 else if (match (r, '9'))
732 mv_add_range (&miss, LOWEST, read_float (r));
734 /* Single missing values. */
735 while (match (r, '8'))
/* The value is parsed at a width of at most 8 and then resized to the
   variable's full width. */
737 int mv_width = MIN (width, 8);
740 parse_value (r, mv_width, &value);
741 value_resize (&value, mv_width, width);
742 mv_add_value (&miss, &value);
743 value_destroy (&value, width);
746 var_set_missing_values (v, &miss);
752 read_string (r, label);
753 var_set_label (v, label, false); /* XXX */
757 if (weight_name != NULL)
759 struct variable *weight_var = dict_lookup_var (dict, weight_name);
760 if (weight_var == NULL)
761 error (r, _("Weighting variable %s not present in dictionary."),
764 dict_set_weight (dict, weight_var);
768 /* Parse a value of with WIDTH into value V. */
770 parse_value (struct pfm_reader *r, int width, union value *v)
772 value_init (v, width);
776 size_t n_bytes = read_bytes (r, buf);
777 value_copy_buf_rpad (v, width, buf, n_bytes, ' ');
780 v->f = read_float (r);
783 /* Parse a value label record and return success. */
785 read_value_label (struct pfm_reader *r, struct dictionary *dict)
797 v = pool_nalloc (r->pool, nv, sizeof *v);
798 for (i = 0; i < nv; i++)
801 read_string (r, name);
803 v[i] = dict_lookup_var (dict, name);
805 error (r, _("Unknown variable %s while parsing value labels."), name);
807 if (var_get_type (v[0]) != var_get_type (v[i]))
808 error (r, _("Cannot assign value labels to %s and %s, which "
809 "have different variable types."),
810 var_get_name (v[0]), var_get_name (v[i]));
813 n_labels = read_int (r);
814 for (i = 0; i < n_labels; i++)
820 parse_value (r, var_get_width (v[0]), &val);
821 read_string (r, label);
823 /* Assign the value label to each variable. */
824 for (j = 0; j < nv; j++)
825 var_replace_value_label (v[j], &val, label);
827 value_destroy (&val, var_get_width (v[0]));
831 /* Reads a set of documents from portable file R into DICT. */
833 read_documents (struct pfm_reader *r, struct dictionary *dict)
838 line_cnt = read_int (r);
839 for (i = 0; i < line_cnt; i++)
842 read_string (r, line);
843 dict_add_document_line (dict, line, false);
847 /* Reads and returns one case from portable file R. Returns a
848 null pointer on failure.
   R_ is the struct pfm_reader that was handed to
   casereader_create_sequential().  Each of the R->var_cnt values is
   read according to its width in R->proto: numbers with read_float(),
   strings with read_bytes(). */
849 static struct ccase *
850 por_file_casereader_read (struct casereader *reader, void *r_)
852 struct pfm_reader *r = r_;
/* Volatile because C is used on both sides of the setjmp() below. */
853 struct ccase *volatile c;
856 c = case_create (r->proto);
/* error() longjmp()s back to this point.  The call's return value is
   not used on this line. */
857 setjmp (r->bail_out);
860 casereader_force_error (reader);
865 /* Check for end of file. */
872 for (i = 0; i < r->var_cnt; i++)
874 int width = caseproto_get_width (r->proto, i);
877 case_data_rw_idx (c, i)->f = read_float (r);
/* Strings are padded on the right with spaces to the variable's
   width. */
881 size_t n_bytes = read_bytes (r, buf);
882 u8_buf_copy_rpad (case_str_rw_idx (c, i), width, buf, n_bytes, ' ');
889 /* Returns true if FILE is an SPSS portable file,
   false otherwise.
   Collects the 464-byte header, padding any line shorter than 80
   characters with spaces, then builds a translation table from header
   bytes 264...455 and checks that header bytes 456...463 translate to
   "SPSSPORT". */
892 pfm_detect (FILE *file)
894 unsigned char header[464];
896 int cooked_cnt, raw_cnt, line_len;
899 cooked_cnt = raw_cnt = 0;
901 while (cooked_cnt < sizeof header)
/* Stop at end of file or once more than 512 raw bytes have been
   consumed. */
904 if (c == EOF || raw_cnt++ > 512)
908 while (line_len < 80 && cooked_cnt < sizeof header)
910 header[cooked_cnt++] = ' ';
917 header[cooked_cnt++] = c;
922 memset (trans, 0, 256);
/* Table entry I is found 200 bytes into the header, after the splash
   strings. */
923 for (i = 64; i < 256; i++)
925 unsigned char c = header[i + 200];
927 trans[c] = portable_to_local[i];
930 for (i = 0; i < 8; i++)
931 if (trans[header[i + 456]] != "SPSSPORT"[i])
/* Casereader class for portable files, passed to
   casereader_create_sequential() by pfm_open_reader.  The members
   visible here are the read and destroy callbacks. */
937 static const struct casereader_class por_file_casereader_class =
939 por_file_casereader_read,
940 por_file_casereader_destroy,