1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/short-names.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/pool.h>
44 #include <libpspp/str.h>
49 #define _(msgid) gettext (msgid)
50 #define N_(msgid) (msgid)
52 /* portable_to_local[PORTABLE] translates the given portable
53 character into the local character set. */
54 static const char portable_to_local[256] =
57 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
58 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
62 /* Portable file reader. */
65 struct pool *pool; /* All the portable file state. */
67 jmp_buf bail_out; /* longjmp() target for error handling. */
69 struct file_handle *fh; /* File handle. */
70 struct fh_lock *lock; /* Read lock for file. */
71 FILE *file; /* File stream. */
72 int line_length; /* Number of characters so far on this line. */
73 char cc; /* Current character. */
74 char *trans; /* 256-byte character set translation table. */
75 int var_cnt; /* Number of variables. */
76 int weight_index; /* 0-based index of weight variable, or -1. */
77 int *widths; /* Variable widths, 0 for numeric. */
78 size_t value_cnt; /* Number of `value's per case. */
79 bool ok; /* Set false on I/O error. */
82 static const struct casereader_class por_file_casereader_class;
85 error (struct pfm_reader *r, const char *msg,...)
89 /* Displays MSG as an error message and aborts reading the
90 portable file via longjmp(). */
92 error (struct pfm_reader *r, const char *msg, ...)
98 ds_init_empty (&text);
99 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
100 fh_get_file_name (r->fh), ftell (r->file));
101 va_start (args, msg);
102 ds_put_vformat (&text, msg, args);
105 m.category = MSG_GENERAL;
106 m.severity = MSG_ERROR;
107 m.where.file_name = NULL;
108 m.where.line_number = 0;
109 m.text = ds_cstr (&text);
115 longjmp (r->bail_out, 1);
118 /* Displays MSG as a warning for the current position in
119 portable file reader R. */
121 warning (struct pfm_reader *r, const char *msg, ...)
127 ds_init_empty (&text);
128 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
129 fh_get_file_name (r->fh), ftell (r->file));
130 va_start (args, msg);
131 ds_put_vformat (&text, msg, args);
134 m.category = MSG_GENERAL;
135 m.severity = MSG_WARNING;
136 m.where.file_name = NULL;
137 m.where.line_number = 0;
138 m.text = ds_cstr (&text);
143 /* Close and destroy R.
144 Returns false if an error was detected on R, true otherwise. */
146 close_reader (struct pfm_reader *r)
154 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
156 msg (ME, _("Error closing portable file \"%s\": %s."),
157 fh_get_file_name (r->fh), strerror (errno));
167 pool_destroy (r->pool);
/* Destroy callback registered in por_file_casereader_class.
   Closes the portable file reader passed as R_ once we are done
   with it.  If close_reader() reports that an error was detected,
   READER is forced into the error state. */
static void
por_file_casereader_destroy (struct casereader *reader, void *r_)
{
  struct pfm_reader *pfm = r_;
  bool closed_cleanly = close_reader (pfm);

  if (closed_cleanly)
    return;

  casereader_force_error (reader);
}
181 /* Reads the next character from R's file into R->cc. */
183 advance (struct pfm_reader *r)
187 /* Read the next character from the file.
188 Ignore carriage returns entirely.
189 Mostly ignore new-lines, but if a new-line occurs before the
190 line has reached 80 bytes in length, then treat the
191 "missing" bytes as spaces. */
194 while ((c = getc (r->file)) == '\r')
199 if (r->line_length < 80)
202 ungetc ('\n', r->file);
208 error (r, _("unexpected end of file"));
210 if (r->trans != NULL)
216 /* Skip a single character if present, and return whether it was skipped. */
219 match (struct pfm_reader *r, int c)
230 static void read_header (struct pfm_reader *);
231 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
232 static void read_variables (struct pfm_reader *, struct dictionary *);
233 static void read_value_label (struct pfm_reader *, struct dictionary *);
234 static void read_documents (struct pfm_reader *, struct dictionary *);
236 /* Reads the dictionary from the portable file with handle FH and stores it
237 in *DICT. This dictionary may be modified in order to
238 rename, reorder, and delete variables, etc. */
240 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
241 struct pfm_read_info *info)
243 struct pool *volatile pool = NULL;
244 struct pfm_reader *volatile r = NULL;
246 *dict = dict_create ();
248 /* Create and initialize reader. */
249 pool = pool_create ();
250 r = pool_alloc (pool, sizeof *r);
256 r->weight_index = -1;
262 if (setjmp (r->bail_out))
266 /* TRANSLATORS: this fragment will be interpolated into
267 messages in fh_lock() that identify types of files. */
268 r->lock = fh_lock (fh, FH_REF_FILE, N_("portable file"), FH_ACC_READ, false);
273 r->file = fn_open (fh_get_file_name (r->fh), "rb");
276 msg (ME, _("An error occurred while opening \"%s\" for reading "
277 "as a portable file: %s."),
278 fh_get_file_name (r->fh), strerror (errno));
282 /* Read header, version, date info, product id, variables. */
284 read_version_data (r, info);
285 read_variables (r, *dict);
287 /* Read value labels. */
288 while (match (r, 'D'))
289 read_value_label (r, *dict);
291 /* Read documents. */
293 read_documents (r, *dict);
295 /* Check that we've made it to the data. */
297 error (r, _("Data record expected."));
299 r->value_cnt = dict_get_next_value_idx (*dict);
300 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
301 &por_file_casereader_class, r);
305 dict_destroy (*dict);
/* Returns the value of base-30 digit C (one of '0'...'9' or
   'A'...'T'), or -1 if C is not a base-30 digit.

   The null character is rejected explicitly.  strchr() treats the
   terminating null of the digit string as part of the string, so
   without this check a null byte -- which is what untranslatable
   input characters are mapped to -- would be reported as a digit
   with value 30. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
320 /* Read a floating point value and return its value. */
322 read_float (struct pfm_reader *r)
326 bool got_dot = false; /* Seen a decimal point? */
327 bool got_digit = false; /* Seen any digits? */
328 bool negative = false; /* Number is negative? */
330 /* Skip leading spaces. */
331 while (match (r, ' '))
334 /* `*' indicates system-missing. */
337 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
341 negative = match (r, '-');
344 int digit = base_30_value (r->cc);
349 /* Make sure that multiplication by 30 will not overflow. */
350 if (num > DBL_MAX * (1. / 30.))
351 /* The value of the digit doesn't matter, since we have already
352 gotten as many digits as can be represented in a `double'.
353 This doesn't necessarily mean the result will overflow.
354 The exponent may reduce it to within range.
356 We just need to record that there was another
357 digit so that we can multiply by 30 later. */
360 num = (num * 30.0) + digit;
362 /* Keep track of the number of digits after the decimal point.
363 If we just divided by 30 here, we would lose precision. */
367 else if (!got_dot && r->cc == '.')
368 /* Record that we have found the decimal point. */
371 /* Any other character terminates the number. */
377 /* Check that we had some digits. */
379 error (r, _("Number expected."));
381 /* Get exponent if any. */
382 if (r->cc == '+' || r->cc == '-')
385 bool negative_exponent = r->cc == '-';
388 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
390 if (exp > LONG_MAX / 30)
395 exp = exp * 30 + digit;
398 /* We don't check whether there were actually any digits, but we
400 if (negative_exponent)
405 /* Numbers must end with `/'. */
407 error (r, _("Missing numeric terminator."));
409 /* Multiply `num' by 30 to the `exponent' power, checking for
412 num *= pow (30.0, (double) exponent);
413 else if (exponent > 0)
415 if (num > DBL_MAX * pow (30.0, (double) -exponent))
418 num *= pow (30.0, (double) exponent);
421 return negative ? -num : num;
/* Reads a number from R with read_float() and returns it as an
   int.  Reports "Invalid integer." through error(), which aborts
   reading via longjmp(), if the number has a fractional part or
   does not lie strictly between INT_MIN and INT_MAX. */
static int
read_int (struct pfm_reader *r)
{
  const double value = read_float (r);
  const bool in_range = value > INT_MIN && value < INT_MAX;

  /* A NaN fails both tests, so it is rejected as well. */
  if (!in_range || floor (value) != value)
    error (r, _("Invalid integer."));

  return value;
}
434 /* Reads a string into BUF, which must have room for 256 characters. */
437 read_string (struct pfm_reader *r, char *buf)
439 int n = read_int (r);
440 if (n < 0 || n > 255)
441 error (r, _("Bad string length %d."), n);
451 /* Reads a string and returns a copy of it allocated from R's
454 read_pool_string (struct pfm_reader *r)
457 read_string (r, string);
458 return pool_strdup (r->pool, string);
461 /* Reads the 464-byte file header. */
463 read_header (struct pfm_reader *r)
468 /* Read and ignore vanity splash strings. */
469 for (i = 0; i < 200; i++)
472 /* Skip the first 64 characters of the translation table.
473 We don't care about these. They are probably all set to
474 '0', marking them as untranslatable, and that would screw
475 up our actual translation of the real '0'. */
476 for (i = 0; i < 64; i++)
479 /* Read the rest of the translation table. */
480 trans = pool_malloc (r->pool, 256);
481 memset (trans, 0, 256);
490 trans[c] = portable_to_local[i];
493 /* Set up the translation table, then read the first
494 translated character. */
498 /* Skip and verify signature. */
499 for (i = 0; i < 8; i++)
500 if (!match (r, "SPSSPORT"[i]))
502 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
503 longjmp (r->bail_out, 1);
507 /* Reads the version and date info record, as well as product and
508 subproduct identification records if present. */
510 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
512 static const char empty_string[] = "";
514 const char *product, *author, *subproduct;
519 error (r, _("Unrecognized version code `%c'."), r->cc);
520 date = read_pool_string (r);
521 time = read_pool_string (r);
522 product = match (r, '1') ? read_pool_string (r) : empty_string;
523 author = match (r, '2') ? read_pool_string (r) : empty_string;
524 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
527 if (strlen (date) != 8)
528 error (r, _("Bad date string length %zu."), strlen (date));
529 if (strlen (time) != 6)
530 error (r, _("Bad time string length %zu."), strlen (time));
532 /* Save file info. */
536 for (i = 0; i < 8; i++)
538 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
539 info->creation_date[map[i]] = date[i];
541 info->creation_date[2] = info->creation_date[5] = ' ';
542 info->creation_date[10] = 0;
545 for (i = 0; i < 6; i++)
547 static const int map[] = {0, 1, 3, 4, 6, 7};
548 info->creation_time[map[i]] = time[i];
550 info->creation_time[2] = info->creation_time[5] = ' ';
551 info->creation_time[8] = 0;
554 str_copy_trunc (info->product, sizeof info->product, product);
555 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
559 /* Translates the format specification read from portable file R as
560 the three integers PORTABLE_FORMAT into a normal format specifier, which
561 is returned, checking that the format is appropriate for variable V. */
562 static struct fmt_spec
563 convert_format (struct pfm_reader *r, const int portable_format[3],
564 struct variable *v, bool *report_error)
566 struct fmt_spec format;
569 if (!fmt_from_io (portable_format[0], &format.type))
572 warning (r, _("%s: Bad format specifier byte (%d). Variable "
573 "will be assigned a default format."),
574 var_get_name (v), portable_format[0]);
578 format.w = portable_format[1];
579 format.d = portable_format[2];
582 ok = (fmt_check_output (&format)
583 && fmt_check_width_compat (&format, var_get_width (v)));
590 char fmt_string[FMT_STRING_LEN_MAX + 1];
591 fmt_to_string (&format, fmt_string);
592 if (var_is_numeric (v))
593 warning (r, _("Numeric variable %s has invalid format "
595 var_get_name (v), fmt_string);
597 warning (r, _("String variable %s with width %d has "
598 "invalid format specifier %s."),
599 var_get_name (v), var_get_width (v), fmt_string);
607 *report_error = false;
608 return fmt_default_for_width (var_get_width (v));
611 static union value parse_value (struct pfm_reader *, struct variable *);
613 /* Read information on all the variables. */
615 read_variables (struct pfm_reader *r, struct dictionary *dict)
617 char *weight_name = NULL;
621 error (r, _("Expected variable count record."));
623 r->var_cnt = read_int (r);
625 error (r, _("Invalid number of variables %d."), r->var_cnt);
626 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
628 /* Purpose of this value is unknown. It is typically 161. */
633 weight_name = read_pool_string (r);
634 if (strlen (weight_name) > SHORT_NAME_LEN)
635 error (r, _("Weight variable name (%s) truncated."), weight_name);
638 for (i = 0; i < r->var_cnt; i++)
644 struct missing_values miss;
645 struct fmt_spec print, write;
646 bool report_error = true;
650 error (r, _("Expected variable record."));
652 width = read_int (r);
654 error (r, _("Invalid variable width %d."), width);
655 r->widths[i] = width;
657 read_string (r, name);
658 for (j = 0; j < 6; j++)
659 fmt[j] = read_int (r);
661 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
662 error (r, _("Invalid variable name `%s' in position %d."), name, i);
663 str_uppercase (name);
665 if (width < 0 || width > 255)
666 error (r, _("Bad width %d for variable %s."), width, name);
668 v = dict_create_var (dict, name, width);
672 for (i = 1; i < 100000; i++)
674 char try_name[VAR_NAME_LEN + 1];
675 sprintf (try_name, "%.*s_%d", VAR_NAME_LEN - 6, name, i);
676 v = dict_create_var (dict, try_name, width);
681 error (r, _("Duplicate variable name %s in position %d."), name, i);
682 warning (r, _("Duplicate variable name %s in position %d renamed "
683 "to %s."), name, i, var_get_name (v));
686 print = convert_format (r, &fmt[0], v, &report_error);
687 write = convert_format (r, &fmt[3], v, &report_error);
688 var_set_print_format (v, &print);
689 var_set_write_format (v, &write);
691 /* Range missing values. */
692 mv_init (&miss, var_get_width (v));
695 double x = read_float (r);
696 double y = read_float (r);
697 mv_add_range (&miss, x, y);
699 else if (match (r, 'A'))
700 mv_add_range (&miss, read_float (r), HIGHEST);
701 else if (match (r, '9'))
702 mv_add_range (&miss, LOWEST, read_float (r));
704 /* Single missing values. */
705 while (match (r, '8'))
707 union value value = parse_value (r, v);
708 mv_add_value (&miss, &value);
711 var_set_missing_values (v, &miss);
716 read_string (r, label);
717 var_set_label (v, label);
721 if (weight_name != NULL)
723 struct variable *weight_var = dict_lookup_var (dict, weight_name);
724 if (weight_var == NULL)
725 error (r, _("Weighting variable %s not present in dictionary."),
728 dict_set_weight (dict, weight_var);
732 /* Parse a value for variable VV into value V. */
734 parse_value (struct pfm_reader *r, struct variable *vv)
738 if (var_is_alpha (vv))
741 read_string (r, string);
742 buf_copy_str_rpad (v.s, 8, string);
745 v.f = read_float (r);
750 /* Parses a value label record from R, assigning each label to the listed variables in DICT. */
752 read_value_label (struct pfm_reader *r, struct dictionary *dict)
764 v = pool_nalloc (r->pool, nv, sizeof *v);
765 for (i = 0; i < nv; i++)
768 read_string (r, name);
770 v[i] = dict_lookup_var (dict, name);
772 error (r, _("Unknown variable %s while parsing value labels."), name);
774 if (var_get_type (v[0]) != var_get_type (v[i]))
775 error (r, _("Cannot assign value labels to %s and %s, which "
776 "have different variable types."),
777 var_get_name (v[0]), var_get_name (v[i]));
780 n_labels = read_int (r);
781 for (i = 0; i < n_labels; i++)
787 val = parse_value (r, v[0]);
788 read_string (r, label);
790 /* Assign the value label to each variable. */
791 for (j = 0; j < nv; j++)
793 struct variable *var = v[j];
795 if (!var_is_long_string (var))
796 var_replace_value_label (var, &val, label);
/* Reads a set of documents from portable file R into DICT: first
   the number of document lines, then each line in turn, which is
   appended to DICT with dict_add_document_line(). */
static void
read_documents (struct pfm_reader *r, struct dictionary *dict)
{
  int n_lines = read_int (r);
  int idx;

  for (idx = 0; idx < n_lines; idx++)
    {
      char text[256];           /* Size that read_string() requires. */

      read_string (r, text);
      dict_add_document_line (dict, text);
    }
}
817 /* Reads one case from portable file R into C. */
819 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
821 struct pfm_reader *r = r_;
825 case_create (c, casereader_get_value_cnt (reader));
826 setjmp (r->bail_out);
829 casereader_force_error (reader);
834 /* Check for end of file. */
842 for (i = 0; i < r->var_cnt; i++)
844 int width = r->widths[i];
848 case_data_rw_idx (c, idx)->f = read_float (r);
854 read_string (r, string);
855 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
856 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
863 /* Returns true if FILE is an SPSS portable file, false otherwise. */
866 pfm_detect (FILE *file)
868 unsigned char header[464];
870 int cooked_cnt, raw_cnt;
873 cooked_cnt = raw_cnt = 0;
874 while (cooked_cnt < sizeof header)
877 if (c == EOF || raw_cnt++ > 512)
879 else if (c != '\n' && c != '\r')
880 header[cooked_cnt++] = c;
883 memset (trans, 0, 256);
884 for (i = 64; i < 256; i++)
886 unsigned char c = header[i + 200];
888 trans[c] = portable_to_local[i];
891 for (i = 0; i < 8; i++)
892 if (trans[header[i + 456]] != "SPSSPORT"[i])
898 static const struct casereader_class por_file_casereader_class =
900 por_file_casereader_read,
901 por_file_casereader_destroy,