1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <libpspp/alloc.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/pool.h>
44 #include <libpspp/str.h>
47 #define _(msgid) gettext (msgid)
49 /* portable_to_local[PORTABLE] translates the given portable
50 character into the local character set. */
51 static const char portable_to_local[256] =
54 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
55 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
59 /* Portable file reader. */
62 struct pool *pool; /* All the portable file state. */
64 jmp_buf bail_out; /* longjmp() target for error handling. */
66 struct file_handle *fh; /* File handle. */
67 FILE *file; /* File stream. */
68 int line_length; /* Number of characters so far on this line. */
69 char cc; /* Current character. */
70 char *trans; /* 256-byte character set translation table. */
71 int var_cnt; /* Number of variables. */
72 int weight_index; /* 0-based index of weight variable, or -1. */
73 int *widths; /* Variable widths, 0 for numeric. */
74 size_t value_cnt; /* Number of `value's per case. */
75 bool ok; /* Set false on I/O error. */
78 static struct casereader_class por_file_casereader_class;
81 error (struct pfm_reader *r, const char *msg,...)
85 /* Displays MSG as an error message and aborts reading the
86 portable file via longjmp(). */
88 error (struct pfm_reader *r, const char *msg, ...)
94 ds_init_empty (&text);
95 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
96 fh_get_file_name (r->fh), ftell (r->file));
98 ds_put_vformat (&text, msg, args);
101 m.category = MSG_GENERAL;
102 m.severity = MSG_ERROR;
103 m.where.file_name = NULL;
104 m.where.line_number = 0;
105 m.text = ds_cstr (&text);
111 longjmp (r->bail_out, 1);
114 /* Displays MSG as a warning for the current position in
115 portable file reader R. */
117 warning (struct pfm_reader *r, const char *msg, ...)
123 ds_init_empty (&text);
124 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
125 fh_get_file_name (r->fh), ftell (r->file));
126 va_start (args, msg);
127 ds_put_vformat (&text, msg, args);
130 m.category = MSG_GENERAL;
131 m.severity = MSG_WARNING;
132 m.where.file_name = NULL;
133 m.where.line_number = 0;
134 m.text = ds_cstr (&text);
139 /* Close and destroy R.
140 Returns false if an error was detected on R, true otherwise. */
142 close_reader (struct pfm_reader *r)
150 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
152 msg (ME, _("Error closing portable file \"%s\": %s."),
153 fh_get_file_name (r->fh), strerror (errno));
160 fh_close (r->fh, "portable file", "rs");
163 pool_destroy (r->pool);
168 /* Closes portable file reader R, after we're done with it. */
170 por_file_casereader_destroy (struct casereader *reader, void *r_)
172 struct pfm_reader *r = r_;
173 if (!close_reader (r))
174 casereader_force_error (reader);
177 /* Read a single character into R's current character, cc. */
179 advance (struct pfm_reader *r)
183 /* Read the next character from the file.
184 Ignore carriage returns entirely.
185 Mostly ignore new-lines, but if a new-line occurs before the
186 line has reached 80 bytes in length, then treat the
187 "missing" bytes as spaces. */
190 while ((c = getc (r->file)) == '\r')
195 if (r->line_length < 80)
198 ungetc ('\n', r->file);
204 error (r, _("unexpected end of file"));
206 if (r->trans != NULL)
212 /* Skip a single character if present, and return whether it was
215 match (struct pfm_reader *r, int c)
226 static void read_header (struct pfm_reader *);
227 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
228 static void read_variables (struct pfm_reader *, struct dictionary *);
229 static void read_value_label (struct pfm_reader *, struct dictionary *);
230 static void read_documents (struct pfm_reader *, struct dictionary *);
232 /* Reads the dictionary from the file with handle FH, and returns it in a
233 dictionary structure stored in *DICT. This dictionary may be modified in order to
234 rename, reorder, and delete variables, etc. */
236 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
237 struct pfm_read_info *info)
239 struct pool *volatile pool = NULL;
240 struct pfm_reader *volatile r = NULL;
242 *dict = dict_create ();
243 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
246 /* Create and initialize reader. */
247 pool = pool_create ();
248 r = pool_alloc (pool, sizeof *r);
251 r->file = fn_open (fh_get_file_name (r->fh), "rb");
253 r->weight_index = -1;
260 if (setjmp (r->bail_out))
263 /* Check that file open succeeded. */
266 msg (ME, _("An error occurred while opening \"%s\" for reading "
267 "as a portable file: %s."),
268 fh_get_file_name (r->fh), strerror (errno));
272 /* Read header, version, date info, product id, variables. */
274 read_version_data (r, info);
275 read_variables (r, *dict);
277 /* Read value labels. */
278 while (match (r, 'D'))
279 read_value_label (r, *dict);
281 /* Read documents. */
283 read_documents (r, *dict);
285 /* Check that we've made it to the data. */
287 error (r, _("Data record expected."));
289 r->value_cnt = dict_get_next_value_idx (*dict);
290 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
291 &por_file_casereader_class, r);
295 dict_destroy (*dict);
/* Returns the value of base-30 digit C, in the range 0 through 29,
   or -1 if C is not a base-30 digit.

   The digits are '0'...'9' followed by 'A'...'T'.  The lookup is
   case-sensitive, so lowercase letters are not digits. */
static int
base_30_value (unsigned char c)
{
  static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
  const char *p;

  /* strchr() considers the terminating null character to be part
     of the string, so without this check a null byte would be
     reported as a "digit" with value 30. */
  if (c == '\0')
    return -1;

  p = strchr (base_30_digits, c);
  return p != NULL ? (int) (p - base_30_digits) : -1;
}
310 /* Read a floating point value and return its value. */
312 read_float (struct pfm_reader *r)
316 bool got_dot = false; /* Seen a decimal point? */
317 bool got_digit = false; /* Seen any digits? */
318 bool negative = false; /* Number is negative? */
320 /* Skip leading spaces. */
321 while (match (r, ' '))
324 /* `*' indicates system-missing. */
327 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
331 negative = match (r, '-');
334 int digit = base_30_value (r->cc);
339 /* Make sure that multiplication by 30 will not overflow. */
340 if (num > DBL_MAX * (1. / 30.))
341 /* The value of the digit doesn't matter, since we have already
342 gotten as many digits as can be represented in a `double'.
343 This doesn't necessarily mean the result will overflow.
344 The exponent may reduce it to within range.
346 We just need to record that there was another
347 digit so that we can multiply by 30 later. */
350 num = (num * 30.0) + digit;
352 /* Keep track of the number of digits after the decimal point.
353 If we just divided by 30 here, we would lose precision. */
357 else if (!got_dot && r->cc == '.')
358 /* Record that we have found the decimal point. */
361 /* Any other character terminates the number. */
367 /* Check that we had some digits. */
369 error (r, _("Number expected."));
371 /* Get exponent if any. */
372 if (r->cc == '+' || r->cc == '-')
375 bool negative_exponent = r->cc == '-';
378 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
380 if (exp > LONG_MAX / 30)
385 exp = exp * 30 + digit;
388 /* We don't check whether there were actually any digits, but we
390 if (negative_exponent)
395 /* Numbers must end with `/'. */
397 error (r, _("Missing numeric terminator."));
399 /* Multiply `num' by 30 to the `exponent' power, checking for
402 num *= pow (30.0, (double) exponent);
403 else if (exponent > 0)
405 if (num > DBL_MAX * pow (30.0, (double) -exponent))
408 num *= pow (30.0, (double) exponent);
411 return negative ? -num : num;
414 /* Read an integer and return its value. */
416 read_int (struct pfm_reader *r)
418 double f = read_float (r);
419 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
420 error (r, _("Invalid integer."));
424 /* Reads a string into BUF, which must have room for 256
427 read_string (struct pfm_reader *r, char *buf)
429 int n = read_int (r);
430 if (n < 0 || n > 255)
431 error (r, _("Bad string length %d."), n);
441 /* Reads a string and returns a copy of it allocated from R's
444 read_pool_string (struct pfm_reader *r)
447 read_string (r, string);
448 return pool_strdup (r->pool, string);
451 /* Reads the 464-byte file header. */
453 read_header (struct pfm_reader *r)
458 /* Read and ignore vanity splash strings. */
459 for (i = 0; i < 200; i++)
462 /* Skip the first 64 characters of the translation table.
463 We don't care about these. They are probably all set to
464 '0', marking them as untranslatable, and that would screw
465 up our actual translation of the real '0'. */
466 for (i = 0; i < 64; i++)
469 /* Read the rest of the translation table. */
470 trans = pool_malloc (r->pool, 256);
471 memset (trans, 0, 256);
480 trans[c] = portable_to_local[i];
483 /* Set up the translation table, then read the first
484 translated character. */
488 /* Skip and verify signature. */
489 for (i = 0; i < 8; i++)
490 if (!match (r, "SPSSPORT"[i]))
492 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
493 longjmp (r->bail_out, 1);
497 /* Reads the version and date info record, as well as product and
498 subproduct identification records if present. */
500 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
502 static char empty_string[] = "";
503 char *date, *time, *product, *author, *subproduct;
508 error (r, _("Unrecognized version code `%c'."), r->cc);
509 date = read_pool_string (r);
510 time = read_pool_string (r);
511 product = match (r, '1') ? read_pool_string (r) : empty_string;
512 author = match (r, '2') ? read_pool_string (r) : empty_string;
513 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
516 if (strlen (date) != 8)
517 error (r, _("Bad date string length %d."), (int) strlen (date));
518 if (strlen (time) != 6)
519 error (r, _("Bad time string length %d."), (int) strlen (time));
521 /* Save file info. */
525 for (i = 0; i < 8; i++)
527 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
528 info->creation_date[map[i]] = date[i];
530 info->creation_date[2] = info->creation_date[5] = ' ';
531 info->creation_date[10] = 0;
534 for (i = 0; i < 6; i++)
536 static const int map[] = {0, 1, 3, 4, 6, 7};
537 info->creation_time[map[i]] = time[i];
539 info->creation_time[2] = info->creation_time[5] = ' ';
540 info->creation_time[8] = 0;
543 str_copy_trunc (info->product, sizeof info->product, product);
544 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
548 /* Translates a format specification read from portable file R as
549 the three integers PORTABLE_FORMAT into a normal format specifier FORMAT,
550 checking that the format is appropriate for variable V. */
551 static struct fmt_spec
552 convert_format (struct pfm_reader *r, const int portable_format[3],
553 struct variable *v, bool *report_error)
555 struct fmt_spec format;
558 if (!fmt_from_io (portable_format[0], &format.type))
561 warning (r, _("%s: Bad format specifier byte (%d). Variable "
562 "will be assigned a default format."),
563 var_get_name (v), portable_format[0]);
567 format.w = portable_format[1];
568 format.d = portable_format[2];
571 ok = (fmt_check_output (&format)
572 && fmt_check_width_compat (&format, var_get_width (v)));
579 char fmt_string[FMT_STRING_LEN_MAX + 1];
580 fmt_to_string (&format, fmt_string);
581 if (var_is_numeric (v))
582 warning (r, _("Numeric variable %s has invalid format "
584 var_get_name (v), fmt_string);
586 warning (r, _("String variable %s with width %d has "
587 "invalid format specifier %s."),
588 var_get_name (v), var_get_width (v), fmt_string);
596 *report_error = false;
597 return fmt_default_for_width (var_get_width (v));
600 static union value parse_value (struct pfm_reader *, struct variable *);
602 /* Read information on all the variables. */
604 read_variables (struct pfm_reader *r, struct dictionary *dict)
606 char *weight_name = NULL;
610 error (r, _("Expected variable count record."));
612 r->var_cnt = read_int (r);
614 error (r, _("Invalid number of variables %d."), r->var_cnt);
615 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
617 /* Purpose of this value is unknown. It is typically 161. */
622 weight_name = read_pool_string (r);
623 if (strlen (weight_name) > SHORT_NAME_LEN)
624 error (r, _("Weight variable name (%s) truncated."), weight_name);
627 for (i = 0; i < r->var_cnt; i++)
633 struct missing_values miss;
634 struct fmt_spec print, write;
635 bool report_error = true;
639 error (r, _("Expected variable record."));
641 width = read_int (r);
643 error (r, _("Invalid variable width %d."), width);
644 r->widths[i] = width;
646 read_string (r, name);
647 for (j = 0; j < 6; j++)
648 fmt[j] = read_int (r);
650 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
651 error (r, _("Invalid variable name `%s' in position %d."), name, i);
652 str_uppercase (name);
654 if (width < 0 || width > 255)
655 error (r, _("Bad width %d for variable %s."), width, name);
657 v = dict_create_var (dict, name, width);
661 for (i = 1; i < 100000; i++)
663 char try_name[LONG_NAME_LEN + 1];
664 sprintf (try_name, "%.*s_%d", LONG_NAME_LEN - 6, name, i);
665 v = dict_create_var (dict, try_name, width);
670 error (r, _("Duplicate variable name %s in position %d."), name, i);
671 warning (r, _("Duplicate variable name %s in position %d renamed "
672 "to %s."), name, i, var_get_name (v));
675 print = convert_format (r, &fmt[0], v, &report_error);
676 write = convert_format (r, &fmt[3], v, &report_error);
677 var_set_print_format (v, &print);
678 var_set_write_format (v, &write);
680 /* Range missing values. */
681 mv_init (&miss, var_get_width (v));
684 double x = read_float (r);
685 double y = read_float (r);
686 mv_add_num_range (&miss, x, y);
688 else if (match (r, 'A'))
689 mv_add_num_range (&miss, read_float (r), HIGHEST);
690 else if (match (r, '9'))
691 mv_add_num_range (&miss, LOWEST, read_float (r));
693 /* Single missing values. */
694 while (match (r, '8'))
696 union value value = parse_value (r, v);
697 mv_add_value (&miss, &value);
700 var_set_missing_values (v, &miss);
705 read_string (r, label);
706 var_set_label (v, label);
710 if (weight_name != NULL)
712 struct variable *weight_var = dict_lookup_var (dict, weight_name);
713 if (weight_var == NULL)
714 error (r, _("Weighting variable %s not present in dictionary."),
717 dict_set_weight (dict, weight_var);
721 /* Parse a value for variable VV into value V. */
723 parse_value (struct pfm_reader *r, struct variable *vv)
727 if (var_is_alpha (vv))
730 read_string (r, string);
731 buf_copy_str_rpad (v.s, 8, string);
734 v.f = read_float (r);
739 /* Parse a value label record and return success. */
741 read_value_label (struct pfm_reader *r, struct dictionary *dict)
753 v = pool_nalloc (r->pool, nv, sizeof *v);
754 for (i = 0; i < nv; i++)
757 read_string (r, name);
759 v[i] = dict_lookup_var (dict, name);
761 error (r, _("Unknown variable %s while parsing value labels."), name);
763 if (var_get_type (v[0]) != var_get_type (v[i]))
764 error (r, _("Cannot assign value labels to %s and %s, which "
765 "have different variable types."),
766 var_get_name (v[0]), var_get_name (v[i]));
769 n_labels = read_int (r);
770 for (i = 0; i < n_labels; i++)
776 val = parse_value (r, v[0]);
777 read_string (r, label);
779 /* Assign the value label to each variable. */
780 for (j = 0; j < nv; j++)
782 struct variable *var = v[j];
784 if (!var_is_long_string (var))
785 var_replace_value_label (var, &val, label);
790 /* Reads a set of documents from portable file R into DICT. */
792 read_documents (struct pfm_reader *r, struct dictionary *dict)
797 line_cnt = read_int (r);
798 for (i = 0; i < line_cnt; i++)
801 read_string (r, line);
802 dict_add_document_line (dict, line);
806 /* Reads one case from portable file R into C. */
808 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
810 struct pfm_reader *r = r_;
814 case_create (c, casereader_get_value_cnt (reader));
815 setjmp (r->bail_out);
818 casereader_force_error (reader);
823 /* Check for end of file. */
831 for (i = 0; i < r->var_cnt; i++)
833 int width = r->widths[i];
837 case_data_rw_idx (c, idx)->f = read_float (r);
843 read_string (r, string);
844 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
845 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
852 /* Returns true if FILE is an SPSS portable file,
855 pfm_detect (FILE *file)
857 unsigned char header[464];
859 int cooked_cnt, raw_cnt;
862 cooked_cnt = raw_cnt = 0;
863 while (cooked_cnt < sizeof header)
866 if (c == EOF || raw_cnt++ > 512)
868 else if (c != '\n' && c != '\r')
869 header[cooked_cnt++] = c;
872 memset (trans, 0, 256);
873 for (i = 64; i < 256; i++)
875 unsigned char c = header[i + 200];
877 trans[c] = portable_to_local[i];
880 for (i = 0; i < 8; i++)
881 if (trans[header[i + 456]] != "SPSSPORT"[i])
887 static struct casereader_class por_file_casereader_class =
889 por_file_casereader_read,
890 por_file_casereader_destroy,