1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <libpspp/alloc.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/magic.h>
42 #include <libpspp/message.h>
43 #include <libpspp/misc.h>
44 #include <libpspp/pool.h>
45 #include <libpspp/str.h>
48 #define _(msgid) gettext (msgid)
50 /* portable_to_local[PORTABLE] translates the given portable
51 character into the local character set. */
52 static const char portable_to_local[256] =
55 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
56 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
60 /* Portable file reader. */
63 struct pool *pool; /* All the portable file state. */
65 jmp_buf bail_out; /* longjmp() target for error handling. */
67 struct file_handle *fh; /* File handle. */
68 FILE *file; /* File stream. */
69 int line_length; /* Number of characters so far on this line. */
70 char cc; /* Current character. */
71 char *trans; /* 256-byte character set translation table. */
72 int var_cnt; /* Number of variables. */
73 int weight_index; /* 0-based index of weight variable, or -1. */
74 int *widths; /* Variable widths, 0 for numeric. */
75 size_t value_cnt; /* Number of `value's per case. */
76 bool ok; /* Set false on I/O error. */
79 static struct casereader_class por_file_casereader_class;
82 error (struct pfm_reader *r, const char *msg,...)
86 /* Displays MSG as an error message and aborts reading the
87 portable file via longjmp(). */
89 error (struct pfm_reader *r, const char *msg, ...)
95 ds_init_empty (&text);
96 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
97 fh_get_file_name (r->fh), ftell (r->file));
99 ds_put_vformat (&text, msg, args);
102 m.category = MSG_GENERAL;
103 m.severity = MSG_ERROR;
104 m.where.file_name = NULL;
105 m.where.line_number = 0;
106 m.text = ds_cstr (&text);
112 longjmp (r->bail_out, 1);
115 /* Displays MSG as a warning for the current position in
116 portable file reader R. */
118 warning (struct pfm_reader *r, const char *msg, ...)
124 ds_init_empty (&text);
125 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
126 fh_get_file_name (r->fh), ftell (r->file));
127 va_start (args, msg);
128 ds_put_vformat (&text, msg, args);
131 m.category = MSG_GENERAL;
132 m.severity = MSG_WARNING;
133 m.where.file_name = NULL;
134 m.where.line_number = 0;
135 m.text = ds_cstr (&text);
140 /* Close and destroy R.
141 Returns false if an error was detected on R, true otherwise. */
143 close_reader (struct pfm_reader *r)
151 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
153 msg (ME, _("Error closing portable file \"%s\": %s."),
154 fh_get_file_name (r->fh), strerror (errno));
161 fh_close (r->fh, "portable file", "rs");
164 pool_destroy (r->pool);
169 /* Closes portable file reader R, after we're done with it. */
171 por_file_casereader_destroy (struct casereader *reader, void *r_)
173 struct pfm_reader *r = r_;
174 if (!close_reader (r))
175 casereader_force_error (reader);
178 /* Read a single character from R's file into R->cc. */
180 advance (struct pfm_reader *r)
184 /* Read the next character from the file.
185 Ignore carriage returns entirely.
186 Mostly ignore new-lines, but if a new-line occurs before the
187 line has reached 80 bytes in length, then treat the
188 "missing" bytes as spaces. */
191 while ((c = getc (r->file)) == '\r')
196 if (r->line_length < 80)
199 ungetc ('\n', r->file);
205 error (r, _("unexpected end of file"));
207 if (r->trans != NULL)
213 /* Skip a single character if present, and return whether it was
216 match (struct pfm_reader *r, int c)
227 static void read_header (struct pfm_reader *);
228 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
229 static void read_variables (struct pfm_reader *, struct dictionary *);
230 static void read_value_label (struct pfm_reader *, struct dictionary *);
231 static void read_documents (struct pfm_reader *, struct dictionary *);
233 /* Reads the dictionary from the file with handle FH and stores it in
234 *DICT. This dictionary may be modified in order to
235 rename, reorder, and delete variables, etc. */
237 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
238 struct pfm_read_info *info)
240 struct pool *volatile pool = NULL;
241 struct pfm_reader *volatile r = NULL;
243 *dict = dict_create ();
244 if (!fh_open (fh, FH_REF_FILE, "portable file", "rs"))
247 /* Create and initialize reader. */
248 pool = pool_create ();
249 r = pool_alloc (pool, sizeof *r);
252 r->file = fn_open (fh_get_file_name (r->fh), "rb");
254 r->weight_index = -1;
261 if (setjmp (r->bail_out))
264 /* Check that file open succeeded. */
267 msg (ME, _("An error occurred while opening \"%s\" for reading "
268 "as a portable file: %s."),
269 fh_get_file_name (r->fh), strerror (errno));
273 /* Read header, version, date info, product id, variables. */
275 read_version_data (r, info);
276 read_variables (r, *dict);
278 /* Read value labels. */
279 while (match (r, 'D'))
280 read_value_label (r, *dict);
282 /* Read documents. */
284 read_documents (r, *dict);
286 /* Check that we've made it to the data. */
288 error (r, _("Data record expected."));
290 r->value_cnt = dict_get_next_value_idx (*dict);
291 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
292 &por_file_casereader_class, r);
296 dict_destroy (*dict);
301 /* Returns the value of base-30 digit C,
302 or -1 if C is not a base-30 digit. */
304 base_30_value (unsigned char c)
306 static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
307 const char *p = strchr (base_30_digits, c);
308 return p != NULL ? p - base_30_digits : -1;
311 /* Read a floating point value and return its value. */
313 read_float (struct pfm_reader *r)
317 bool got_dot = false; /* Seen a decimal point? */
318 bool got_digit = false; /* Seen any digits? */
319 bool negative = false; /* Number is negative? */
321 /* Skip leading spaces. */
322 while (match (r, ' '))
325 /* `*' indicates system-missing. */
328 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
332 negative = match (r, '-');
335 int digit = base_30_value (r->cc);
340 /* Make sure that multiplication by 30 will not overflow. */
341 if (num > DBL_MAX * (1. / 30.))
342 /* The value of the digit doesn't matter, since we have already
343 gotten as many digits as can be represented in a `double'.
344 This doesn't necessarily mean the result will overflow.
345 The exponent may reduce it to within range.
347 We just need to record that there was another
348 digit so that we can multiply by 30 later. */
351 num = (num * 30.0) + digit;
353 /* Keep track of the number of digits after the decimal point.
354 If we just divided by 30 here, we would lose precision. */
358 else if (!got_dot && r->cc == '.')
359 /* Record that we have found the decimal point. */
362 /* Any other character terminates the number. */
368 /* Check that we had some digits. */
370 error (r, _("Number expected."));
372 /* Get exponent if any. */
373 if (r->cc == '+' || r->cc == '-')
376 bool negative_exponent = r->cc == '-';
379 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
381 if (exp > LONG_MAX / 30)
386 exp = exp * 30 + digit;
389 /* We don't check whether there were actually any digits, but we
391 if (negative_exponent)
396 /* Numbers must end with `/'. */
398 error (r, _("Missing numeric terminator."));
400 /* Multiply `num' by 30 to the `exponent' power, checking for
403 num *= pow (30.0, (double) exponent);
404 else if (exponent > 0)
406 if (num > DBL_MAX * pow (30.0, (double) -exponent))
409 num *= pow (30.0, (double) exponent);
412 return negative ? -num : num;
415 /* Read an integer and return its value. */
417 read_int (struct pfm_reader *r)
419 double f = read_float (r);
420 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
421 error (r, _("Invalid integer."));
425 /* Reads a string into BUF, which must have room for 256
428 read_string (struct pfm_reader *r, char *buf)
430 int n = read_int (r);
431 if (n < 0 || n > 255)
432 error (r, _("Bad string length %d."), n);
442 /* Reads a string and returns a copy of it allocated from R's
445 read_pool_string (struct pfm_reader *r)
448 read_string (r, string);
449 return pool_strdup (r->pool, string);
452 /* Reads the 464-byte file header. */
454 read_header (struct pfm_reader *r)
459 /* Read and ignore vanity splash strings. */
460 for (i = 0; i < 200; i++)
463 /* Skip the first 64 characters of the translation table.
464 We don't care about these. They are probably all set to
465 '0', marking them as untranslatable, and that would screw
466 up our actual translation of the real '0'. */
467 for (i = 0; i < 64; i++)
470 /* Read the rest of the translation table. */
471 trans = pool_malloc (r->pool, 256);
472 memset (trans, 0, 256);
481 trans[c] = portable_to_local[i];
484 /* Set up the translation table, then read the first
485 translated character. */
489 /* Skip and verify signature. */
490 for (i = 0; i < 8; i++)
491 if (!match (r, "SPSSPORT"[i]))
493 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
494 longjmp (r->bail_out, 1);
498 /* Reads the version and date info record, as well as product and
499 subproduct identification records if present. */
501 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
503 static char empty_string[] = "";
504 char *date, *time, *product, *author, *subproduct;
509 error (r, _("Unrecognized version code `%c'."), r->cc);
510 date = read_pool_string (r);
511 time = read_pool_string (r);
512 product = match (r, '1') ? read_pool_string (r) : empty_string;
513 author = match (r, '2') ? read_pool_string (r) : empty_string;
514 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
517 if (strlen (date) != 8)
518 error (r, _("Bad date string length %d."), (int) strlen (date));
519 if (strlen (time) != 6)
520 error (r, _("Bad time string length %d."), (int) strlen (time));
522 /* Save file info. */
526 for (i = 0; i < 8; i++)
528 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
529 info->creation_date[map[i]] = date[i];
531 info->creation_date[2] = info->creation_date[5] = ' ';
532 info->creation_date[10] = 0;
535 for (i = 0; i < 6; i++)
537 static const int map[] = {0, 1, 3, 4, 6, 7};
538 info->creation_time[map[i]] = time[i];
540 info->creation_time[2] = info->creation_time[5] = ' ';
541 info->creation_time[8] = 0;
544 str_copy_trunc (info->product, sizeof info->product, product);
545 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
549 /* Translates a format specification read from portable file R as
550 the three integers PORTABLE_FORMAT into a normal format specifier,
551 which is returned, checking that it is appropriate for variable V. */
552 static struct fmt_spec
553 convert_format (struct pfm_reader *r, const int portable_format[3],
554 struct variable *v, bool *report_error)
556 struct fmt_spec format;
559 if (!fmt_from_io (portable_format[0], &format.type))
562 warning (r, _("%s: Bad format specifier byte (%d). Variable "
563 "will be assigned a default format."),
564 var_get_name (v), portable_format[0]);
568 format.w = portable_format[1];
569 format.d = portable_format[2];
572 ok = (fmt_check_output (&format)
573 && fmt_check_width_compat (&format, var_get_width (v)));
580 char fmt_string[FMT_STRING_LEN_MAX + 1];
581 fmt_to_string (&format, fmt_string);
582 if (var_is_numeric (v))
583 warning (r, _("Numeric variable %s has invalid format "
585 var_get_name (v), fmt_string);
587 warning (r, _("String variable %s with width %d has "
588 "invalid format specifier %s."),
589 var_get_name (v), var_get_width (v), fmt_string);
597 *report_error = false;
598 return fmt_default_for_width (var_get_width (v));
601 static union value parse_value (struct pfm_reader *, struct variable *);
603 /* Read information on all the variables. */
605 read_variables (struct pfm_reader *r, struct dictionary *dict)
607 char *weight_name = NULL;
611 error (r, _("Expected variable count record."));
613 r->var_cnt = read_int (r);
614 if (r->var_cnt <= 0 || r->var_cnt == NOT_INT)
615 error (r, _("Invalid number of variables %d."), r->var_cnt);
616 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
618 /* Purpose of this value is unknown. It is typically 161. */
623 weight_name = read_pool_string (r);
624 if (strlen (weight_name) > SHORT_NAME_LEN)
625 error (r, _("Weight variable name (%s) truncated."), weight_name);
628 for (i = 0; i < r->var_cnt; i++)
634 struct missing_values miss;
635 struct fmt_spec print, write;
636 bool report_error = true;
640 error (r, _("Expected variable record."));
642 width = read_int (r);
644 error (r, _("Invalid variable width %d."), width);
645 r->widths[i] = width;
647 read_string (r, name);
648 for (j = 0; j < 6; j++)
649 fmt[j] = read_int (r);
651 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
652 error (r, _("Invalid variable name `%s' in position %d."), name, i);
653 str_uppercase (name);
655 if (width < 0 || width > 255)
656 error (r, _("Bad width %d for variable %s."), width, name);
658 v = dict_create_var (dict, name, width);
662 for (i = 1; i < 100000; i++)
664 char try_name[LONG_NAME_LEN + 1];
665 sprintf (try_name, "%.*s_%d", LONG_NAME_LEN - 6, name, i);
666 v = dict_create_var (dict, try_name, width);
671 error (r, _("Duplicate variable name %s in position %d."), name, i);
672 warning (r, _("Duplicate variable name %s in position %d renamed "
673 "to %s."), name, i, var_get_name (v));
676 print = convert_format (r, &fmt[0], v, &report_error);
677 write = convert_format (r, &fmt[3], v, &report_error);
678 var_set_print_format (v, &print);
679 var_set_write_format (v, &write);
681 /* Range missing values. */
682 mv_init (&miss, var_get_width (v));
685 double x = read_float (r);
686 double y = read_float (r);
687 mv_add_num_range (&miss, x, y);
689 else if (match (r, 'A'))
690 mv_add_num_range (&miss, read_float (r), HIGHEST);
691 else if (match (r, '9'))
692 mv_add_num_range (&miss, LOWEST, read_float (r));
694 /* Single missing values. */
695 while (match (r, '8'))
697 union value value = parse_value (r, v);
698 mv_add_value (&miss, &value);
701 var_set_missing_values (v, &miss);
706 read_string (r, label);
707 var_set_label (v, label);
711 if (weight_name != NULL)
713 struct variable *weight_var = dict_lookup_var (dict, weight_name);
714 if (weight_var == NULL)
715 error (r, _("Weighting variable %s not present in dictionary."),
718 dict_set_weight (dict, weight_var);
722 /* Parses a value for variable VV from R and returns it. */
724 parse_value (struct pfm_reader *r, struct variable *vv)
728 if (var_is_alpha (vv))
731 read_string (r, string);
732 buf_copy_str_rpad (v.s, 8, string);
735 v.f = read_float (r);
740 /* Parses a value label record from R and assigns its labels to the named variables in DICT. */
742 read_value_label (struct pfm_reader *r, struct dictionary *dict)
754 v = pool_nalloc (r->pool, nv, sizeof *v);
755 for (i = 0; i < nv; i++)
758 read_string (r, name);
760 v[i] = dict_lookup_var (dict, name);
762 error (r, _("Unknown variable %s while parsing value labels."), name);
764 if (var_get_type (v[0]) != var_get_type (v[i]))
765 error (r, _("Cannot assign value labels to %s and %s, which "
766 "have different variable types."),
767 var_get_name (v[0]), var_get_name (v[i]));
770 n_labels = read_int (r);
771 for (i = 0; i < n_labels; i++)
777 val = parse_value (r, v[0]);
778 read_string (r, label);
780 /* Assign the value label to each variable. */
781 for (j = 0; j < nv; j++)
783 struct variable *var = v[j];
785 if (!var_is_long_string (var))
786 var_replace_value_label (var, &val, label);
791 /* Reads a set of documents from portable file R into DICT. */
793 read_documents (struct pfm_reader *r, struct dictionary *dict)
798 line_cnt = read_int (r);
799 for (i = 0; i < line_cnt; i++)
802 read_string (r, line);
803 dict_add_document_line (dict, line);
807 /* Reads one case from portable file R into C. */
809 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
811 struct pfm_reader *r = r_;
815 case_create (c, casereader_get_value_cnt (reader));
816 setjmp (r->bail_out);
819 casereader_force_error (reader);
824 /* Check for end of file. */
832 for (i = 0; i < r->var_cnt; i++)
834 int width = r->widths[i];
838 case_data_rw_idx (c, idx)->f = read_float (r);
844 read_string (r, string);
845 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
846 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
853 /* Returns true if FILE is an SPSS portable file,
856 pfm_detect (FILE *file)
858 unsigned char header[464];
860 int cooked_cnt, raw_cnt;
863 cooked_cnt = raw_cnt = 0;
864 while (cooked_cnt < sizeof header)
867 if (c == EOF || raw_cnt++ > 512)
869 else if (c != '\n' && c != '\r')
870 header[cooked_cnt++] = c;
873 memset (trans, 0, 256);
874 for (i = 64; i < 256; i++)
876 unsigned char c = header[i + 200];
878 trans[c] = portable_to_local[i];
881 for (i = 0; i < 8; i++)
882 if (trans[header[i + 456]] != "SPSSPORT"[i])
888 static struct casereader_class por_file_casereader_class =
890 por_file_casereader_read,
891 por_file_casereader_destroy,