1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include "por-file-reader.h"
29 #include <data/casereader-provider.h>
30 #include <data/casereader.h>
31 #include <data/dictionary.h>
32 #include <data/file-handle-def.h>
33 #include <data/file-name.h>
34 #include <data/format.h>
35 #include <data/missing-values.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <libpspp/compiler.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/message.h>
41 #include <libpspp/misc.h>
42 #include <libpspp/pool.h>
43 #include <libpspp/str.h>
48 #define _(msgid) gettext (msgid)
50 /* portable_to_local[PORTABLE] translates the given portable
51 character into the local character set. */
52 static const char portable_to_local[256] =
55 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
56 "<(+|&[]!$*);^-/|,%_>?`:$@'=\" ~- 0123456789 -() {}\\ "
60 /* Portable file reader. */
63 struct pool *pool; /* All the portable file state. */
65 jmp_buf bail_out; /* longjmp() target for error handling. */
67 struct file_handle *fh; /* File handle. */
68 struct fh_lock *lock; /* Read lock for file. */
69 FILE *file; /* File stream. */
70 int line_length; /* Number of characters so far on this line. */
71 char cc; /* Current character. */
72 char *trans; /* 256-byte character set translation table. */
73 int var_cnt; /* Number of variables. */
74 int weight_index; /* 0-based index of weight variable, or -1. */
75 int *widths; /* Variable widths, 0 for numeric. */
76 size_t value_cnt; /* Number of `value's per case. */
77 bool ok; /* Set false on I/O error. */
80 static struct casereader_class por_file_casereader_class;
83 error (struct pfm_reader *r, const char *msg,...)
87 /* Displays MSG as an error message and aborts reading the
88 portable file via longjmp(). */
90 error (struct pfm_reader *r, const char *msg, ...)
96 ds_init_empty (&text);
97 ds_put_format (&text, _("portable file %s corrupt at offset 0x%lx: "),
98 fh_get_file_name (r->fh), ftell (r->file));
100 ds_put_vformat (&text, msg, args);
103 m.category = MSG_GENERAL;
104 m.severity = MSG_ERROR;
105 m.where.file_name = NULL;
106 m.where.line_number = 0;
107 m.text = ds_cstr (&text);
113 longjmp (r->bail_out, 1);
116 /* Displays MSG as a warning for the current position in
117 portable file reader R. */
119 warning (struct pfm_reader *r, const char *msg, ...)
125 ds_init_empty (&text);
126 ds_put_format (&text, _("reading portable file %s at offset 0x%lx: "),
127 fh_get_file_name (r->fh), ftell (r->file));
128 va_start (args, msg);
129 ds_put_vformat (&text, msg, args);
132 m.category = MSG_GENERAL;
133 m.severity = MSG_WARNING;
134 m.where.file_name = NULL;
135 m.where.line_number = 0;
136 m.text = ds_cstr (&text);
141 /* Close and destroy R.
142 Returns false if an error was detected on R, true otherwise. */
144 close_reader (struct pfm_reader *r)
152 if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
154 msg (ME, _("Error closing portable file \"%s\": %s."),
155 fh_get_file_name (r->fh), strerror (errno));
165 pool_destroy (r->pool);
170 /* Closes portable file reader R, after we're done with it. */
172 por_file_casereader_destroy (struct casereader *reader, void *r_)
174 struct pfm_reader *r = r_;
175 if (!close_reader (r))
176 casereader_force_error (reader);
179 /* Read a single character into R->cc. */
181 advance (struct pfm_reader *r)
185 /* Read the next character from the file.
186 Ignore carriage returns entirely.
187 Mostly ignore new-lines, but if a new-line occurs before the
188 line has reached 80 bytes in length, then treat the
189 "missing" bytes as spaces. */
192 while ((c = getc (r->file)) == '\r')
197 if (r->line_length < 80)
200 ungetc ('\n', r->file);
206 error (r, _("unexpected end of file"));
208 if (r->trans != NULL)
214 /* Skip a single character if present, and return whether it was
217 match (struct pfm_reader *r, int c)
228 static void read_header (struct pfm_reader *);
229 static void read_version_data (struct pfm_reader *, struct pfm_read_info *);
230 static void read_variables (struct pfm_reader *, struct dictionary *);
231 static void read_value_label (struct pfm_reader *, struct dictionary *);
232 static void read_documents (struct pfm_reader *, struct dictionary *);
234 /* Reads the dictionary from file with handle FH, and returns it in a
235 dictionary structure. This dictionary may be modified in order to
236 rename, reorder, and delete variables, etc. */
238 pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
239 struct pfm_read_info *info)
241 struct pool *volatile pool = NULL;
242 struct pfm_reader *volatile r = NULL;
244 *dict = dict_create ();
246 /* Create and initialize reader. */
247 pool = pool_create ();
248 r = pool_alloc (pool, sizeof *r);
254 r->weight_index = -1;
260 if (setjmp (r->bail_out))
264 r->lock = fh_lock (fh, FH_REF_FILE, "portable file", FH_ACC_READ, false);
269 r->file = fn_open (fh_get_file_name (r->fh), "rb");
272 msg (ME, _("An error occurred while opening \"%s\" for reading "
273 "as a portable file: %s."),
274 fh_get_file_name (r->fh), strerror (errno));
278 /* Read header, version, date info, product id, variables. */
280 read_version_data (r, info);
281 read_variables (r, *dict);
283 /* Read value labels. */
284 while (match (r, 'D'))
285 read_value_label (r, *dict);
287 /* Read documents. */
289 read_documents (r, *dict);
291 /* Check that we've made it to the data. */
293 error (r, _("Data record expected."));
295 r->value_cnt = dict_get_next_value_idx (*dict);
296 return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
297 &por_file_casereader_class, r);
301 dict_destroy (*dict);
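306 /* Returns the value of base-30 digit C,
307 or -1 if C is not a base-30 digit.
NOTE(review): strchr() also matches the terminating null of the digit
string, so C == 0 appears to yield 30 rather than -1; confirm whether
callers can pass 0 (e.g. an untranslatable character). */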
309 base_30_value (unsigned char c)
311 static const char base_30_digits[] = "0123456789ABCDEFGHIJKLMNOPQRST";
312 const char *p = strchr (base_30_digits, c);
313 return p != NULL ? p - base_30_digits : -1;
316 /* Read a floating point value and return its value. */
318 read_float (struct pfm_reader *r)
322 bool got_dot = false; /* Seen a decimal point? */
323 bool got_digit = false; /* Seen any digits? */
324 bool negative = false; /* Number is negative? */
326 /* Skip leading spaces. */
327 while (match (r, ' '))
330 /* `*' indicates system-missing. */
333 advance (r); /* Probably a dot (.) but doesn't appear to matter. */
337 negative = match (r, '-');
340 int digit = base_30_value (r->cc);
345 /* Make sure that multiplication by 30 will not overflow. */
346 if (num > DBL_MAX * (1. / 30.))
347 /* The value of the digit doesn't matter, since we have already
348 gotten as many digits as can be represented in a `double'.
349 This doesn't necessarily mean the result will overflow.
350 The exponent may reduce it to within range.
352 We just need to record that there was another
353 digit so that we can multiply by 30 later. */
356 num = (num * 30.0) + digit;
358 /* Keep track of the number of digits after the decimal point.
359 If we just divided by 30 here, we would lose precision. */
363 else if (!got_dot && r->cc == '.')
364 /* Record that we have found the decimal point. */
367 /* Any other character terminates the number. */
373 /* Check that we had some digits. */
375 error (r, _("Number expected."));
377 /* Get exponent if any. */
378 if (r->cc == '+' || r->cc == '-')
381 bool negative_exponent = r->cc == '-';
384 for (advance (r); (digit = base_30_value (r->cc)) != -1; advance (r))
386 if (exp > LONG_MAX / 30)
391 exp = exp * 30 + digit;
394 /* We don't check whether there were actually any digits, but we
396 if (negative_exponent)
401 /* Numbers must end with `/'. */
403 error (r, _("Missing numeric terminator."));
405 /* Multiply `num' by 30 to the `exponent' power, checking for
408 num *= pow (30.0, (double) exponent);
409 else if (exponent > 0)
411 if (num > DBL_MAX * pow (30.0, (double) -exponent))
414 num *= pow (30.0, (double) exponent);
417 return negative ? -num : num;
420 /* Read an integer and return its value. */
422 read_int (struct pfm_reader *r)
424 double f = read_float (r);
425 if (floor (f) != f || f >= INT_MAX || f <= INT_MIN)
426 error (r, _("Invalid integer."));
430 /* Reads a string into BUF, which must have room for 256
433 read_string (struct pfm_reader *r, char *buf)
435 int n = read_int (r);
436 if (n < 0 || n > 255)
437 error (r, _("Bad string length %d."), n);
447 /* Reads a string and returns a copy of it allocated from R's
450 read_pool_string (struct pfm_reader *r)
453 read_string (r, string);
454 return pool_strdup (r->pool, string);
457 /* Reads the 464-byte file header. */
459 read_header (struct pfm_reader *r)
464 /* Read and ignore vanity splash strings. */
465 for (i = 0; i < 200; i++)
468 /* Skip the first 64 characters of the translation table.
469 We don't care about these. They are probably all set to
470 '0', marking them as untranslatable, and that would screw
471 up our actual translation of the real '0'. */
472 for (i = 0; i < 64; i++)
475 /* Read the rest of the translation table. */
476 trans = pool_malloc (r->pool, 256);
477 memset (trans, 0, 256);
486 trans[c] = portable_to_local[i];
489 /* Set up the translation table, then read the first
490 translated character. */
494 /* Skip and verify signature. */
495 for (i = 0; i < 8; i++)
496 if (!match (r, "SPSSPORT"[i]))
498 msg (SE, _("%s: Not a portable file."), fh_get_file_name (r->fh));
499 longjmp (r->bail_out, 1);
503 /* Reads the version and date info record, as well as product and
504 subproduct identification records if present. */
506 read_version_data (struct pfm_reader *r, struct pfm_read_info *info)
508 static char empty_string[] = "";
509 char *date, *time, *product, *author, *subproduct;
514 error (r, _("Unrecognized version code `%c'."), r->cc);
515 date = read_pool_string (r);
516 time = read_pool_string (r);
517 product = match (r, '1') ? read_pool_string (r) : empty_string;
518 author = match (r, '2') ? read_pool_string (r) : empty_string;
519 subproduct = match (r, '3') ? read_pool_string (r) : empty_string;
522 if (strlen (date) != 8)
523 error (r, _("Bad date string length %zu."), strlen (date));
524 if (strlen (time) != 6)
525 error (r, _("Bad time string length %zu."), strlen (time));
527 /* Save file info. */
531 for (i = 0; i < 8; i++)
533 static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1};
534 info->creation_date[map[i]] = date[i];
536 info->creation_date[2] = info->creation_date[5] = ' ';
537 info->creation_date[10] = 0;
540 for (i = 0; i < 6; i++)
542 static const int map[] = {0, 1, 3, 4, 6, 7};
543 info->creation_time[map[i]] = time[i];
545 info->creation_time[2] = info->creation_time[5] = ' ';
546 info->creation_time[8] = 0;
549 str_copy_trunc (info->product, sizeof info->product, product);
550 str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct);
554 /* Translates a format specification read from portable file R as
555 the three integers PORTABLE_FORMAT into a normal format specifier,
556 which is returned, checking that the format is appropriate for variable V. */
557 static struct fmt_spec
558 convert_format (struct pfm_reader *r, const int portable_format[3],
559 struct variable *v, bool *report_error)
561 struct fmt_spec format;
564 if (!fmt_from_io (portable_format[0], &format.type))
567 warning (r, _("%s: Bad format specifier byte (%d). Variable "
568 "will be assigned a default format."),
569 var_get_name (v), portable_format[0]);
573 format.w = portable_format[1];
574 format.d = portable_format[2];
577 ok = (fmt_check_output (&format)
578 && fmt_check_width_compat (&format, var_get_width (v)));
585 char fmt_string[FMT_STRING_LEN_MAX + 1];
586 fmt_to_string (&format, fmt_string);
587 if (var_is_numeric (v))
588 warning (r, _("Numeric variable %s has invalid format "
590 var_get_name (v), fmt_string);
592 warning (r, _("String variable %s with width %d has "
593 "invalid format specifier %s."),
594 var_get_name (v), var_get_width (v), fmt_string);
602 *report_error = false;
603 return fmt_default_for_width (var_get_width (v));
606 static union value parse_value (struct pfm_reader *, struct variable *);
608 /* Read information on all the variables. */
610 read_variables (struct pfm_reader *r, struct dictionary *dict)
612 char *weight_name = NULL;
616 error (r, _("Expected variable count record."));
618 r->var_cnt = read_int (r);
620 error (r, _("Invalid number of variables %d."), r->var_cnt);
621 r->widths = pool_nalloc (r->pool, r->var_cnt, sizeof *r->widths);
623 /* Purpose of this value is unknown. It is typically 161. */
628 weight_name = read_pool_string (r);
629 if (strlen (weight_name) > SHORT_NAME_LEN)
630 error (r, _("Weight variable name (%s) truncated."), weight_name);
633 for (i = 0; i < r->var_cnt; i++)
639 struct missing_values miss;
640 struct fmt_spec print, write;
641 bool report_error = true;
645 error (r, _("Expected variable record."));
647 width = read_int (r);
649 error (r, _("Invalid variable width %d."), width);
650 r->widths[i] = width;
652 read_string (r, name);
653 for (j = 0; j < 6; j++)
654 fmt[j] = read_int (r);
656 if (!var_is_valid_name (name, false) || *name == '#' || *name == '$')
657 error (r, _("Invalid variable name `%s' in position %d."), name, i);
658 str_uppercase (name);
660 if (width < 0 || width > 255)
661 error (r, _("Bad width %d for variable %s."), width, name);
663 v = dict_create_var (dict, name, width);
667 for (i = 1; i < 100000; i++)
669 char try_name[LONG_NAME_LEN + 1];
670 sprintf (try_name, "%.*s_%d", LONG_NAME_LEN - 6, name, i);
671 v = dict_create_var (dict, try_name, width);
676 error (r, _("Duplicate variable name %s in position %d."), name, i);
677 warning (r, _("Duplicate variable name %s in position %d renamed "
678 "to %s."), name, i, var_get_name (v));
681 print = convert_format (r, &fmt[0], v, &report_error);
682 write = convert_format (r, &fmt[3], v, &report_error);
683 var_set_print_format (v, &print);
684 var_set_write_format (v, &write);
686 /* Range missing values. */
687 mv_init (&miss, var_get_width (v));
690 double x = read_float (r);
691 double y = read_float (r);
692 mv_add_num_range (&miss, x, y);
694 else if (match (r, 'A'))
695 mv_add_num_range (&miss, read_float (r), HIGHEST);
696 else if (match (r, '9'))
697 mv_add_num_range (&miss, LOWEST, read_float (r));
699 /* Single missing values. */
700 while (match (r, '8'))
702 union value value = parse_value (r, v);
703 mv_add_value (&miss, &value);
706 var_set_missing_values (v, &miss);
711 read_string (r, label);
712 var_set_label (v, label);
716 if (weight_name != NULL)
718 struct variable *weight_var = dict_lookup_var (dict, weight_name);
719 if (weight_var == NULL)
720 error (r, _("Weighting variable %s not present in dictionary."),
723 dict_set_weight (dict, weight_var);
727 /* Parse a value for variable VV into value V. */
729 parse_value (struct pfm_reader *r, struct variable *vv)
733 if (var_is_alpha (vv))
736 read_string (r, string);
737 buf_copy_str_rpad (v.s, 8, string);
740 v.f = read_float (r);
745 /* Parse a value label record. Failures are reported via error(), which does not return. */
747 read_value_label (struct pfm_reader *r, struct dictionary *dict)
759 v = pool_nalloc (r->pool, nv, sizeof *v);
760 for (i = 0; i < nv; i++)
763 read_string (r, name);
765 v[i] = dict_lookup_var (dict, name);
767 error (r, _("Unknown variable %s while parsing value labels."), name);
769 if (var_get_type (v[0]) != var_get_type (v[i]))
770 error (r, _("Cannot assign value labels to %s and %s, which "
771 "have different variable types."),
772 var_get_name (v[0]), var_get_name (v[i]));
775 n_labels = read_int (r);
776 for (i = 0; i < n_labels; i++)
782 val = parse_value (r, v[0]);
783 read_string (r, label);
785 /* Assign the value label to each variable. */
786 for (j = 0; j < nv; j++)
788 struct variable *var = v[j];
790 if (!var_is_long_string (var))
791 var_replace_value_label (var, &val, label);
796 /* Reads a set of documents from portable file R into DICT. */
798 read_documents (struct pfm_reader *r, struct dictionary *dict)
803 line_cnt = read_int (r);
804 for (i = 0; i < line_cnt; i++)
807 read_string (r, line);
808 dict_add_document_line (dict, line);
812 /* Reads one case from portable file R into C. */
814 por_file_casereader_read (struct casereader *reader, void *r_, struct ccase *c)
816 struct pfm_reader *r = r_;
820 case_create (c, casereader_get_value_cnt (reader));
821 setjmp (r->bail_out);
824 casereader_force_error (reader);
829 /* Check for end of file. */
837 for (i = 0; i < r->var_cnt; i++)
839 int width = r->widths[i];
843 case_data_rw_idx (c, idx)->f = read_float (r);
849 read_string (r, string);
850 buf_copy_str_rpad (case_data_rw_idx (c, idx)->s, width, string);
851 idx += DIV_RND_UP (width, MAX_SHORT_STRING);
858 /* Returns true if FILE is an SPSS portable file,
861 pfm_detect (FILE *file)
863 unsigned char header[464];
865 int cooked_cnt, raw_cnt;
868 cooked_cnt = raw_cnt = 0;
869 while (cooked_cnt < sizeof header)
872 if (c == EOF || raw_cnt++ > 512)
874 else if (c != '\n' && c != '\r')
875 header[cooked_cnt++] = c;
878 memset (trans, 0, 256);
879 for (i = 64; i < 256; i++)
881 unsigned char c = header[i + 200];
883 trans[c] = portable_to_local[i];
886 for (i = 0; i < 8; i++)
887 if (trans[header[i + 456]] != "SPSSPORT"[i])
893 static struct casereader_class por_file_casereader_class =
895 por_file_casereader_read,
896 por_file_casereader_destroy,