From d0371553a98cd169353bf6d211e375e5ffc3a3bd Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 9 Nov 2007 03:06:28 +0000 Subject: [PATCH] * automake.mk: Add new file. * formats/360.sh: New test. * str.c (ds_read_stream): Change return value semantics to be more useful. Update all users. * data-reader.c (struct dfm_reader): New member `block_left'. (dfm_open_reader): Initialize block_left. For FH_MODE_TEXT, open the file in text mode. (read_error): New function. (partial_record): New function. (try_to_read_fully): New function. (enum descriptor_type): New enum. (read_descriptor_word): New function. (corrupt_size): New function. (read_size): New function. (read_file_record): Implement new modes. (read_record): Now take care of tracking line numbers here. (dfm_reader_get_legacy_encoding): New function. * data-writer.c (dfm_put_record): Implement new modes. (dfm_writer_get_legacy_encoding): New function. * file-handle.q: Parse new formats. (cmd_file_handle): Set up new formats. * print.c (struct print_trns): New member `encoding'. (internal_cmd_print): Set encoding. (print_trns_proc): Recode output data if necessary. (flush_records): Recode leader byte. * data-in.c: Make formatted data parsing locale-independent. (parse_number): Use c_strtod instead of strtod, to avoid locale-specific behavior. (parse_Z): Ditto. --- doc/data-io.texi | 150 ++++++++++-- src/data/ChangeLog | 31 +++ src/data/data-in.c | 58 ++++- src/data/data-in.h | 3 +- src/data/data-out.c | 20 +- src/data/data-out.h | 4 + src/data/file-handle-def.c | 12 +- src/data/file-handle-def.h | 8 +- src/language/data-io/ChangeLog | 30 +++ src/language/data-io/data-list.c | 14 +- src/language/data-io/data-reader.c | 295 +++++++++++++++++++++--- src/language/data-io/data-reader.h | 3 + src/language/data-io/data-writer.c | 50 +++- src/language/data-io/data-writer.h | 2 + src/language/data-io/file-handle.q | 95 +++++--- src/language/data-io/print.c | 26 ++- src/language/expressions/operations.def | 2 +- src/language/lexer/range-parser.c | 3 +- src/language/xforms/recode.c | 3 +- src/libpspp/ChangeLog | 5 + src/libpspp/str.c | 17 +- src/libpspp/str.h | 2 +- src/ui/gui/find-dialog.c | 1 + src/ui/gui/helper.c | 2 +- src/ui/gui/psppire-case-file.c | 2 +- tests/ChangeLog | 9 + tests/automake.mk | 1 + tests/formats/360.sh | 254 ++++++++++++++++++++ 28 files changed, 982 insertions(+), 120 deletions(-) create mode 100755 tests/formats/360.sh diff --git a/doc/data-io.texi b/doc/data-io.texi index 3439ca3a..9d3fe39c 100644 --- a/doc/data-io.texi +++ b/doc/data-io.texi @@ -438,12 +438,25 @@ For text files: [/MODE=CHARACTER] /TABWIDTH=tab_width -For binary files with fixed-length records: +For binary files in native encoding with fixed-length records: FILE HANDLE handle_name /NAME='file-name' /MODE=IMAGE [/LRECL=rec_len] +For binary files in native encoding with variable-length records: + FILE HANDLE handle_name + /NAME='file-name' + /MODE=BINARY + [/LRECL=rec_len] + +For binary files encoded in EBCDIC: + FILE HANDLE handle_name + /NAME='file-name' + /MODE=360 + /RECFORM=@{FIXED,VARIABLE,SPANNED@} + [/LRECL=rec_len] + To explicitly declare a scratch handle: FILE HANDLE handle_name /MODE=SCRATCH @@ -465,28 +478,129 @@ file handle name must not already have been used in a previous invocation of @cmd{FILE HANDLE}, unless it has been closed by an intervening command (@pxref{CLOSE FILE HANDLE}). -MODE specifies a file mode. In CHARACTER mode, the default, the data -file is read as a text file, according to the local system's -conventions, and each text line is read as one record. -In CHARACTER mode, most input programs will expand tabs to spaces -(@cmd{DATA LIST FREE} with explicitly specified delimiters is an -exception). By default, each tab is 4 characters wide, but an -alternate width may be specified on TABWIDTH. A tab width of 0 -suppresses tab expansion entirely. - -In IMAGE mode, the data file is opened in ANSI C binary mode. Record -length is fixed, with output data truncated or padded with spaces to -the record length. LRECL specifies the record length in bytes, with a -default of 1024. Tab characters are never expanded to spaces in -binary mode. Records +The effect and syntax of FILE HANDLE depends on the selected MODE: -The NAME subcommand specifies the name of the file associated with the -handle. It is required in CHARACTER and IMAGE modes. +@itemize +@item +In CHARACTER mode, the default, the data file is read as a text file, +according to the local system's conventions, and each text line is +read as one record. + +In CHARACTER mode only, tabs are expanded to spaces by input programs, +except by @cmd{DATA LIST FREE} with explicitly specified delimiters. +Each tab is 4 characters wide by default, but TABWIDTH (a PSPP +extension) may be used to specify an alternate width. Use a TABWIDTH +of 0 to suppress tab expansion. -The SCRATCH mode designates the file handle as a scratch file handle. +@item +In IMAGE mode, the data file is treated as a series of fixed-length +binary records. LRECL should be used to specify the record length in +bytes, with a default of 1024. On input, it is an error if an IMAGE +file's length is not a integer multiple of the record length. On +output, each record is padded with spaces or truncated, if necessary, +to make it exactly the correct length. + +@item +In BINARY mode, the data file is treated as a series of +variable-length binary records. LRECL may be specified, but its value +is ignored. The data for each record is both preceded and followed by +a 32-bit signed integer in little-endian byte order that specifies the +length of the record. (This redundancy permits records in these +files to be efficiently read in reverse order, although PSPP always +reads them in forward order.) The length does not include either +integer. + +@item +Mode 360 reads and writes files in formats first used for tapes in the +1960s on IBM mainframe operating systems and still supported today by +the modern successors of those operating systems. For more +information, see @cite{OS/400 Tape and Diskette Device Programming}, +available on IBM's website. + +Alphanumeric data in mode 360 files are encoded in EBCDIC. PSPP +translates EBCDIC to or from the host's native format as necessary on +input or output, using an ASCII/EBCDIC translation that is one-to-one, +so that a ``round trip'' from ASCII to EBCDIC back to ASCII, or vice +versa, always yields exactly the original data. + +The RECFORM subcommand is required in mode 360. The precise file +format depends on its setting: + +@table @asis +@item F +@itemx FIXED +This record format is equivalent to IMAGE mode, except for EBCDIC +translation. + +IBM documentation calls this @code{*F} (fixed-length, deblocked) +format. + +@item V +@itemx VARIABLE +The file comprises a sequence of zero or more variable-length blocks. +Each block begins with a 4-byte @dfn{block descriptor word} (BDW). +The first two bytes of the BDW are an unsigned integer in big-endian +byte order that specifies the length of the block, including the BDW +itself. The other two bytes of the BDW are ignored on input and +written as zeros on output. + +Following the BDW, the remainder of each block is a sequence of one or +more variable-length records, each of which in turn begins with a +4-byte @dfn{record descriptor word} (RDW) that has the same format as +the BDW. Following the RDW, the remainder of each record is the +record data. + +The maximum length of a record in VARIABLE mode is 65,527 bytes: +65,535 bytes (the maximum value of a 16-bit unsigned integer), minus 4 +bytes for the BDW, minus 4 bytes for the RDW. + +In mode VARIABLE, LRECL specifies a maximum, not a fixed, record +length, in bytes. The default is 8,192. + +IBM documentation calls this @code{*VB} (variable-length, blocked, +unspanned) format. + +@item VS +@itemx SPANNED +The file format is like that of VARIABLE mode, except that logical +records may be split among multiple physical records (called +@dfn{segments}) or blocks. In SPANNED mode, the third byte of each +RDW is called the segment control character (SCC). Odd SCC values +cause the segment to be appended to a record buffer maintained in +memory; even values also append the segment and then flush its +contents to the input procedure. Canonically, SCC value 0 designates +a record not spanned among multiple segments, and values 1 through 3 +designate the first segment, the last segment, or an intermediate +segment, respectively, within a multi-segment record. The record +buffer is also flushed at end of file regardless of the final record's +SCC. + +The maximum length of a logical record in VARIABLE mode is limited +only by memory available to PSPP. Segments are limited to 65,527 +bytes, as in VARIABLE mode. + +This format is similar to what IBM documentation call @code{*VS} +(variable-length, deblocked, spanned) format. +@end table + +In mode 360, fields of type A that extend beyond the end of a record +read from disk are padded with spaces in the host's native character +set, which are then translated from EBCDIC to the native character +set. Thus, when the host's native character set is based on ASCII, +these fields are effectively padded with character @code{X'80'}. This +wart is implemented for compatibility. + +@item +SCRATCH mode is a PSPP extension that designates the file handle as a +scratch file handle. Its use is usually unnecessary because file handle names that begin with @samp{#} are assumed to refer to scratch files. @pxref{File Handles}, for more information. +@end itemize + +The NAME subcommand specifies the name of the file associated with the +handle. It is required in all modes but SCRATCH mode, in which its +use is forbidden. @node INPUT PROGRAM @section INPUT PROGRAM diff --git a/src/data/ChangeLog b/src/data/ChangeLog index 16ba1ca1..7ddf9fd2 100644 --- a/src/data/ChangeLog +++ b/src/data/ChangeLog @@ -1,3 +1,34 @@ +2007-11-08 Ben Pfaff + + * data-in.c: Make formatted data parsing locale-independent. + (parse_number): Use c_strtod instead of strtod, to avoid + locale-specific behavior. + (parse_Z): Ditto. + +2007-11-06 Ben Pfaff + + Patch #6256: add support for binary, 360 file formats. Reviewed + by John Darrington. + + * data-in.c (struct data_in): Add `encoding' member. + (data_in): Add `encoding' parameter, and re-encode the data passed + in where appropriate. Update all callers to pass it in. + (parse_A): Implement EBCDIC recoding wart described in manual. + (parse_AHEX): Implement EBCDIC recoding. + + * data-out.c (data_out_legacy): New function. + (data_out): Make into a wrapper around data_out_legacy. + + * file-handle-def.c (struct file_handle): New member `encoding'. + (fh_create_file): Set encoding. + (fh_default_properties): Set default encoding. + (fh_get_legacy_encoding): New function. + + * file-handle-def.h (enum fh_mode): New modes FH_MODE_FIXED + (that replaces FH_MODE_BINARY), FH_MODE_VARIABLE, + FH_MODE_360_VARIABLE, FH_MODE_360_SPANNED. + (struct fh_properties): New member `encoding'. + 2007-11-05 Ben Pfaff Patch #6258. Reviewed by John Darrington. diff --git a/src/data/data-in.c b/src/data/data-in.c index 0ddd322b..b9907c2d 100644 --- a/src/data/data-in.c +++ b/src/data/data-in.c @@ -34,13 +34,14 @@ #include "value.h" #include +#include #include #include #include #include #include - #include "c-ctype.h" +#include "c-strtod.h" #include "minmax.h" #include "xalloc.h" @@ -50,6 +51,7 @@ /* Information about parsing one data field. */ struct data_in { + enum legacy_encoding encoding;/* Encoding of source. */ struct substring input; /* Source. */ enum fmt_type format; /* Input format. */ int implied_decimals; /* Number of implied decimal places. */ @@ -83,9 +85,10 @@ static bool trim_spaces_and_check_missing (struct data_in *); static int hexit_value (int c); -/* Parses the characters in INPUT according to FORMAT. Stores - the parsed representation in OUTPUT, which has the given WIDTH - (0 for a numeric field, otherwise the string width). +/* Parses the characters in INPUT, which are encoded in the given + ENCODING, according to FORMAT. Stores the parsed + representation in OUTPUT, which has the given WIDTH (0 for + a numeric field, otherwise the string width). If no decimal point is included in a numeric format, then IMPLIED_DECIMALS decimal places are implied. Specify 0 if no @@ -95,7 +98,7 @@ static int hexit_value (int c); column number of the first character in INPUT, used in error messages. */ bool -data_in (struct substring input, +data_in (struct substring input, enum legacy_encoding encoding, enum fmt_type format, int implied_decimals, int first_column, union value *output, int width) { @@ -106,11 +109,25 @@ data_in (struct substring input, }; struct data_in i; + void *copy = NULL; bool ok; assert ((width != 0) == fmt_is_string (format)); - i.input = input; + if (encoding == LEGACY_NATIVE + || fmt_get_category (format) & (FMT_CAT_BINARY | FMT_CAT_STRING)) + { + i.input = input; + i.encoding = encoding; + } + else + { + ss_alloc_uninit (&i.input, ss_length (input)); + legacy_recode (encoding, ss_data (input), LEGACY_NATIVE, + ss_data (i.input), ss_length (input)); + i.encoding = LEGACY_NATIVE; + copy = ss_data (i.input); + } i.format = format; i.implied_decimals = implied_decimals; @@ -132,6 +149,9 @@ data_in (struct substring input, ok = true; } + if (copy) + free (copy); + return ok; } @@ -271,10 +291,10 @@ parse_number (struct data_in *i) return false; } - /* Let strtod() do the conversion. */ + /* Let c_strtod() do the conversion. */ save_errno = errno; errno = 0; - i->output->f = strtod (ds_cstr (&tmp), &tail); + i->output->f = c_strtod (ds_cstr (&tmp), &tail); if (*tail != '\0') { data_warning (i, _("Invalid numeric syntax.")); @@ -461,10 +481,10 @@ parse_Z (struct data_in *i) return false; } - /* Let strtod() do the conversion. */ + /* Let c_strtod() do the conversion. */ save_errno = errno; errno = 0; - i->output->f = strtod (ds_cstr (&tmp), NULL); + i->output->f = c_strtod (ds_cstr (&tmp), NULL); if (errno == ERANGE) { if (fabs (i->output->f) > 1) @@ -609,8 +629,17 @@ parse_RB (struct data_in *i) static bool parse_A (struct data_in *i) { - buf_copy_rpad (i->output->s, i->width, - ss_data (i->input), ss_length (i->input)); + /* This is equivalent to buf_copy_rpad, except that we posibly + do a character set recoding in the middle. */ + char *dst = i->output->s; + size_t dst_size = i->width; + const char *src = ss_data (i->input); + size_t src_size = ss_length (i->input); + + legacy_recode (i->encoding, src, LEGACY_NATIVE, dst, MIN (src_size, dst_size)); + if (dst_size > src_size) + memset (&dst[src_size], ' ', dst_size - src_size); + return true; } @@ -632,6 +661,11 @@ parse_AHEX (struct data_in *i) return false; } + if (i->encoding != LEGACY_NATIVE) + { + hi = legacy_to_native (i->encoding, hi); + lo = legacy_to_native (i->encoding, lo); + } if (!c_isxdigit (hi) || !c_isxdigit (lo)) { data_warning (i, _("Field must contain only hex digits.")); diff --git a/src/data/data-in.h b/src/data/data-in.h index 779e8374..19812886 100644 --- a/src/data/data-in.h +++ b/src/data/data-in.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ void data_in_set_integer_format (enum integer_format); enum float_format data_in_get_float_format (void); void data_in_set_float_format (enum float_format); -bool data_in (struct substring input, +bool data_in (struct substring input, enum legacy_encoding, enum fmt_type, int implied_decimals, int first_column, union value *output, int width); diff --git a/src/data/data-out.c b/src/data/data-out.c index 967f57ed..02ff0657 100644 --- a/src/data/data-out.c +++ b/src/data/data-out.c @@ -92,11 +92,12 @@ static void output_hex (const void *, size_t bytes, char *); /* Converts the INPUT value into printable form in the exactly FORMAT->W characters in OUTPUT according to format - specification FORMAT. No null terminator is appended to the - buffer. */ + specification FORMAT. The output is recoded from native form + into the given legacy character ENCODING. No null terminator + is appended to the buffer. */ void -data_out (const union value *input, const struct fmt_spec *format, - char *output) +data_out_legacy (const union value *input, enum legacy_encoding encoding, + const struct fmt_spec *format, char *output) { static data_out_converter_func *const converters[FMT_NUMBER_OF_FORMATS] = { @@ -107,6 +108,17 @@ data_out (const union value *input, const struct fmt_spec *format, assert (fmt_check_output (format)); converters[format->type] (input, format, output); + if (encoding != LEGACY_NATIVE + && fmt_get_category (format->type) != FMT_CAT_BINARY) + legacy_recode (LEGACY_NATIVE, output, encoding, output, format->w); +} + +/* Same as data_out_legacy with ENCODING set to LEGACY_NATIVE. */ +void +data_out (const union value *value, const struct fmt_spec *format, + char *output) +{ + return data_out_legacy (value, LEGACY_NATIVE, format, output); } /* Returns the current output integer format. */ diff --git a/src/data/data-out.h b/src/data/data-out.h index 02a2c1cc..85831b30 100644 --- a/src/data/data-out.h +++ b/src/data/data-out.h @@ -20,12 +20,16 @@ #include #include #include +#include struct fmt_spec; union value; void data_out (const union value *, const struct fmt_spec *, char *); +void data_out_legacy (const union value *, enum legacy_encoding, + const struct fmt_spec *, char *); + enum integer_format data_out_get_integer_format (void); void data_out_set_integer_format (enum integer_format); diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c index 0f6b142a..5d807ab9 100644 --- a/src/data/file-handle-def.c +++ b/src/data/file-handle-def.c @@ -49,6 +49,7 @@ struct file_handle /* FH_REF_FILE only. */ char *file_name; /* File name as provided by user. */ enum fh_mode mode; /* File mode. */ + enum legacy_encoding encoding;/* File encoding. */ /* FH_REF_FILE and FH_REF_INLINE only. */ size_t record_width; /* Length of fixed-format records. */ @@ -234,6 +235,7 @@ fh_create_file (const char *id, const char *file_name, handle->mode = properties->mode; handle->record_width = properties->record_width; handle->tab_width = properties->tab_width; + handle->encoding = properties->encoding; return handle; } @@ -254,7 +256,7 @@ const struct fh_properties * fh_default_properties (void) { static const struct fh_properties default_properties - = {FH_MODE_TEXT, 1024, 4}; + = {FH_MODE_TEXT, 1024, 4, LEGACY_NATIVE}; return &default_properties; } @@ -322,6 +324,14 @@ fh_get_tab_width (const struct file_handle *handle) return handle->tab_width; } +/* Returns the encoding of characters read from HANDLE. */ +enum legacy_encoding +fh_get_legacy_encoding (const struct file_handle *handle) +{ + assert (handle->referent & (FH_REF_FILE | FH_REF_INLINE)); + return (handle->referent == FH_REF_FILE ? handle->encoding : LEGACY_NATIVE); +} + /* Returns the scratch file handle associated with HANDLE. Applicable to only FH_REF_SCRATCH files. */ struct scratch_handle * diff --git a/src/data/file-handle-def.h b/src/data/file-handle-def.h index 79f0c52a..73e118cd 100644 --- a/src/data/file-handle-def.h +++ b/src/data/file-handle-def.h @@ -19,6 +19,7 @@ #include #include +#include /* What a file handle refers to. (Ordinarily only a single value is allowed, but fh_open() @@ -34,7 +35,10 @@ enum fh_referent enum fh_mode { FH_MODE_TEXT, /* New-line delimited lines. */ - FH_MODE_BINARY /* Fixed-length records. */ + FH_MODE_FIXED, /* Fixed-length records. */ + FH_MODE_VARIABLE, /* Binary variable-length records. */ + FH_MODE_360_VARIABLE, /* IBM 360 variable-length records. */ + FH_MODE_360_SPANNED, /* IBM 360 variable-length, spanned records. */ }; /* Ways to access a file. */ @@ -50,6 +54,7 @@ struct fh_properties enum fh_mode mode; /* File mode. */ size_t record_width; /* Length of fixed-format records. */ size_t tab_width; /* Tab width, 0=do not expand tabs. */ + enum legacy_encoding encoding;/* ASCII or EBCDIC? */ }; void fh_init (void); @@ -84,6 +89,7 @@ enum fh_mode fh_get_mode (const struct file_handle *) ; /* Properties of FH_REF_FILE and FH_REF_INLINE file handles. */ size_t fh_get_record_width (const struct file_handle *); size_t fh_get_tab_width (const struct file_handle *); +enum legacy_encoding fh_get_legacy_encoding (const struct file_handle *); /* Properties of FH_REF_SCRATCH file handles. */ struct scratch_handle *fh_get_scratch_handle (const struct file_handle *); diff --git a/src/language/data-io/ChangeLog b/src/language/data-io/ChangeLog index b58f9f7c..dcc0961f 100644 --- a/src/language/data-io/ChangeLog +++ b/src/language/data-io/ChangeLog @@ -1,3 +1,33 @@ +2007-11-08 Ben Pfaff + + Patch #6256: add support for binary, 360 file formats. Reviewed + by John Darrington. + + * data-reader.c (struct dfm_reader): New member `block_left'. + (dfm_open_reader): Initialize block_left. For FH_MODE_TEXT, open + the file in text mode. + (read_error): New function. + (partial_record): New function. + (try_to_read_fully): New function. + (enum descriptor_type): New enum. + (read_descriptor_word): New function. + (corrupt_size): New function. + (read_size): New function. + (read_file_record): Implement new modes. + (read_record): Now take care of tracking line numbers here. + (dfm_reader_get_legacy_encoding): New function. + + * data-writer.c (dfm_put_record): Implement new modes. + (dfm_writer_get_legacy_encoding): New function. + + * file-handle.q: Parse new formats. + (cmd_file_handle): Set up new formats. + + * print.c (struct print_trns): New member `encoding'. + (internal_cmd_print): Set encoding. + (print_trns_proc): Recode output data if necessary. + (flush_records): Recode leader byte. + 2007-11-03 Ben Pfaff Allow output files to overwrite input files (bug #21280). diff --git a/src/language/data-io/data-list.c b/src/language/data-io/data-list.c index 3ee524d5..4fea7a40 100644 --- a/src/language/data-io/data-list.c +++ b/src/language/data-io/data-list.c @@ -673,6 +673,7 @@ read_from_data_list (const struct data_list_pgm *dls, struct ccase *c) static bool read_from_data_list_fixed (const struct data_list_pgm *dls, struct ccase *c) { + enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (dls->reader); struct dls_var_spec *spec; int row; @@ -698,9 +699,10 @@ read_from_data_list_fixed (const struct data_list_pgm *dls, struct ccase *c) if (row < spec->record) break; - data_in (ss_substr (line, spec->first_column - 1, spec->input.w), - spec->input.type, spec->input.d, spec->first_column, - case_data_rw_idx (c, spec->fv), + data_in (ss_substr (line, spec->first_column - 1, + spec->input.w), + encoding, spec->input.type, spec->input.d, + spec->first_column, case_data_rw_idx (c, spec->fv), fmt_var_width (&spec->input)); } @@ -716,6 +718,7 @@ read_from_data_list_fixed (const struct data_list_pgm *dls, struct ccase *c) static bool read_from_data_list_free (const struct data_list_pgm *dls, struct ccase *c) { + enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (dls->reader); struct dls_var_spec *spec; ll_for_each (spec, struct dls_var_spec, ll, &dls->specs) @@ -736,7 +739,7 @@ read_from_data_list_free (const struct data_list_pgm *dls, struct ccase *c) } } - data_in (field, spec->input.type, 0, + data_in (field, encoding, spec->input.type, 0, dfm_get_column (dls->reader, ss_data (field)), case_data_rw_idx (c, spec->fv), fmt_var_width (&spec->input)); } @@ -749,6 +752,7 @@ read_from_data_list_free (const struct data_list_pgm *dls, struct ccase *c) static bool read_from_data_list_list (const struct data_list_pgm *dls, struct ccase *c) { + enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (dls->reader); struct dls_var_spec *spec; if (dfm_eof (dls->reader)) @@ -776,7 +780,7 @@ read_from_data_list_list (const struct data_list_pgm *dls, struct ccase *c) break; } - data_in (field, spec->input.type, 0, + data_in (field, encoding, spec->input.type, 0, dfm_get_column (dls->reader, ss_data (field)), case_data_rw_idx (c, spec->fv), fmt_var_width (&spec->input)); } diff --git a/src/language/data-io/data-reader.c b/src/language/data-io/data-reader.c index c4e71830..e3ef8ca1 100644 --- a/src/language/data-io/data-reader.c +++ b/src/language/data-io/data-reader.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,9 @@ struct dfm_reader size_t pos; /* Offset in line of current character. */ unsigned eof_cnt; /* # of attempts to advance past EOF. */ struct lexer *lexer; /* The lexer reading the file */ + + /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */ + size_t block_left; /* Bytes left in current block. */ }; /* Closes reader R opened by dfm_open_reader(). */ @@ -130,11 +134,13 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer) ds_init_empty (&r->scratch); r->flags = DFM_ADVANCE; r->eof_cnt = 0; + r->block_left = 0; if (fh_get_referent (fh) != FH_REF_INLINE) { r->where.file_name = fh_get_file_name (fh); r->where.line_number = 0; - r->file = fn_open (fh_get_file_name (fh), "rb"); + r->file = fn_open (fh_get_file_name (fh), + fh_get_mode (fh) == FH_MODE_TEXT ? "r" : "rb"); if (r->file == NULL) { msg (ME, _("Could not open \"%s\" for reading as a data file: %s."), @@ -195,46 +201,268 @@ read_inline_record (struct dfm_reader *r) return true; } +/* Report a read error or unexpected end-of-file condition on R. */ +static void +read_error (struct dfm_reader *r) +{ + if (ferror (r->file)) + msg (ME, _("Error reading file %s: %s."), + fh_get_name (r->fh), strerror (errno)); + else if (feof (r->file)) + msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh)); + else + NOT_REACHED (); +} + +/* Report a partial read at end of file reading R. */ +static void +partial_record (struct dfm_reader *r) +{ + msg (ME, _("Unexpected end of file in partial record reading %s."), + fh_get_name (r->fh)); +} + +/* Tries to read SIZE bytes from R into BUFFER. Returns 1 if + successful, 0 if end of file was reached before any bytes + could be read, and -1 if some bytes were read but fewer than + SIZE due to end of file or an error mid-read. In the latter + case, reports an error. */ +static int +try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size) +{ + size_t bytes_read = fread (buffer, 1, size, r->file); + if (bytes_read == size) + return 1; + else if (bytes_read == 0) + return 0; + else + { + partial_record (r); + return -1; + } +} + +/* Type of a descriptor word. */ +enum descriptor_type + { + BLOCK, + RECORD + }; + +/* Reads a block descriptor word or record descriptor word + (according to TYPE) from R. Returns 1 if successful, 0 if + end of file was reached before any bytes could be read, -1 if + an error occurred. Reports an error in the latter case. + + If successful, stores the number of remaining bytes in the + block or record (that is, the block or record length, minus + the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE. + If SEGMENT is nonnull, also stores the segment control + character (SCC) into *SEGMENT. */ +static int +read_descriptor_word (struct dfm_reader *r, enum descriptor_type type, + size_t *remaining_size, int *segment) +{ + uint8_t raw_descriptor[4]; + int status; + + status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor); + if (status <= 0) + return status; + + *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1]; + if (segment != NULL) + *segment = raw_descriptor[2]; + + if (*remaining_size < 4) + { + msg (ME, + (type == BLOCK + ? _("Corrupt block descriptor word at offset 0x%lx in %s.") + : _("Corrupt record descriptor word at offset 0x%lx in %s.")), + (long) ftello (r->file) - 4, fh_get_name (r->fh)); + return -1; + } + + *remaining_size -= 4; + return 1; +} + +/* Reports that reader R has read a corrupt record size. */ +static void +corrupt_size (struct dfm_reader *r) +{ + msg (ME, _("Corrupt record size at offset 0x%lx in %s."), + (long) ftello (r->file) - 4, fh_get_name (r->fh)); +} + +/* Reads a 32-byte little-endian signed number from R and stores + its value into *SIZE_OUT. Returns 1 if successful, 0 if end + of file was reached before any bytes could be read, -1 if an + error occurred. Reports an error in the latter case. Numbers + less than 0 are considered errors. */ +static int +read_size (struct dfm_reader *r, size_t *size_out) +{ + int32_t size; + int status; + + status = try_to_read_fully (r, &size, sizeof size); + if (status <= 0) + return status; + + integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size, + sizeof size); + if (size < 0) + { + corrupt_size (r); + return -1; + } + + *size_out = size; + return 1; +} + /* Reads a record from a disk file into R. - Returns true if successful, false on failure. */ + Returns true if successful, false on error or at end of file. */ static bool read_file_record (struct dfm_reader *r) { assert (r->fh != fh_inline_file ()); + ds_clear (&r->line); - if (fh_get_mode (r->fh) == FH_MODE_TEXT) + switch (fh_get_mode (r->fh)) { - if (!ds_read_line (&r->line, r->file)) + case FH_MODE_TEXT: + if (ds_read_line (&r->line, r->file)) + { + ds_chomp (&r->line, '\n'); + return true; + } + else { if (ferror (r->file)) - msg (ME, _("Error reading file %s: %s."), - fh_get_name (r->fh), strerror (errno)); + read_error (r); return false; } - ds_chomp (&r->line, '\n'); - } - else if (fh_get_mode (r->fh) == FH_MODE_BINARY) - { - size_t record_width = fh_get_record_width (r->fh); - size_t amt = ds_read_stream (&r->line, 1, record_width, r->file); - if (record_width != amt) + return true; + + case FH_MODE_FIXED: + if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file)) + return true; + else { if (ferror (r->file)) - msg (ME, _("Error reading file %s: %s."), - fh_get_name (r->fh), strerror (errno)); - else if (amt != 0) - msg (ME, _("%s: Partial record at end of file."), - fh_get_name (r->fh)); + read_error (r); + else if (!ds_is_empty (&r->line)) + partial_record (r); + return false; + } + return true; + case FH_MODE_VARIABLE: + { + size_t leading_size; + size_t trailing_size; + int status; + + /* Read leading record size. */ + status = read_size (r, &leading_size); + if (status <= 0) return false; + + /* Read record data. */ + if (!ds_read_stream (&r->line, leading_size, 1, r->file)) + { + if (ferror (r->file)) + read_error (r); + else + partial_record (r); + return false; + } + + /* Read trailing record size and check that it's the same + as the leading record size. */ + status = read_size (r, &trailing_size); + if (status <= 0) + { + if (status == 0) + partial_record (r); + return false; + } + if (leading_size != trailing_size) + { + corrupt_size (r); + return false; + } + + return true; + } + + case FH_MODE_360_VARIABLE: + case FH_MODE_360_SPANNED: + for (;;) + { + size_t record_size; + int segment; + int status; + + /* If we've exhausted our current block, start another + one by reading the new block descriptor word. */ + if (r->block_left == 0) + { + status = read_descriptor_word (r, BLOCK, &r->block_left, NULL); + if (status < 0) + return false; + else if (status == 0) + return !ds_is_empty (&r->line); + } + + /* Read record descriptor. */ + if (r->block_left < 4) + { + partial_record (r); + return false; + } + r->block_left -= 4; + status = read_descriptor_word (r, RECORD, &record_size, &segment); + if (status <= 0) + { + if (status == 0) + partial_record (r); + return false; + } + if (record_size > r->block_left) + { + msg (ME, _("Record exceeds remaining block length.")); + return false; + } + + /* Read record data. */ + if (!ds_read_stream (&r->line, record_size, 1, r->file)) + { + if (ferror (r->file)) + read_error (r); + else + partial_record (r); + return false; + } + r->block_left -= record_size; + + /* In variable mode, read only a single record. + In spanned mode, a segment value of 0 should + designate a whole record without spanning, 1 the + first segment in a record, 2 the last segment in a + record, and 3 an intermediate segment in a record. + For compatibility, though, we actually pay attention + only to whether the segment value is even or odd. */ + if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE + || (segment & 1) == 0) + return true; } } - else - NOT_REACHED (); - - r->where.line_number++; - return true; + NOT_REACHED (); } /* Reads a record from R, setting the current position to the @@ -243,9 +471,15 @@ read_file_record (struct dfm_reader *r) static bool read_record (struct dfm_reader *r) { - return (fh_get_referent (r->fh) == FH_REF_FILE - ? read_file_record (r) - : read_inline_record (r)); + if (fh_get_referent (r->fh) == FH_REF_FILE) + { + bool ok = read_file_record (r); + if (ok) + r->where.line_number++; + return ok; + } + else + return read_inline_record (r); } /* Returns the number of attempts, thus far, to advance past @@ -313,7 +547,7 @@ dfm_expand_tabs (struct dfm_reader *r) r->flags |= DFM_TABS_EXPANDED; if (r->fh != fh_inline_file () - && (fh_get_mode (r->fh) == FH_MODE_BINARY + && (fh_get_mode (r->fh) != FH_MODE_TEXT || fh_get_tab_width (r->fh) == 0 || ds_find_char (&r->line, '\t') == SIZE_MAX)) return; @@ -355,6 +589,13 @@ dfm_expand_tabs (struct dfm_reader *r) r->pos = new_pos; } +/* Returns the legacy character encoding of data read from READER. */ +enum legacy_encoding +dfm_reader_get_legacy_encoding (const struct dfm_reader *reader) +{ + return fh_get_legacy_encoding (reader->fh); +} + /* Causes dfm_get_record() or dfm_get_whole_record() to read in the next record the next time it is executed on file HANDLE. */ diff --git a/src/language/data-io/data-reader.h b/src/language/data-io/data-reader.h index d27fbcd4..029dacdf 100644 --- a/src/language/data-io/data-reader.h +++ b/src/language/data-io/data-reader.h @@ -25,6 +25,7 @@ #include #include +#include struct file_handle; struct string; @@ -37,6 +38,8 @@ bool dfm_reader_error (const struct dfm_reader *); unsigned dfm_eof (struct dfm_reader *); struct substring dfm_get_record (struct dfm_reader *); void dfm_expand_tabs (struct dfm_reader *); +enum legacy_encoding dfm_reader_get_legacy_encoding ( + const struct dfm_reader *); /* Line control. */ void dfm_forward_record (struct dfm_reader *); diff --git a/src/language/data-io/data-writer.c b/src/language/data-io/data-writer.c index 32caa1e2..b5df59d4 100644 --- a/src/language/data-io/data-writer.c +++ b/src/language/data-io/data-writer.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include #include @@ -106,7 +108,7 @@ dfm_put_record (struct dfm_writer *w, const char *rec, size_t len) putc ('\n', w->file); break; - case FH_MODE_BINARY: + case FH_MODE_FIXED: { size_t record_width = fh_get_record_width (w->fh); size_t write_bytes = MIN (len, record_width); @@ -122,6 +124,45 @@ dfm_put_record (struct dfm_writer *w, const char *rec, size_t len) } break; + case FH_MODE_VARIABLE: + { + uint32_t size = len; + integer_convert (INTEGER_NATIVE, &size, INTEGER_LSB_FIRST, &size, + sizeof size); + fwrite (&size, sizeof size, 1, w->file); + fwrite (rec, len, 1, w->file); + fwrite (&size, sizeof size, 1, w->file); + } + break; + + case FH_MODE_360_VARIABLE: + case FH_MODE_360_SPANNED: + { + size_t ofs = 0; + if (fh_get_mode (w->fh) == FH_MODE_360_VARIABLE) + len = MIN (65527, len); + while (ofs < len) + { + size_t chunk = MIN (65527, len - ofs); + uint32_t bdw = (chunk + 8) << 16; + int scc = (ofs == 0 && chunk == len ? 0 + : ofs == 0 ? 1 + : ofs + chunk == len ? 2 + : 3); + uint32_t rdw = ((chunk + 4) << 16) | (scc << 8); + + integer_convert (INTEGER_NATIVE, &bdw, INTEGER_MSB_FIRST, &bdw, + sizeof bdw); + integer_convert (INTEGER_NATIVE, &rdw, INTEGER_MSB_FIRST, &rdw, + sizeof rdw); + fwrite (&bdw, 1, sizeof bdw, w->file); + fwrite (&rdw, 1, sizeof rdw, w->file); + fwrite (rec + ofs, 1, chunk, w->file); + ofs += chunk; + } + } + break; + default: NOT_REACHED (); } @@ -157,3 +198,10 @@ dfm_close_writer (struct dfm_writer *w) return ok; } + +/* Returns the legacy character encoding of data written to WRITER. */ +enum legacy_encoding +dfm_writer_get_legacy_encoding (const struct dfm_writer *writer) +{ + return fh_get_legacy_encoding (writer->fh); +} diff --git a/src/language/data-io/data-writer.h b/src/language/data-io/data-writer.h index 6e554654..2142f215 100644 --- a/src/language/data-io/data-writer.h +++ b/src/language/data-io/data-writer.h @@ -27,5 +27,7 @@ struct dfm_writer *dfm_open_writer (struct file_handle *); bool dfm_close_writer (struct dfm_writer *); bool dfm_write_error (const struct dfm_writer *); bool dfm_put_record (struct dfm_writer *, const char *rec, size_t len); +enum legacy_encoding dfm_writer_get_legacy_encoding ( + const struct dfm_writer *); #endif /* data-writer.h */ diff --git a/src/language/data-io/file-handle.q b/src/language/data-io/file-handle.q index d5301773..35fabc08 100644 --- a/src/language/data-io/file-handle.q +++ b/src/language/data-io/file-handle.q @@ -41,7 +41,8 @@ name=string; lrecl=integer; tabwidth=integer "x>=0" "%s must be nonnegative"; - mode=mode:!character/image/scratch. + mode=mode:!character/binary/image/360/scratch; + recform=recform:fixed/f/variable/v/spanned/vs. */ /* (declarations) */ /* (functions) */ @@ -50,8 +51,6 @@ int cmd_file_handle (struct lexer *lexer, struct dataset *ds) { char handle_name[LONG_NAME_LEN + 1]; - struct fh_properties properties = *fh_default_properties (); - struct cmd_file_handle cmd; struct file_handle *handle; @@ -78,39 +77,69 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds) if (lex_end_of_command (lexer) != CMD_SUCCESS) goto lossage; - if (cmd.s_name == NULL && cmd.mode != FH_SCRATCH) + if (cmd.mode != FH_SCRATCH) { - lex_sbc_missing (lexer, "NAME"); - goto lossage; - } + struct fh_properties properties = *fh_default_properties (); - switch (cmd.mode) - { - case FH_CHARACTER: - properties.mode = FH_MODE_TEXT; - if (cmd.sbc_tabwidth) - properties.tab_width = cmd.n_tabwidth[0]; - break; - case FH_IMAGE: - properties.mode = FH_MODE_BINARY; - if (cmd.n_lrecl[0] == LONG_MIN) - msg (SE, _("Fixed-length records were specified on /RECFORM, but " - "record length was not specified on /LRECL. " - "Assuming %zu-character records."), - properties.record_width); - else if (cmd.n_lrecl[0] < 1) - msg (SE, _("Record length (%ld) must be at least one byte. " - "Assuming %zu-character records."), - cmd.n_lrecl[0], properties.record_width); - else - properties.record_width = cmd.n_lrecl[0]; - break; - default: - NOT_REACHED (); - } + if (cmd.s_name == NULL) + { + lex_sbc_missing (lexer, "NAME"); + goto lossage; + } - if (cmd.mode != FH_SCRATCH) - fh_create_file (handle_name, cmd.s_name, &properties); + switch (cmd.mode) + { + case FH_CHARACTER: + properties.mode = FH_MODE_TEXT; + if (cmd.sbc_tabwidth) + properties.tab_width = cmd.n_tabwidth[0]; + break; + case FH_IMAGE: + properties.mode = FH_MODE_FIXED; + break; + case FH_BINARY: + properties.mode = FH_MODE_VARIABLE; + break; + case FH_360: + properties.encoding = LEGACY_EBCDIC; + if (cmd.recform == FH_FIXED || cmd.recform == FH_F) + properties.mode = FH_MODE_FIXED; + else if (cmd.recform == FH_VARIABLE || cmd.recform == FH_V) + { + properties.mode = FH_MODE_360_VARIABLE; + properties.record_width = 8192; + } + else if (cmd.recform == FH_SPANNED || cmd.recform == FH_VS) + { + properties.mode = FH_MODE_360_SPANNED; + properties.record_width = 8192; + } + else + { + msg (SE, _("RECFORM must be specified with MODE=360.")); + goto lossage; + } + break; + default: + NOT_REACHED (); + } + + if (properties.mode == FH_MODE_FIXED || cmd.n_lrecl[0] != LONG_MIN) + { + if (cmd.n_lrecl[0] == LONG_MIN) + msg (SE, _("The specified file mode requires LRECL. " + "Assuming %d-character records."), + properties.record_width); + else if (cmd.n_lrecl[0] < 1 || cmd.n_lrecl[0] >= (1UL << 31)) + msg (SE, _("Record length (%ld) must be between 1 and %lu bytes. " + "Assuming %d-character records."), + cmd.n_lrecl[0], (1UL << 31) - 1, properties.record_width); + else + properties.record_width = cmd.n_lrecl[0]; + } + + fh_create_file (handle_name, cmd.s_name, &properties); + } else fh_create_scratch (handle_name); diff --git a/src/language/data-io/print.c b/src/language/data-io/print.c index 86d69ea9..edbe2fae 100644 --- a/src/language/data-io/print.c +++ b/src/language/data-io/print.c @@ -82,6 +82,7 @@ struct print_trns struct pool *pool; /* Stores related data. */ bool eject; /* Eject page before printing? */ bool include_prefix; /* Prefix lines with space? */ + enum legacy_encoding encoding; /* Encoding to use for output. */ struct dfm_writer *writer; /* Output file, NULL=listing file. */ struct ll_list specs; /* List of struct prt_out_specs. */ size_t record_cnt; /* Number of records to write. */ @@ -194,7 +195,10 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds, trns->writer = dfm_open_writer (fh); if (trns->writer == NULL) goto error; + trns->encoding = dfm_writer_get_legacy_encoding (trns->writer); } + else + trns->encoding = LEGACY_NATIVE; /* Output the variable table if requested. */ if (print_table) @@ -448,6 +452,7 @@ print_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED) { struct print_trns *trns = trns_; bool eject = trns->eject; + char encoded_space = legacy_from_native (trns->encoding, ' '); int record = 1; struct prt_out_spec *spec; @@ -457,20 +462,29 @@ print_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED) { flush_records (trns, spec->record, &eject, &record); - ds_set_length (&trns->line, spec->first_column, ' '); + ds_set_length (&trns->line, spec->first_column, encoded_space); if (spec->type == PRT_VAR) { const union value *input = case_data (c, spec->var); char *output = ds_put_uninit (&trns->line, spec->format.w); if (!spec->sysmis_as_spaces || input->f != SYSMIS) - data_out (input, &spec->format, output); + data_out_legacy (input, trns->encoding, &spec->format, output); else - memset (output, ' ', spec->format.w); + memset (output, encoded_space, spec->format.w); if (spec->add_space) - ds_put_char (&trns->line, ' '); + ds_put_char (&trns->line, encoded_space); } else - ds_put_substring (&trns->line, ds_ss (&spec->string)); + { + ds_put_substring (&trns->line, ds_ss (&spec->string)); + if (trns->encoding != LEGACY_NATIVE) + { + size_t length = ds_length (&spec->string); + char *data = ss_data (ds_tail (&trns->line, length)); + legacy_recode (LEGACY_NATIVE, data, + trns->encoding, data, length); + } + } } flush_records (trns, trns->record_cnt + 1, &eject, &record); @@ -501,7 +515,7 @@ flush_records (struct print_trns *trns, int target_record, else leader = '1'; } - line[0] = leader; + line[0] = legacy_from_native (trns->encoding, leader); if (trns->writer == NULL) tab_output_text (TAB_FIX | TAT_NOWRAP, &line[1]); diff --git a/src/language/expressions/operations.def b/src/language/expressions/operations.def index 2e94d2ec..3056b886 100644 --- a/src/language/expressions/operations.def +++ b/src/language/expressions/operations.def @@ -573,7 +573,7 @@ string function RTRIM (string s, string c) function NUMBER (string s, ni_format f) { union value out; - data_in (ss_head (s, f->w), f->type, f->d, 0, &out, 0); + data_in (ss_head (s, f->w), LEGACY_NATIVE, f->type, f->d, 0, &out, 0); return out.f; } diff --git a/src/language/lexer/range-parser.c b/src/language/lexer/range-parser.c index 42395184..6a3af097 100644 --- a/src/language/lexer/range-parser.c +++ b/src/language/lexer/range-parser.c @@ -98,7 +98,8 @@ parse_number (struct lexer *lexer, double *x, const enum fmt_type *format) else if (lex_token (lexer) == T_STRING && format != NULL) { union value v; - data_in (ds_ss (lex_tokstr (lexer)), *format, 0, 0, &v, 0); + data_in (ds_ss (lex_tokstr (lexer)), LEGACY_NATIVE, + *format, 0, 0, &v, 0); lex_get (lexer); *x = v.f; if (*x == SYSMIS) diff --git a/src/language/xforms/recode.c b/src/language/xforms/recode.c index f4ae92f4..1cb29b84 100644 --- a/src/language/xforms/recode.c +++ b/src/language/xforms/recode.c @@ -606,7 +606,8 @@ find_src_string (struct recode_trns *trns, const char *value, int width) union value uv; msg_disable (); - match = data_in (ss_buffer (value, width), FMT_F, 0, 0, &uv, 0); + match = data_in (ss_buffer (value, width), LEGACY_NATIVE, + FMT_F, 0, 0, &uv, 0); msg_enable (); out->value.f = uv.f; break; diff --git a/src/libpspp/ChangeLog b/src/libpspp/ChangeLog index 4d9186f5..d237d1fe 100644 --- a/src/libpspp/ChangeLog +++ b/src/libpspp/ChangeLog @@ -1,3 +1,8 @@ +2007-11-08 Ben Pfaff + + * str.c (ds_read_stream): Change return value semantics to be more + useful. Update all users. + 2007-11-03 John Darrington * i18n.c i18n.h: Added convertor from UTF8 to system. diff --git a/src/libpspp/str.c b/src/libpspp/str.c index 6e446f85..225c559d 100644 --- a/src/libpspp/str.c +++ b/src/libpspp/str.c @@ -19,6 +19,7 @@ #include "str.h" #include +#include #include #include @@ -1217,8 +1218,8 @@ ds_read_config_line (struct string *st, int *line_number, FILE *stream) /* Attempts to read SIZE * CNT bytes from STREAM and append them to ST. - Returns number of bytes actually read. */ -size_t + Returns true if all the requested data was read, false otherwise. */ +bool ds_read_stream (struct string *st, size_t size, size_t cnt, FILE *stream) { if (size != 0) @@ -1227,12 +1228,18 @@ ds_read_stream (struct string *st, size_t size, size_t cnt, FILE *stream) if (size_in_bounds_p (xsum (ds_length (st), try_bytes))) { char *buffer = ds_put_uninit (st, try_bytes); - size_t got_bytes = fread (buffer, size, cnt, stream); + size_t got_bytes = fread (buffer, 1, try_bytes, stream); ds_truncate (st, ds_length (st) - (try_bytes - got_bytes)); - return got_bytes; + return got_bytes == try_bytes; + } + else + { + errno = ENOMEM; + return false; } } - return 0; + else + return true; } /* Concatenates S onto ST. */ diff --git a/src/libpspp/str.h b/src/libpspp/str.h index 4a806373..8357732c 100644 --- a/src/libpspp/str.h +++ b/src/libpspp/str.h @@ -202,7 +202,7 @@ char *ds_cstr (const struct string *); /* File input. */ bool ds_read_line (struct string *, FILE *); bool ds_read_config_line (struct string *, int *line_number, FILE *); -size_t ds_read_stream (struct string *, size_t size, size_t cnt, FILE *stream); +bool ds_read_stream (struct string *, size_t size, size_t cnt, FILE *stream); /* Append. */ void ds_put_char (struct string *, int ch); diff --git a/src/ui/gui/find-dialog.c b/src/ui/gui/find-dialog.c index 0fc43620..432afa15 100644 --- a/src/ui/gui/find-dialog.c +++ b/src/ui/gui/find-dialog.c @@ -597,6 +597,7 @@ value_comparator_create (const struct variable *var, const char *target) vc->pattern = value_create (width); if ( ! data_in (ss_cstr (target), + LEGACY_NATIVE, fmt->type, 0, 0, vc->pattern, width) ) diff --git a/src/ui/gui/helper.c b/src/ui/gui/helper.c index c322321d..5ed0e988 100644 --- a/src/ui/gui/helper.c +++ b/src/ui/gui/helper.c @@ -91,7 +91,7 @@ text_to_value (const gchar *text, union value *v, } msg_disable (); - ok = data_in (ss_cstr (text), format.type, 0, 0, + ok = data_in (ss_cstr (text), LEGACY_NATIVE, format.type, 0, 0, v, fmt_var_width (&format)); msg_enable (); diff --git a/src/ui/gui/psppire-case-file.c b/src/ui/gui/psppire-case-file.c index eef28a1f..8b194528 100644 --- a/src/ui/gui/psppire-case-file.c +++ b/src/ui/gui/psppire-case-file.c @@ -294,7 +294,7 @@ psppire_case_file_data_in (PsppireCaseFile *cf, casenumber casenum, gint idx, width = fmt_var_width (fmt); value = xmalloca (value_cnt_from_width (width) * sizeof *value); ok = (datasheet_get_value (cf->datasheet, casenum, idx, value, width) - && data_in (input, fmt->type, 0, 0, value, width) + && data_in (input, LEGACY_NATIVE, fmt->type, 0, 0, value, width) && datasheet_put_value (cf->datasheet, casenum, idx, value, width)); if (ok) diff --git a/tests/ChangeLog b/tests/ChangeLog index 3137a02d..eeee122e 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,12 @@ +2007-11-08 Ben Pfaff + + Patch #6256: add support for binary, 360 file formats. Reviewed + by John Darrington. + + * automake.mk: Add new file. + + * formats/360.sh: New test. + 2007-11-07 Ben Pfaff * bugs/overwrite-input-file.sh: Don't use non-portable "diff -B". diff --git a/tests/automake.mk b/tests/automake.mk index 4d5891dd..7f461fa9 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -79,6 +79,7 @@ dist_TESTS = \ tests/formats/time-out.sh \ tests/formats/wkday-in.sh \ tests/formats/wkday-out.sh \ + tests/formats/360.sh \ tests/bugs/agg_crash.sh \ tests/bugs/agg-crash-2.sh \ tests/bugs/alpha-freq.sh \ diff --git a/tests/formats/360.sh b/tests/formats/360.sh new file mode 100755 index 00000000..c813e648 --- /dev/null +++ b/tests/formats/360.sh @@ -0,0 +1,254 @@ +#! /bin/sh + +# Tests BINARY and 360 data file formats. + +TEMPDIR=/tmp/pspp-tst-$$ + +# ensure that top_builddir are absolute +if [ -z "$top_builddir" ] ; then top_builddir=. ; fi +if [ -z "$top_srcdir" ] ; then top_srcdir=. ; fi +top_builddir=`cd $top_builddir; pwd` +PSPP=$top_builddir/src/ui/terminal/pspp +: ${PERL:=perl} + +# ensure that top_srcdir is absolute +top_srcdir=`cd $top_srcdir; pwd` + +STAT_CONFIG_PATH=$top_srcdir/config +export STAT_CONFIG_PATH + + +cleanup() +{ + cd / + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + + + +cd $TEMPDIR + +activity="create data file" +cat > $TEMPDIR/input.data < $TEMPDIR/make-360.pl <<'EOF' +use strict; +use warnings; + +# ASCII to EBCDIC translation table +our ($ascii2ebcdic) = "" +. "\x00\x01\x02\x03\x37\x2d\x2e\x2f" +. "\x16\x05\x25\x0b\x0c\x0d\x0e\x0f" +. "\x10\x11\x12\x13\x3c\x3d\x32\x26" +. "\x18\x19\x3f\x27\x1c\x1d\x1e\x1f" +. "\x40\x5a\x7f\x7b\x5b\x6c\x50\x7d" +. "\x4d\x5d\x5c\x4e\x6b\x60\x4b\x61" +. "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" +. "\xf8\xf9\x7a\x5e\x4c\x7e\x6e\x6f" +. "\x7c\xc1\xc2\xc3\xc4\xc5\xc6\xc7" +. "\xc8\xc9\xd1\xd2\xd3\xd4\xd5\xd6" +. "\xd7\xd8\xd9\xe2\xe3\xe4\xe5\xe6" +. "\xe7\xe8\xe9\xad\xe0\xbd\x9a\x6d" +. "\x79\x81\x82\x83\x84\x85\x86\x87" +. "\x88\x89\x91\x92\x93\x94\x95\x96" +. "\x97\x98\x99\xa2\xa3\xa4\xa5\xa6" +. "\xa7\xa8\xa9\xc0\x4f\xd0\x5f\x07" +. "\x20\x21\x22\x23\x24\x15\x06\x17" +. "\x28\x29\x2a\x2b\x2c\x09\x0a\x1b" +. "\x30\x31\x1a\x33\x34\x35\x36\x08" +. "\x38\x39\x3a\x3b\x04\x14\x3e\xe1" +. "\x41\x42\x43\x44\x45\x46\x47\x48" +. "\x49\x51\x52\x53\x54\x55\x56\x57" +. "\x58\x59\x62\x63\x64\x65\x66\x67" +. "\x68\x69\x70\x71\x72\x73\x74\x75" +. "\x76\x77\x78\x80\x8a\x8b\x8c\x8d" +. "\x8e\x8f\x90\x6a\x9b\x9c\x9d\x9e" +. "\x9f\xa0\xaa\xab\xac\x4a\xae\xaf" +. "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" +. "\xb8\xb9\xba\xbb\xbc\xa1\xbe\xbf" +. "\xca\xcb\xcc\xcd\xce\xcf\xda\xdb" +. "\xdc\xdd\xde\xdf\xea\xeb\xec\xed" +. "\xee\xef\xfa\xfb\xfc\xfd\xfe\xff"; +length ($ascii2ebcdic) == 256 || die; + +open (INPUT, '<', 'input.data') or die "input.data: open: $!\n"; +my (@data) = or die; +close (INPUT) or die; +chomp $_ foreach @data; + +# Binary mode. +open (OUTPUT, '>', 'binary.bin') or die "binary.bin: create: $!\n"; +for $_ (@data) { + my ($reclen) = pack ("V", length); + print OUTPUT $reclen, $_, $reclen; +} +close (OUTPUT) or die; + +# Fixed mode. +open (OUTPUT, '>', 'fixed.bin') or die "fixed.bin: create: $!\n"; +my ($lrecl) = 32; +for $_ (@data) { + my ($out) = substr ($_, 0, $lrecl); + $out .= ' ' x ($lrecl - length ($out)); + length ($out) == 32 or die; + print OUTPUT a2e ($out); +} +close (OUTPUT) or die; + +# Variable mode. +open (OUTPUT, '>', 'variable.bin') or die "variable.bin: create: $!\n"; +our (@records); +for $_ (@data) { + push (@records, pack ("n xx", length ($_) + 4) . a2e ($_)); +} +dump_records (); +close (OUTPUT) or die; + +# Spanned mode. +open (OUTPUT, '>', 'spanned.bin') or die "spanned.bin: create: $!\n"; +for my $line (@data) { + local ($_) = $line; + my (@r); + while (length) { + my ($n) = min (int (rand (5)), length); + push (@r, substr ($_, 0, $n, '')); + } + foreach my $i (0...$#r) { + my $scc = ($#r == 0 ? 0 + : $i == 0 ? 1 + : $i == $#r ? 2 + : 3); + push (@records, + pack ("nCx", length ($r[$i]) + 4, $scc) . a2e ($r[$i])); + } +} +dump_records (); +close (OUTPUT) or die; + +sub a2e { + local ($_) = @_; + my ($s) = ""; + foreach (split (//)) { + $s .= substr ($ascii2ebcdic, ord, 1); + } + return $s; +} + +sub min { + my ($a, $b) = @_; + return $a < $b ? $a : $b +} + +sub dump_records { + while (@records) { + my ($n) = min (int (rand (5)) + 1, scalar (@records)); + my (@r) = splice (@records, 0, $n); + my ($len) = 0; + $len += length foreach @r; + print OUTPUT pack ("n xx", $len + 4); + print OUTPUT foreach @r; + } +} +EOF +if [ $? -ne 0 ] ; then no_result ; fi + +activity="running make-360.pl" +$PERL make-360.pl +if [ $? -ne 0 ] ; then no_result ; fi + +binary_fh='mode=binary' +fixed_fh='mode=360 /recform=fixed /lrecl=32' +variable_fh='mode=360 /recform=variable' +spanned_fh='mode=360 /recform=spanned' + +for type in binary fixed variable spanned; do + activity="create $type.pspp" + eval fh=\$${type}_fh + cat > $type.pspp <