From 5135a2c6c97e5dccab6010d19473003bcdb0cc9f Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 16 Apr 2012 23:04:02 -0700 Subject: [PATCH] work on data list passes most tests still need to: * add ENCODING subcommand to PRINT * add tests * document --- doc/data-io.texi | 22 ++++- src/data/file-handle-def.c | 30 ++++--- src/data/file-handle-def.h | 4 +- src/language/data-io/data-list.c | 13 +-- src/language/data-io/data-parser.c | 6 +- src/language/data-io/data-reader.c | 129 +++++++++++++++++++++-------- src/language/data-io/data-reader.h | 7 +- src/language/data-io/data-writer.c | 50 ++++++++--- src/language/data-io/data-writer.h | 7 +- src/language/data-io/file-handle.q | 16 ++-- src/language/data-io/get-data.c | 20 ++++- src/language/data-io/inpt-pgm.c | 35 +++++--- src/language/data-io/print-space.c | 16 +++- src/language/data-io/print.c | 16 +++- 14 files changed, 268 insertions(+), 103 deletions(-) diff --git a/doc/data-io.texi b/doc/data-io.texi index 4862ccc596..142c725b20 100644 --- a/doc/data-io.texi +++ b/doc/data-io.texi @@ -277,8 +277,9 @@ external file. It may be used to specify a file name as a string or a file handle (@pxref{File Handles}). If the @subcmd{FILE} subcommand is not used, then input is assumed to be specified within the command file using @cmd{BEGIN DATA}@dots{}@cmd{END DATA} (@pxref{BEGIN DATA}). -The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE} subcommand is also used. -It specifies the character encoding of the file. +The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE} +subcommand is also used. It specifies the character encoding of the +file. @xref{INSERT}, for information on supported encodings. The optional @subcmd{RECORDS} subcommand, which takes a single integer as an argument, is used to specify the number of lines per record. @@ -503,7 +504,8 @@ of quoting is allowed. The @subcmd{NOTABLE} and @subcmd{TABLE} subcommands are as in @cmd{DATA LIST FIXED} above. 
@subcmd{NOTABLE} is the default. -The @subcmd{FILE} and @subcmd{SKIP} subcommands are as in @cmd{DATA LIST FIXED} above. +The @subcmd{FILE}, @subcmd{SKIP}, and @subcmd{ENCODING} subcommands +are as in @cmd{DATA LIST FIXED} above. The variables to be parsed are given as a single list of variable names. This list must be introduced by a single slash (@samp{/}). The set of @@ -525,7 +527,7 @@ on field width apply, but they are honored on output. DATA LIST LIST [(@{TAB,'@var{c}'@}, @dots{})] [@{NOTABLE,TABLE@}] - [FILE='@var{file_name'} [ENCODING='@var{encoding}']] + [FILE='@var{file_name}' [ENCODING='@var{encoding}']] [SKIP=@var{record_count}] /@var{var_spec}@dots{} @@ -572,18 +574,21 @@ For text files: /NAME='@var{file_name} [/MODE=CHARACTER] /TABWIDTH=@var{tab_width} + [/ENCODING='@var{encoding}'] For binary files in native encoding with fixed-length records: FILE HANDLE @var{handle_name} /NAME='@var{file_name}' /MODE=IMAGE [/LRECL=@var{rec_len}] + [/ENCODING='@var{encoding}'] For binary files in native encoding with variable-length records: FILE HANDLE @var{handle_name} /NAME='@var{file_name}' /MODE=BINARY [/LRECL=@var{rec_len}] + [/ENCODING='@var{encoding}'] For binary files encoded in EBCDIC: FILE HANDLE @var{handle_name} @@ -591,6 +596,7 @@ For binary files encoded in EBCDIC: /MODE=360 /RECFORM=@{FIXED,VARIABLE,SPANNED@} [/LRECL=@var{rec_len}] + [/ENCODING='@var{encoding}'] @end display Use @cmd{FILE HANDLE} to associate a file handle name with a file and @@ -726,6 +732,14 @@ The @subcmd{NAME} subcommand specifies the name of the file associated with the handle. It is required in all modes but SCRATCH mode, in which its use is forbidden. +The @subcmd{ENCODING} subcommand specifies the encoding of text in the file. +For reading text files in CHARACTER mode, all of the forms described +for @subcmd{ENCODING} on the @cmd{INSERT} command are supported (@pxref{INSERT}).
+For reading in other file-based modes, encoding autodetection is not +supported; if the specified encoding requests autodetection then the +default encoding will be used. This is also true when a file handle +is used for writing a file in any mode. + @node INPUT PROGRAM @section INPUT PROGRAM @vindex INPUT PROGRAM diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c index 6ca6977c87..9a46bfad43 100644 --- a/src/data/file-handle-def.c +++ b/src/data/file-handle-def.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,12 +26,13 @@ #include "data/dataset.h" #include "data/file-name.h" #include "data/variable.h" +#include "libpspp/cast.h" #include "libpspp/compiler.h" +#include "libpspp/hash-functions.h" #include "libpspp/hmap.h" #include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/str.h" -#include "libpspp/hash-functions.h" #include "gl/xalloc.h" @@ -50,11 +51,11 @@ struct file_handle /* FH_REF_FILE only. */ char *file_name; /* File name as provided by user. */ enum fh_mode mode; /* File mode. */ - const char *encoding; /* File encoding. */ /* FH_REF_FILE and FH_REF_INLINE only. */ size_t record_width; /* Length of fixed-format records. */ size_t tab_width; /* Tab width, 0=do not expand tabs. */ + char *encoding; /* Charset for contents. */ /* FH_REF_DATASET only. */ struct dataset *ds; /* Dataset. 
*/ @@ -71,7 +72,8 @@ static struct file_handle *default_handle; static struct file_handle *inline_file; static struct file_handle *create_handle (const char *id, - char *name, enum fh_referent); + char *name, enum fh_referent, + const char *encoding); static void free_handle (struct file_handle *); static void unname_handle (struct file_handle *); @@ -82,7 +84,8 @@ static struct hmap locks = HMAP_INITIALIZER (locks); void fh_init (void) { - inline_file = create_handle ("INLINE", xstrdup ("INLINE"), FH_REF_INLINE); + inline_file = create_handle ("INLINE", xstrdup ("INLINE"), FH_REF_INLINE, + "Auto"); inline_file->record_width = 80; inline_file->tab_width = 8; } @@ -110,6 +113,7 @@ free_handle (struct file_handle *handle) free (handle->id); free (handle->name); free (handle->file_name); + free (handle->encoding); free (handle); } @@ -189,7 +193,8 @@ fh_from_id (const char *id) The new handle is not fully initialized. The caller is responsible for completing its initialization. */ static struct file_handle * -create_handle (const char *id, char *handle_name, enum fh_referent referent) +create_handle (const char *id, char *handle_name, enum fh_referent referent, + const char *encoding) { struct file_handle *handle = xzalloc (sizeof *handle); @@ -197,6 +202,7 @@ create_handle (const char *id, char *handle_name, enum fh_referent referent) handle->id = id != NULL ? xstrdup (id) : NULL; handle->name = handle_name; handle->referent = referent; + handle->encoding = xstrdup (encoding); if (id != NULL) { @@ -231,12 +237,11 @@ fh_create_file (const char *id, const char *file_name, struct file_handle *handle; handle_name = id != NULL ? 
xstrdup (id) : xasprintf ("`%s'", file_name); - handle = create_handle (id, handle_name, FH_REF_FILE); + handle = create_handle (id, handle_name, FH_REF_FILE, properties->encoding); handle->file_name = xstrdup (file_name); handle->mode = properties->mode; handle->record_width = properties->record_width; handle->tab_width = properties->tab_width; - handle->encoding = properties->encoding; return handle; } @@ -253,7 +258,7 @@ fh_create_dataset (struct dataset *ds) if (name[0] == '\0') name = _("active dataset"); - handle = create_handle (NULL, xstrdup (name), FH_REF_DATASET); + handle = create_handle (NULL, xstrdup (name), FH_REF_DATASET, C_ENCODING); handle->ds = ds; return handle; } @@ -263,7 +268,7 @@ const struct fh_properties * fh_default_properties (void) { static const struct fh_properties default_properties - = {FH_MODE_TEXT, 1024, 4, C_ENCODING}; + = {FH_MODE_TEXT, 1024, 4, (char *) "Auto"}; return &default_properties; } @@ -333,10 +338,9 @@ fh_get_tab_width (const struct file_handle *handle) /* Returns the encoding of characters read from HANDLE. */ const char * -fh_get_legacy_encoding (const struct file_handle *handle) +fh_get_encoding (const struct file_handle *handle) { - assert (handle->referent & (FH_REF_FILE | FH_REF_INLINE)); - return (handle->referent == FH_REF_FILE ? handle->encoding : C_ENCODING); + return handle->encoding; } /* Returns the dataset handle associated with HANDLE. diff --git a/src/data/file-handle-def.h b/src/data/file-handle-def.h index 11898ef578..9a60e72423 100644 --- a/src/data/file-handle-def.h +++ b/src/data/file-handle-def.h @@ -55,7 +55,7 @@ struct fh_properties enum fh_mode mode; /* File mode. */ size_t record_width; /* Length of fixed-format records. */ size_t tab_width; /* Tab width, 0=do not expand tabs. */ - const char *encoding; /* ASCII or EBCDIC? */ + char *encoding; /* Charset for contents. 
*/ }; void fh_init (void); @@ -82,6 +82,7 @@ struct file_handle *fh_inline_file (void); const char *fh_get_id (const struct file_handle *); const char *fh_get_name (const struct file_handle *); enum fh_referent fh_get_referent (const struct file_handle *); +const char *fh_get_encoding (const struct file_handle *); /* Properties of FH_REF_FILE file handles. */ const char *fh_get_file_name (const struct file_handle *); @@ -90,7 +91,6 @@ enum fh_mode fh_get_mode (const struct file_handle *) ; /* Properties of FH_REF_FILE and FH_REF_INLINE file handles. */ size_t fh_get_record_width (const struct file_handle *); size_t fh_get_tab_width (const struct file_handle *); -const char *fh_get_legacy_encoding (const struct file_handle *); /* Properties of FH_REF_DATASET file handles. */ struct dataset *fh_get_dataset (const struct file_handle *); diff --git a/src/language/data-io/data-list.c b/src/language/data-io/data-list.c index f16c60651c..17c6032d25 100644 --- a/src/language/data-io/data-list.c +++ b/src/language/data-io/data-list.c @@ -78,7 +78,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds) struct dfm_reader *reader; struct variable *end = NULL; struct file_handle *fh = NULL; - struct string encoding = DS_EMPTY_INITIALIZER; + char *encoding = NULL; int table; enum data_parser_type type; @@ -111,7 +111,8 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds) if (!lex_force_string (lexer)) goto error; - ds_init_substring (&encoding, lex_tokss (lexer)); + free (encoding); + encoding = ss_xstrdup (lex_tokss (lexer)); lex_get (lexer); } @@ -241,7 +242,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds) } type = data_parser_get_type (parser); - if (! ds_is_empty (&encoding) && NULL == fh) + if (encoding && NULL == fh) msg (MW, _("Encoding should not be specified for inline data. 
It will be " "ignored.")); @@ -278,7 +279,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds) if (table) data_parser_output_description (parser, fh); - reader = dfm_open_reader (fh, lexer); + reader = dfm_open_reader (fh, lexer, encoding); if (reader == NULL) goto error; @@ -294,7 +295,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds) data_parser_make_active_file (parser, ds, reader, dict); fh_unref (fh); - ds_destroy (&encoding); + free (encoding); return CMD_SUCCESS; @@ -303,7 +304,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds) if (!in_input_program ()) dict_destroy (dict); fh_unref (fh); - ds_destroy (&encoding); + free (encoding); return CMD_CASCADING_FAILURE; } diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index aea3bbd037..1dc7c93f77 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -527,7 +527,7 @@ static bool parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - const char *input_encoding = dfm_reader_get_legacy_encoding (reader); + const char *input_encoding = dfm_reader_get_encoding (reader); const char *output_encoding = dict_get_encoding (parser->dict); struct field *f; int row; @@ -579,7 +579,7 @@ static bool parse_delimited_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - const char *input_encoding = dfm_reader_get_legacy_encoding (reader); + const char *input_encoding = dfm_reader_get_encoding (reader); const char *output_encoding = dict_get_encoding (parser->dict); struct string tmp = DS_EMPTY_INITIALIZER; struct field *f; @@ -623,7 +623,7 @@ static bool parse_delimited_no_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - const char *input_encoding = dfm_reader_get_legacy_encoding (reader); + const char *input_encoding = dfm_reader_get_encoding (reader); const char *output_encoding = dict_get_encoding (parser->dict); 
struct string tmp = DS_EMPTY_INITIALIZER; struct substring s; diff --git a/src/language/data-io/data-reader.c b/src/language/data-io/data-reader.c index 0f96e589cc..ea95bc9832 100644 --- a/src/language/data-io/data-reader.c +++ b/src/language/data-io/data-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-2004, 2006, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,7 +34,9 @@ #include "language/lexer/lexer.h" #include "libpspp/assertion.h" #include "libpspp/cast.h" +#include "libpspp/encoding-guesser.h" #include "libpspp/integer-format.h" +#include "libpspp/line-reader.h" #include "libpspp/message.h" #include "libpspp/str.h" @@ -69,6 +71,10 @@ struct dfm_reader size_t pos; /* Offset in line of current character. */ unsigned eof_cnt; /* # of attempts to advance past EOF. */ struct lexer *lexer; /* The lexer reading the file */ + char *encoding; /* Current encoding. */ + + /* For FH_MODE_TEXT only. */ + struct line_reader *line_reader; /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */ size_t block_left; /* Bytes left in current block. */ @@ -101,19 +107,28 @@ dfm_close_reader (struct dfm_reader *r) } } + line_reader_free (r->line_reader); + free (r->encoding); fh_unref (r->fh); ds_destroy (&r->line); ds_destroy (&r->scratch); free (r); } -/* Opens the file designated by file handle FH for reading as a - data file. Providing fh_inline_file() for FH designates the - "inline file", that is, data included inline in the command - file between BEGIN FILE and END FILE. Returns a reader if - successful, or a null pointer otherwise. */ +/* Opens the file designated by file handle FH for reading as a data file. + Returns a reader if successful, or a null pointer otherwise. 
+ + If FH is fh_inline_file() then the new reader reads data included inline in + the command file between BEGIN FILE and END FILE, obtaining data from LEXER. + LEXER must remain valid as long as the new reader is in use. ENCODING is + ignored. + + If FH is not fh_inline_file(), then the encoding of the file read is by + default that of FH itself. If ENCODING is nonnull, then it overrides the + default encoding. LEXER is ignored. */ struct dfm_reader * -dfm_open_reader (struct file_handle *fh, struct lexer *lexer) +dfm_open_reader (struct file_handle *fh, struct lexer *lexer, + const char *encoding) { struct dfm_reader *r; struct fh_lock *lock; @@ -147,10 +162,6 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer) { msg (ME, _("Could not open `%s' for reading as a data file: %s."), fh_get_file_name (r->fh), strerror (errno)); - fh_unlock (r->lock); - fh_unref (fh); - free (r); - return NULL; } r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1; } @@ -158,14 +169,43 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer) r->file_size = -1; fh_lock_set_aux (lock, r); + if (encoding == NULL) + encoding = fh_get_encoding (fh); + if (fh_get_referent (fh) == FH_REF_FILE && fh_get_mode (fh) == FH_MODE_TEXT) + { + r->line_reader = line_reader_for_fd (encoding, fileno (r->file)); + if (r->line_reader == NULL) + { + msg (ME, _("Could not read `%s' as a text file with encoding `%s': " + "%s."), + fh_get_file_name (r->fh), encoding, strerror (errno)); + goto error; + } + r->encoding = xstrdup (line_reader_get_encoding (r->line_reader)); + } + else + { + r->line_reader = NULL; + r->encoding = xstrdup (encoding_guess_parse_encoding (encoding)); + } + return r; + +error: + fh_unlock (r->lock); + fh_unref (fh); + free (r); + return NULL; } /* Returns true if an I/O error occurred on READER, false otherwise. 
*/ bool dfm_reader_error (const struct dfm_reader *r) { - return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file); + return (fh_get_referent (r->fh) == FH_REF_FILE + && (r->line_reader != NULL + ? line_reader_error (r->line_reader) != 0 + : ferror (r->file))); } /* Reads a record from the inline file into R. @@ -211,17 +251,12 @@ read_inline_record (struct dfm_reader *r) return true; } -/* Report a read error or unexpected end-of-file condition on R. */ +/* Report a read error on R. */ static void read_error (struct dfm_reader *r) { - if (ferror (r->file)) - msg (ME, _("Error reading file %s: %s."), - fh_get_name (r->fh), strerror (errno)); - else if (feof (r->file)) - msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh)); - else - NOT_REACHED (); + msg (ME, _("Error reading file %s: %s."), + fh_get_name (r->fh), strerror (errno)); } /* Report a partial read at end of file reading R. */ @@ -333,6 +368,34 @@ read_size (struct dfm_reader *r, size_t *size_out) return 1; } +static bool +read_text_record (struct dfm_reader *r) +{ + bool is_auto; + bool ok; + + /* Read a line. If the line reader's encoding changes, update r->encoding to + match. */ + is_auto = line_reader_is_auto (r->line_reader); + ok = line_reader_read (r->line_reader, &r->line, SIZE_MAX); + if (is_auto && !line_reader_is_auto (r->line_reader)) + { + free (r->encoding); + r->encoding = xstrdup (line_reader_get_encoding (r->line_reader)); + } + + /* Detect and report read error. */ + if (!ok) + { + int error = line_reader_error (r->line_reader); + if (error != 0) + msg (ME, _("Error reading file %s: %s."), + fh_get_name (r->fh), strerror (error)); + } + + return ok; +} + /* Reads a record from a disk file into R. Returns true if successful, false on error or at end of file. 
*/ static bool @@ -344,17 +407,7 @@ read_file_record (struct dfm_reader *r) switch (fh_get_mode (r->fh)) { case FH_MODE_TEXT: - if (ds_read_line (&r->line, r->file, SIZE_MAX)) - { - ds_chomp_byte (&r->line, '\n'); - return true; - } - else - { - if (ferror (r->file)) - read_error (r); - return false; - } + return read_text_record (r); case FH_MODE_FIXED: if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file)) @@ -597,11 +650,11 @@ dfm_expand_tabs (struct dfm_reader *r) r->pos = new_pos; } -/* Returns the legacy character encoding of data read from READER. */ +/* Returns the character encoding of data read from READER. */ const char * -dfm_reader_get_legacy_encoding (const struct dfm_reader *reader) +dfm_reader_get_encoding (const struct dfm_reader *reader) { - return fh_get_legacy_encoding (reader->fh); + return reader->encoding; } /* Returns a number between 0 and 100 that approximates the @@ -615,7 +668,11 @@ dfm_get_percent_read (const struct dfm_reader *reader) { if (reader->file_size >= 0) { - off_t position = ftello (reader->file); + off_t position; + + position = (reader->line_reader != NULL + ? line_reader_tell (reader->line_reader) + : ftello (reader->file)); if (position >= 0) { double p = 100.0 * position / reader->file_size; @@ -710,7 +767,7 @@ cmd_begin_data (struct lexer *lexer, struct dataset *ds) lex_match (lexer, T_ENDCMD); /* Open inline file. */ - r = dfm_open_reader (fh_inline_file (), lexer); + r = dfm_open_reader (fh_inline_file (), lexer, NULL); r->flags |= DFM_SAW_BEGIN_DATA; r->flags &= ~DFM_CONSUME; diff --git a/src/language/data-io/data-reader.h b/src/language/data-io/data-reader.h index affff78815..a199f015af 100644 --- a/src/language/data-io/data-reader.h +++ b/src/language/data-io/data-reader.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,13 +31,14 @@ struct string; struct lexer; /* Input. */ -struct dfm_reader *dfm_open_reader (struct file_handle *, struct lexer *); +struct dfm_reader *dfm_open_reader (struct file_handle *, struct lexer *, + const char *encoding); void dfm_close_reader (struct dfm_reader *); bool dfm_reader_error (const struct dfm_reader *); unsigned dfm_eof (struct dfm_reader *); struct substring dfm_get_record (struct dfm_reader *); void dfm_expand_tabs (struct dfm_reader *); -const char *dfm_reader_get_legacy_encoding (const struct dfm_reader *); +const char *dfm_reader_get_encoding (const struct dfm_reader *); int dfm_get_percent_read (const struct dfm_reader *); /* Line control. */ diff --git a/src/language/data-io/data-writer.c b/src/language/data-io/data-writer.c index 113be58805..5270db0e81 100644 --- a/src/language/data-io/data-writer.c +++ b/src/language/data-io/data-writer.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-2004, 2006, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,7 +28,9 @@ #include "data/make-file.h" #include "language/data-io/file-handle.h" #include "libpspp/assertion.h" +#include "libpspp/encoding-guesser.h" #include "libpspp/integer-format.h" +#include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/str.h" @@ -46,14 +48,31 @@ struct dfm_writer struct fh_lock *lock; /* Exclusive access to file. */ FILE *file; /* Associated file. */ struct replace_file *rf; /* Atomic file replacement support. */ + char *encoding; /* Encoding. */ + + int unit; /* Unit width, in bytes. */ + char lf[MAX_UNIT]; /* \n in encoding, 'unit' bytes long. 
*/ + char spaces[32]; /* 32 bytes worth of ' ' in encoding. */ }; -/* Opens a file handle for writing as a data file. */ +/* Opens a file handle for writing as a data file. + + The encoding of the file written is by default that of FH itself. If + ENCODING is nonnull, then it overrides the default encoding. + + *However*: ENCODING directly affects only text strings written by the data + writer code itself, that is, new-lines in FH_MODE_TEXT and space padding in + FH_MODE_FIXED mode. The client must do its own encoding translation for the + data that it writes. (This is unavoidable because sometimes the data + written includes binary data that reencoding would mangle.) The client can + obtain the encoding to re-encode into with dfm_writer_get_encoding(). */ struct dfm_writer * -dfm_open_writer (struct file_handle *fh) +dfm_open_writer (struct file_handle *fh, const char *encoding) { + struct encoding_info ei; struct dfm_writer *w; struct fh_lock *lock; + int ofs; lock = fh_lock (fh, FH_REF_FILE, N_("data file"), FH_ACC_WRITE, false); if (lock == NULL) @@ -63,11 +82,22 @@ dfm_open_writer (struct file_handle *fh) if (w != NULL) return w; + encoding = encoding_guess_parse_encoding (encoding != NULL + ? 
encoding + : fh_get_encoding (fh)); + get_encoding_info (&ei, encoding); + w = xmalloc (sizeof *w); w->fh = fh_ref (fh); w->lock = lock; w->rf = replace_file_start (fh_get_file_name (w->fh), "wb", 0666, &w->file, NULL); + w->encoding = xstrdup (encoding); + w->unit = ei.unit; + memcpy (w->lf, ei.lf, sizeof w->lf); + for (ofs = 0; ofs + ei.unit <= sizeof w->spaces; ofs += ei.unit) + memcpy (&w->spaces[ofs], ei.space, ei.unit); + if (w->rf == NULL) { msg (ME, _("An error occurred while opening `%s' for writing " @@ -104,7 +134,7 @@ dfm_put_record (struct dfm_writer *w, const char *rec, size_t len) { case FH_MODE_TEXT: fwrite (rec, len, 1, w->file); - putc ('\n', w->file); + fwrite (w->lf, w->unit, 1, w->file); break; case FH_MODE_FIXED: @@ -115,9 +145,8 @@ dfm_put_record (struct dfm_writer *w, const char *rec, size_t len) fwrite (rec, write_bytes, 1, w->file); while (pad_bytes > 0) { - static const char spaces[32] = " "; - size_t chunk = MIN (pad_bytes, sizeof spaces); - fwrite (spaces, chunk, 1, w->file); + size_t chunk = MIN (pad_bytes, sizeof w->spaces); + fwrite (w->spaces, chunk, 1, w->file); pad_bytes -= chunk; } } @@ -193,14 +222,15 @@ dfm_close_writer (struct dfm_writer *w) ok = false; } fh_unref (w->fh); + free (w->encoding); free (w); return ok; } -/* Returns the legacy character encoding of data written to WRITER. */ +/* Returns the encoding of data written to WRITER. */ const char * -dfm_writer_get_legacy_encoding (const struct dfm_writer *writer) +dfm_writer_get_encoding (const struct dfm_writer *writer) { - return fh_get_legacy_encoding (writer->fh); + return writer->encoding; } diff --git a/src/language/data-io/data-writer.h b/src/language/data-io/data-writer.h index 045db3163f..10ad6cd656 100644 --- a/src/language/data-io/data-writer.h +++ b/src/language/data-io/data-writer.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. 
+ Copyright (C) 1997-9, 2000, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,10 +23,11 @@ #include struct file_handle; -struct dfm_writer *dfm_open_writer (struct file_handle *); +struct dfm_writer *dfm_open_writer (struct file_handle *, + const char *encoding); bool dfm_close_writer (struct dfm_writer *); bool dfm_write_error (const struct dfm_writer *); bool dfm_put_record (struct dfm_writer *, const char *rec, size_t len); -const char *dfm_writer_get_legacy_encoding (const struct dfm_writer *); +const char *dfm_writer_get_encoding (const struct dfm_writer *); #endif /* data-writer.h */ diff --git a/src/language/data-io/file-handle.q b/src/language/data-io/file-handle.q index 0519803e7f..26dfc97e0c 100644 --- a/src/language/data-io/file-handle.q +++ b/src/language/data-io/file-handle.q @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,20 +16,22 @@ #include +#include "data/file-handle-def.h" + #include #include #include #include "data/file-name.h" #include "data/session.h" +#include "data/variable.h" #include "language/command.h" #include "language/data-io/file-handle.h" #include "language/lexer/lexer.h" #include "libpspp/assertion.h" +#include "libpspp/cast.h" #include "libpspp/message.h" #include "libpspp/str.h" -#include "data/variable.h" -#include "data/file-handle-def.h" #include "gl/xalloc.h" @@ -45,7 +47,8 @@ lrecl=integer; tabwidth=integer; mode=mode:!character/binary/image/360; - recform=recform:fixed/f/variable/v/spanned/vs. + recform=recform:fixed/f/variable/v/spanned/vs; + encoding=string. 
*/ /* (declarations) */ /* (functions) */ @@ -109,7 +112,7 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds) properties.mode = FH_MODE_VARIABLE; break; case FH_360: - properties.encoding = "EBCDIC-US"; + properties.encoding = CONST_CAST (char *, "EBCDIC-US"); if (cmd.recform == FH_FIXED || cmd.recform == FH_F) properties.mode = FH_MODE_FIXED; else if (cmd.recform == FH_VARIABLE || cmd.recform == FH_V) @@ -146,6 +149,9 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds) properties.record_width = cmd.n_lrecl[0]; } + if (cmd.s_encoding != NULL) + properties.encoding = cmd.s_encoding; + fh_create_file (handle_name, cmd.s_name, &properties); result = CMD_SUCCESS; diff --git a/src/language/data-io/get-data.c b/src/language/data-io/get-data.c index 4274f959d2..10d59aa374 100644 --- a/src/language/data-io/get-data.c +++ b/src/language/data-io/get-data.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -307,6 +307,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds) struct dictionary *dict = dict_create (get_default_encoding ()); struct file_handle *fh = NULL; struct dfm_reader *reader = NULL; + char *encoding = NULL; char *name = NULL; int record; @@ -334,7 +335,18 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds) if (!lex_force_match (lexer, T_SLASH)) goto error; - if (lex_match_id (lexer, "ARRANGEMENT")) + if (lex_match_id (lexer, "ENCODING")) + { + lex_match (lexer, T_EQUALS); + if (!lex_force_string (lexer)) + goto error; + + free (encoding); + encoding = ss_xstrdup (lex_tokss (lexer)); + + lex_get (lexer); + } + else if (lex_match_id (lexer, "ARRANGEMENT")) { bool ok; @@ -606,12 +618,13 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds) } while (lex_token (lexer) != T_ENDCMD); - reader = dfm_open_reader (fh, lexer); + reader = dfm_open_reader (fh, lexer, encoding); if (reader == NULL) goto error; data_parser_make_active_file (parser, ds, reader, dict); fh_unref (fh); + free (encoding); return CMD_SUCCESS; error: @@ -619,6 +632,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds) dict_destroy (dict); fh_unref (fh); free (name); + free (encoding); return CMD_CASCADING_FAILURE; } diff --git a/src/language/data-io/inpt-pgm.c b/src/language/data-io/inpt-pgm.c index 6f2a99e038..36c58c8591 100644 --- a/src/language/data-io/inpt-pgm.c +++ b/src/language/data-io/inpt-pgm.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -266,6 +266,7 @@ cmd_reread (struct lexer *lexer, struct dataset *ds) struct file_handle *fh; /* File to be re-read. */ struct expression *e; /* Expression for column to set. */ struct reread_trns *t; /* Created transformation. */ + char *encoding = NULL; fh = fh_get_default_handle (); e = NULL; @@ -278,13 +279,12 @@ cmd_reread (struct lexer *lexer, struct dataset *ds) if (e) { lex_sbc_only_once ("COLUMN"); - expr_free (e); - return CMD_CASCADING_FAILURE; + goto error; } e = expr_parse (lexer, ds, EXPR_NUMBER); if (!e) - return CMD_CASCADING_FAILURE; + goto error; } else if (lex_match_id (lexer, "FILE")) { @@ -292,26 +292,39 @@ cmd_reread (struct lexer *lexer, struct dataset *ds) fh_unref (fh); fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL); if (fh == NULL) - { - expr_free (e); - return CMD_CASCADING_FAILURE; - } + goto error; + } + else if (lex_match_id (lexer, "ENCODING")) + { + lex_match (lexer, T_EQUALS); + if (!lex_force_string (lexer)) + goto error; + + free (encoding); + encoding = ss_xstrdup (lex_tokss (lexer)); + + lex_get (lexer); } else { lex_error (lexer, NULL); - expr_free (e); - return CMD_CASCADING_FAILURE; + goto error; } } t = xmalloc (sizeof *t); - t->reader = dfm_open_reader (fh, lexer); + t->reader = dfm_open_reader (fh, lexer, encoding); t->column = e; add_transformation (ds, reread_trns_proc, reread_trns_free, t); fh_unref (fh); + free (encoding); return CMD_SUCCESS; + +error: + expr_free (e); + free (encoding); + return CMD_CASCADING_FAILURE; } /* Executes a REREAD transformation. 
*/ diff --git a/src/language/data-io/print-space.c b/src/language/data-io/print-space.c index edaf13e769..adeb92ba5b 100644 --- a/src/language/data-io/print-space.c +++ b/src/language/data-io/print-space.c @@ -51,6 +51,7 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds) struct file_handle *handle = NULL; struct expression *expr = NULL; struct dfm_writer *writer; + char *encoding = NULL; if (lex_match_id (lexer, "OUTFILE")) { @@ -59,6 +60,17 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds) handle = fh_parse (lexer, FH_REF_FILE, NULL); if (handle == NULL) return CMD_FAILURE; + + if (lex_match_id (lexer, "ENCODING")) + { + lex_match (lexer, T_EQUALS); + if (!lex_force_string (lexer)) + goto error; + + encoding = ss_xstrdup (lex_tokss (lexer)); + + lex_get (lexer); + } } else handle = NULL; @@ -77,7 +89,7 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds) if (handle != NULL) { - writer = dfm_open_writer (handle); + writer = dfm_open_writer (handle, encoding); if (writer == NULL) goto error; } @@ -124,7 +136,7 @@ print_space_trns_proc (void *t_, struct ccase **c, if (trns->writer == NULL) text_item_submit (text_item_create (TEXT_ITEM_BLANK_LINE, "")); else - dfm_put_record (trns->writer, " ", 1); + dfm_put_record (trns->writer, " ", 1); /* XXX */ if (trns->writer != NULL && dfm_write_error (trns->writer)) return TRNS_ERROR; diff --git a/src/language/data-io/print.c b/src/language/data-io/print.c index 86952e0efe..cffa3bd49f 100644 --- a/src/language/data-io/print.c +++ b/src/language/data-io/print.c @@ -136,6 +136,7 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds, bool print_table = 0; struct print_trns *trns; struct file_handle *fh = NULL; + char *encoding = NULL; struct pool *tmp_pool; /* Fill in prt to facilitate error-handling. 
*/ @@ -160,6 +161,17 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds, if (fh == NULL) goto error; } + else if (lex_match_id (lexer, "ENCODING")) + { + lex_match (lexer, T_EQUALS); + if (!lex_force_string (lexer)) + goto error; + + free (encoding); + encoding = ss_xstrdup (lex_tokss (lexer)); + + lex_get (lexer); + } else if (lex_match_id (lexer, "RECORDS")) { lex_match (lexer, T_EQUALS); @@ -194,10 +206,10 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds, if (fh != NULL) { - trns->writer = dfm_open_writer (fh); + trns->writer = dfm_open_writer (fh, encoding); if (trns->writer == NULL) goto error; - trns->encoding = dfm_writer_get_legacy_encoding (trns->writer); + trns->encoding = dfm_writer_get_encoding (trns->writer); } else trns->encoding = UTF8; -- 2.30.2