From: Ben Pfaff Date: Wed, 8 Feb 2012 06:58:09 +0000 (-0800) Subject: GET: Add an ENCODING subcommand. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=21559edd9991628d96df331e5b391ca6bade3497 GET: Add an ENCODING subcommand. For example, this allows a Swedish EBCDIC file that doesn't contain any indication of its codepage to be read with "ENCODING='IBM278'". --- diff --git a/NEWS b/NEWS index 916c7d4848..aa8c0df021 100644 --- a/NEWS +++ b/NEWS @@ -40,7 +40,7 @@ Changes from 0.6.2 to 0.7.9: - HOST has been updated to use more modern syntax. - - INCLUDE and INSERT have a new ENCODING subcommand. + - GET, INCLUDE, and INSERT have a new ENCODING subcommand. - MISSING VALUES can now assign missing values to long string variables. diff --git a/doc/files.texi b/doc/files.texi index cdce0a3c46..86aecab52c 100644 --- a/doc/files.texi +++ b/doc/files.texi @@ -139,6 +139,7 @@ GET /DROP=var_list /KEEP=var_list /RENAME=(src_names=target_names)@dots{} + /ENCODING='encoding' @end display @cmd{GET} clears the current dictionary and active dataset and @@ -171,6 +172,13 @@ Each may be present any number of times. @cmd{GET} never modifies a file on disk. Only the active dataset read from the file is affected by these subcommands. +PSPP tries to automatically detect the encoding of string data in the +file. Sometimes, however, this does not work well, +especially for files written by old versions of SPSS or PSPP. Specify +the ENCODING subcommand with an IANA character set name as its string +argument to override the default. The ENCODING subcommand is a PSPP +extension. + @cmd{GET} does not cause the data to be read, only the dictionary. The data is read later, when a procedure is executed. 
diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs index f6afa29b10..834ec401f4 100644 --- a/perl-module/PSPP.xs +++ b/perl-module/PSPP.xs @@ -709,7 +709,7 @@ CODE: fh_create_file (NULL, name, fh_default_properties () ); sri = xmalloc (sizeof (*sri)); - sri->reader = sfm_open_reader (fh, &sri->dict, &sri->opts); + sri->reader = sfm_open_reader (fh, NULL, &sri->dict, &sri->opts); if ( NULL == sri->reader) { diff --git a/src/data/any-reader.c b/src/data/any-reader.c index 50feb6892e..1b488f208a 100644 --- a/src/data/any-reader.c +++ b/src/data/any-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -80,9 +80,15 @@ any_reader_may_open (const char *file) /* Returns a casereader for HANDLE. On success, returns the new casereader and stores the file's dictionary into *DICT. On - failure, returns a null pointer. */ + failure, returns a null pointer. + + Ordinarily the reader attempts to automatically detect the character + encoding based on the file's contents. This isn't always possible, + especially for files written by old versions of SPSS or PSPP, so specifying + a nonnull ENCODING overrides the choice of character encoding. 
*/ struct casereader * -any_reader_open (struct file_handle *handle, struct dictionary **dict) +any_reader_open (struct file_handle *handle, const char *encoding, + struct dictionary **dict) { switch (fh_get_referent (handle)) { @@ -94,7 +100,7 @@ any_reader_open (struct file_handle *handle, struct dictionary **dict) if (result == IO_ERROR) return NULL; else if (result == YES) - return sfm_open_reader (handle, dict, NULL); + return sfm_open_reader (handle, encoding, dict, NULL); result = try_detect (fh_get_file_name (handle), pfm_detect); if (result == IO_ERROR) diff --git a/src/data/any-reader.h b/src/data/any-reader.h index e999aa33f5..fb36e99cce 100644 --- a/src/data/any-reader.h +++ b/src/data/any-reader.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ struct file_handle; struct dictionary; bool any_reader_may_open (const char *file_name); -struct casereader *any_reader_open (struct file_handle *, +struct casereader *any_reader_open (struct file_handle *, const char *encoding, struct dictionary **); #endif /* any-reader.h */ diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 024b4ae182..7e8bcf0de3 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -312,12 +312,17 @@ sfm_read_info_destroy (struct sfm_read_info *info) /* Opens the system file designated by file handle FH for reading. Reads the system file's dictionary into *DICT. + Ordinarily the reader attempts to automatically detect the character + encoding based on the file's contents. This isn't always possible, + especially for files written by old versions of SPSS or PSPP, so specifying + a nonnull ENCODING overrides the choice of character encoding. 
+ If INFO is non-null, then it receives additional info about the system file, which the caller must eventually free with sfm_read_info_destroy() when it is no longer needed. */ struct casereader * -sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, - struct sfm_read_info *infop) +sfm_open_reader (struct file_handle *fh, const char *volatile encoding, + struct dictionary **dictp, struct sfm_read_info *infop) { struct sfm_reader *volatile r = NULL; struct sfm_read_info info; @@ -454,8 +459,10 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, First, figure out the correct character encoding, because this determines how the rest of the header data is to be interpreted. */ - dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER], - extensions[EXT_ENCODING])); + dict = dict_create (encoding + ? encoding + : choose_encoding (r, &header, extensions[EXT_INTEGER], + extensions[EXT_ENCODING])); r->encoding = dict_get_encoding (dict); /* These records don't use variables at all. */ diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h index be01277235..a8f16e10db 100644 --- a/src/data/sys-file-reader.h +++ b/src/data/sys-file-reader.h @@ -52,7 +52,7 @@ void sfm_read_info_destroy (struct sfm_read_info *); struct dictionary; struct file_handle; -struct casereader *sfm_open_reader (struct file_handle *, +struct casereader *sfm_open_reader (struct file_handle *, const char *encoding, struct dictionary **, struct sfm_read_info *); bool sfm_detect (FILE *); diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index cf7bc70b22..5003ca2b89 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -74,6 +74,7 @@ struct sfm_writer bool compress; /* 1=compressed, 0=not compressed. */ casenumber case_cnt; /* Number of cases written so far. */ + uint8_t space; /* ' ' in the file's character encoding. */ /* Compression buffering. 
@@ -176,6 +177,7 @@ struct casewriter * sfm_open_writer (struct file_handle *fh, struct dictionary *d, struct sfm_write_options opts) { + struct encoding_info encoding_info; struct sfm_writer *w; mode_t mode; int i; @@ -227,6 +229,9 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, goto error; } + get_encoding_info (&encoding_info, dict_get_encoding (d)); + w->space = encoding_info.space[0]; + /* Write the file header. */ write_header (w, d); @@ -712,6 +717,12 @@ write_mrsets (struct sfm_writer *w, const struct dictionary *dict, size_t n_mrsets; size_t i; + if (is_encoding_ebcdic_compatible (encoding)) + { + /* FIXME. */ + return; + } + n_mrsets = dict_get_n_mrsets (dict); if (n_mrsets == 0) return; @@ -1251,7 +1262,7 @@ put_cmp_string (struct sfm_writer *w, const void *data, size_t size) assert (w->data_cnt < 8); assert (size <= 8); - memset (w->data[w->data_cnt], ' ', 8); + memset (w->data[w->data_cnt], w->space, 8); memcpy (w->data[w->data_cnt], data, size); w->data_cnt++; } @@ -1313,7 +1324,7 @@ write_string (struct sfm_writer *w, const char *string, size_t width) size_t pad_bytes = width - data_bytes; write_bytes (w, string, data_bytes); while (pad_bytes-- > 0) - putc (' ', w->file); + putc (w->space, w->file); } /* Recodes null-terminated UTF-8 encoded STRING into ENCODING, and writes the @@ -1374,5 +1385,5 @@ static void write_spaces (struct sfm_writer *w, size_t n) { while (n-- > 0) - putc (' ', w->file); + putc (w->space, w->file); } diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c index f306cad6f6..21736da8c6 100644 --- a/src/language/data-io/combine-files.c +++ b/src/language/data-io/combine-files.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -227,7 +227,7 @@ combine_files (enum comb_command_type command, if (file->handle == NULL) goto error; - file->reader = any_reader_open (file->handle, &file->dict); + file->reader = any_reader_open (file->handle, NULL, &file->dict); if (file->reader == NULL) goto error; } diff --git a/src/language/data-io/get.c b/src/language/data-io/get.c index d32f25567a..35b894a750 100644 --- a/src/language/data-io/get.c +++ b/src/language/data-io/get.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -73,6 +73,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, struct file_handle *fh = NULL; struct dictionary *dict = NULL; struct case_map *map = NULL; + char *encoding = NULL; for (;;) { @@ -87,6 +88,18 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, if (fh == NULL) goto error; } + else if (command == GET_CMD && lex_match_id (lexer, "ENCODING")) + { + lex_match (lexer, T_EQUALS); + + if (!lex_force_string (lexer)) + goto error; + + free (encoding); + encoding = ss_xstrdup (lex_tokss (lexer)); + + lex_get (lexer); + } else if (command == IMPORT_CMD && lex_match_id (lexer, "TYPE")) { lex_match (lexer, T_EQUALS); @@ -108,7 +121,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, goto error; } - reader = any_reader_open (fh, &dict); + reader = any_reader_open (fh, encoding, &dict); if (reader == NULL) goto error; @@ -130,6 +143,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, dataset_set_source (ds, reader); fh_unref (fh); + free (encoding); return CMD_SUCCESS; 
error: @@ -137,5 +151,6 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, casereader_destroy (reader); if (dict != NULL) dict_destroy (dict); + free (encoding); return CMD_CASCADING_FAILURE; } diff --git a/src/language/dictionary/apply-dictionary.c b/src/language/dictionary/apply-dictionary.c index c2de9318ae..8531ba56d2 100644 --- a/src/language/dictionary/apply-dictionary.c +++ b/src/language/dictionary/apply-dictionary.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -53,7 +53,7 @@ cmd_apply_dictionary (struct lexer *lexer, struct dataset *ds) handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds)); if (!handle) return CMD_FAILURE; - reader = any_reader_open (handle, &dict); + reader = any_reader_open (handle, NULL, &dict); fh_unref (handle); if (dict == NULL) return CMD_FAILURE; diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c index bb990c79d9..31a685aa82 100644 --- a/src/language/dictionary/sys-file-info.c +++ b/src/language/dictionary/sys-file-info.c @@ -81,7 +81,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) if (!h) return CMD_FAILURE; - reader = sfm_open_reader (h, &d, &info); + reader = sfm_open_reader (h, NULL, &d, &info); if (!reader) { fh_unref (h); diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 149ad6fb2f..9658866056 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -694,29 +694,37 @@ get_encoding_info (struct encoding_info *e, const char *name) "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" "abcdefghijklmnopqrstuvwxyz{|}~"); - struct substring out, cr, lf; + struct substring out, cr, lf, space; bool ok; memset (e, 0, sizeof *e); cr = 
recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL); lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL); - ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length; + space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL); + ok = (cr.length >= 1 + && cr.length <= MAX_UNIT + && cr.length == lf.length + && cr.length == space.length); if (!ok) { fprintf (stderr, "warning: encoding `%s' is not supported.\n", name); ss_dealloc (&cr); ss_dealloc (&lf); + ss_dealloc (&space); ss_alloc_substring (&cr, ss_cstr ("\r")); ss_alloc_substring (&lf, ss_cstr ("\n")); + ss_alloc_substring (&space, ss_cstr (" ")); } e->unit = cr.length; memcpy (e->cr, cr.string, e->unit); memcpy (e->lf, lf.string, e->unit); + memcpy (e->space, space.string, e->unit); ss_dealloc (&cr); ss_dealloc (&lf); + ss_dealloc (&space); out = recode_substring_pool ("UTF-8", name, in, NULL); e->is_ascii_compatible = ss_equals (in, out); diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index 27ccce361e..383ff12da5 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -134,6 +134,7 @@ struct encoding_info int unit; /* Unit width, in bytes. */ char cr[MAX_UNIT]; /* \r in encoding, 'unit' bytes long. */ char lf[MAX_UNIT]; /* \n in encoding, 'unit' bytes long. */ + char space[MAX_UNIT]; /* ' ' in encoding, 'unit' bytes long. */ }; bool get_encoding_info (struct encoding_info *, const char *name);