For example, this allows a Swedish EBCDIC file that doesn't contain
any indication of its codepage to be read with "ENCODING='IBM278'".
14 files changed:
- HOST has been updated to use more modern syntax.
- HOST has been updated to use more modern syntax.
- - INCLUDE and INSERT have a new ENCODING subcommand.
+ - GET, INCLUDE, and INSERT have a new ENCODING subcommand.
- MISSING VALUES can now assign missing values to long string
variables.
- MISSING VALUES can now assign missing values to long string
variables.
/DROP=var_list
/KEEP=var_list
/RENAME=(src_names=target_names)@dots{}
/DROP=var_list
/KEEP=var_list
/RENAME=(src_names=target_names)@dots{}
@end display
@cmd{GET} clears the current dictionary and active dataset and
@end display
@cmd{GET} clears the current dictionary and active dataset and
file on disk. Only the active dataset read from the file
is affected by these subcommands.
file on disk. Only the active dataset read from the file
is affected by these subcommands.
+PSPP tries to automatically detect the encoding of string data in the
+file. Sometimes, however, this does not work well,
+especially for files written by old versions of SPSS or PSPP. Specify
+the ENCODING subcommand with an IANA character set name as its string
+argument to override the default. The ENCODING subcommand is a PSPP
+extension.
+
@cmd{GET} does not cause the data to be read, only the dictionary. The data
is read later, when a procedure is executed.
@cmd{GET} does not cause the data to be read, only the dictionary. The data
is read later, when a procedure is executed.
fh_create_file (NULL, name, fh_default_properties () );
sri = xmalloc (sizeof (*sri));
fh_create_file (NULL, name, fh_default_properties () );
sri = xmalloc (sizeof (*sri));
- sri->reader = sfm_open_reader (fh, &sri->dict, &sri->opts);
+ sri->reader = sfm_open_reader (fh, NULL, &sri->dict, &sri->opts);
if ( NULL == sri->reader)
{
if ( NULL == sri->reader)
{
/* PSPP - a program for statistical analysis.
/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
/* Returns a casereader for HANDLE. On success, returns the new
casereader and stores the file's dictionary into *DICT. On
/* Returns a casereader for HANDLE. On success, returns the new
casereader and stores the file's dictionary into *DICT. On
- failure, returns a null pointer. */
+ failure, returns a null pointer.
+
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding. */
-any_reader_open (struct file_handle *handle, struct dictionary **dict)
+any_reader_open (struct file_handle *handle, const char *encoding,
+ struct dictionary **dict)
{
switch (fh_get_referent (handle))
{
{
switch (fh_get_referent (handle))
{
if (result == IO_ERROR)
return NULL;
else if (result == YES)
if (result == IO_ERROR)
return NULL;
else if (result == YES)
- return sfm_open_reader (handle, dict, NULL);
+ return sfm_open_reader (handle, encoding, dict, NULL);
result = try_detect (fh_get_file_name (handle), pfm_detect);
if (result == IO_ERROR)
result = try_detect (fh_get_file_name (handle), pfm_detect);
if (result == IO_ERROR)
/* PSPP - a program for statistical analysis.
/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2010 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2010, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
struct file_handle;
struct dictionary;
bool any_reader_may_open (const char *file_name);
struct file_handle;
struct dictionary;
bool any_reader_may_open (const char *file_name);
-struct casereader *any_reader_open (struct file_handle *,
+struct casereader *any_reader_open (struct file_handle *, const char *encoding,
struct dictionary **);
#endif /* any-reader.h */
struct dictionary **);
#endif /* any-reader.h */
/* Opens the system file designated by file handle FH for reading. Reads the
system file's dictionary into *DICT.
/* Opens the system file designated by file handle FH for reading. Reads the
system file's dictionary into *DICT.
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding.
+
If INFO is non-null, then it receives additional info about the system file,
which the caller must eventually free with sfm_read_info_destroy() when it
is no longer needed. */
struct casereader *
If INFO is non-null, then it receives additional info about the system file,
which the caller must eventually free with sfm_read_info_destroy() when it
is no longer needed. */
struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
- struct sfm_read_info *infop)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+ struct dictionary **dictp, struct sfm_read_info *infop)
{
struct sfm_reader *volatile r = NULL;
struct sfm_read_info info;
{
struct sfm_reader *volatile r = NULL;
struct sfm_read_info info;
First, figure out the correct character encoding, because this determines
how the rest of the header data is to be interpreted. */
First, figure out the correct character encoding, because this determines
how the rest of the header data is to be interpreted. */
- dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER],
- extensions[EXT_ENCODING]));
+ dict = dict_create (encoding
+ ? encoding
+ : choose_encoding (r, &header, extensions[EXT_INTEGER],
+ extensions[EXT_ENCODING]));
r->encoding = dict_get_encoding (dict);
/* These records don't use variables at all. */
r->encoding = dict_get_encoding (dict);
/* These records don't use variables at all. */
struct dictionary;
struct file_handle;
struct dictionary;
struct file_handle;
-struct casereader *sfm_open_reader (struct file_handle *,
+struct casereader *sfm_open_reader (struct file_handle *, const char *encoding,
struct dictionary **,
struct sfm_read_info *);
bool sfm_detect (FILE *);
struct dictionary **,
struct sfm_read_info *);
bool sfm_detect (FILE *);
bool compress; /* 1=compressed, 0=not compressed. */
casenumber case_cnt; /* Number of cases written so far. */
bool compress; /* 1=compressed, 0=not compressed. */
casenumber case_cnt; /* Number of cases written so far. */
+ uint8_t space; /* ' ' in the file's character encoding. */
/* Compression buffering.
/* Compression buffering.
sfm_open_writer (struct file_handle *fh, struct dictionary *d,
struct sfm_write_options opts)
{
sfm_open_writer (struct file_handle *fh, struct dictionary *d,
struct sfm_write_options opts)
{
+ struct encoding_info encoding_info;
struct sfm_writer *w;
mode_t mode;
int i;
struct sfm_writer *w;
mode_t mode;
int i;
+ get_encoding_info (&encoding_info, dict_get_encoding (d));
+ w->space = encoding_info.space[0];
+
/* Write the file header. */
write_header (w, d);
/* Write the file header. */
write_header (w, d);
size_t n_mrsets;
size_t i;
size_t n_mrsets;
size_t i;
+ if (is_encoding_ebcdic_compatible (encoding))
+ {
+ /* FIXME. */
+ return;
+ }
+
n_mrsets = dict_get_n_mrsets (dict);
if (n_mrsets == 0)
return;
n_mrsets = dict_get_n_mrsets (dict);
if (n_mrsets == 0)
return;
assert (w->data_cnt < 8);
assert (size <= 8);
assert (w->data_cnt < 8);
assert (size <= 8);
- memset (w->data[w->data_cnt], ' ', 8);
+ memset (w->data[w->data_cnt], w->space, 8);
memcpy (w->data[w->data_cnt], data, size);
w->data_cnt++;
}
memcpy (w->data[w->data_cnt], data, size);
w->data_cnt++;
}
size_t pad_bytes = width - data_bytes;
write_bytes (w, string, data_bytes);
while (pad_bytes-- > 0)
size_t pad_bytes = width - data_bytes;
write_bytes (w, string, data_bytes);
while (pad_bytes-- > 0)
+ putc (w->space, w->file);
}
/* Recodes null-terminated UTF-8 encoded STRING into ENCODING, and writes the
}
/* Recodes null-terminated UTF-8 encoded STRING into ENCODING, and writes the
write_spaces (struct sfm_writer *w, size_t n)
{
while (n-- > 0)
write_spaces (struct sfm_writer *w, size_t n)
{
while (n-- > 0)
+ putc (w->space, w->file);
/* PSPP - a program for statistical analysis.
/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
if (file->handle == NULL)
goto error;
if (file->handle == NULL)
goto error;
- file->reader = any_reader_open (file->handle, &file->dict);
+ file->reader = any_reader_open (file->handle, NULL, &file->dict);
if (file->reader == NULL)
goto error;
}
if (file->reader == NULL)
goto error;
}
/* PSPP - a program for statistical analysis.
/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
struct file_handle *fh = NULL;
struct dictionary *dict = NULL;
struct case_map *map = NULL;
struct file_handle *fh = NULL;
struct dictionary *dict = NULL;
struct case_map *map = NULL;
if (fh == NULL)
goto error;
}
if (fh == NULL)
goto error;
}
+ else if (command == GET_CMD && lex_match_id (lexer, "ENCODING"))
+ {
+ lex_match (lexer, T_EQUALS);
+
+ if (!lex_force_string (lexer))
+ goto error;
+
+ free (encoding);
+ encoding = ss_xstrdup (lex_tokss (lexer));
+
+ lex_get (lexer);
+ }
else if (command == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
{
lex_match (lexer, T_EQUALS);
else if (command == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
{
lex_match (lexer, T_EQUALS);
- reader = any_reader_open (fh, &dict);
+ reader = any_reader_open (fh, encoding, &dict);
if (reader == NULL)
goto error;
if (reader == NULL)
goto error;
dataset_set_source (ds, reader);
fh_unref (fh);
dataset_set_source (ds, reader);
fh_unref (fh);
return CMD_SUCCESS;
error:
return CMD_SUCCESS;
error:
casereader_destroy (reader);
if (dict != NULL)
dict_destroy (dict);
casereader_destroy (reader);
if (dict != NULL)
dict_destroy (dict);
return CMD_CASCADING_FAILURE;
}
return CMD_CASCADING_FAILURE;
}
/* PSPP - a program for statistical analysis.
/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
if (!handle)
return CMD_FAILURE;
handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
if (!handle)
return CMD_FAILURE;
- reader = any_reader_open (handle, &dict);
+ reader = any_reader_open (handle, NULL, &dict);
fh_unref (handle);
if (dict == NULL)
return CMD_FAILURE;
fh_unref (handle);
if (dict == NULL)
return CMD_FAILURE;
if (!h)
return CMD_FAILURE;
if (!h)
return CMD_FAILURE;
- reader = sfm_open_reader (h, &d, &info);
+ reader = sfm_open_reader (h, NULL, &d, &info);
if (!reader)
{
fh_unref (h);
if (!reader)
{
fh_unref (h);
"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
"abcdefghijklmnopqrstuvwxyz{|}~");
"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
"abcdefghijklmnopqrstuvwxyz{|}~");
- struct substring out, cr, lf;
+ struct substring out, cr, lf, space;
bool ok;
memset (e, 0, sizeof *e);
cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
bool ok;
memset (e, 0, sizeof *e);
cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
- ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
+ space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
+ ok = (cr.length >= 1
+ && cr.length <= MAX_UNIT
+ && cr.length == lf.length
+ && cr.length == space.length);
if (!ok)
{
fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
ss_dealloc (&cr);
ss_dealloc (&lf);
if (!ok)
{
fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
ss_dealloc (&cr);
ss_dealloc (&lf);
ss_alloc_substring (&cr, ss_cstr ("\r"));
ss_alloc_substring (&lf, ss_cstr ("\n"));
ss_alloc_substring (&cr, ss_cstr ("\r"));
ss_alloc_substring (&lf, ss_cstr ("\n"));
+ ss_alloc_substring (&space, ss_cstr (" "));
}
e->unit = cr.length;
memcpy (e->cr, cr.string, e->unit);
memcpy (e->lf, lf.string, e->unit);
}
e->unit = cr.length;
memcpy (e->cr, cr.string, e->unit);
memcpy (e->lf, lf.string, e->unit);
+ memcpy (e->space, space.string, e->unit);
ss_dealloc (&cr);
ss_dealloc (&lf);
ss_dealloc (&cr);
ss_dealloc (&lf);
out = recode_substring_pool ("UTF-8", name, in, NULL);
e->is_ascii_compatible = ss_equals (in, out);
out = recode_substring_pool ("UTF-8", name, in, NULL);
e->is_ascii_compatible = ss_equals (in, out);
int unit; /* Unit width, in bytes. */
char cr[MAX_UNIT]; /* \r in encoding, 'unit' bytes long. */
char lf[MAX_UNIT]; /* \n in encoding, 'unit' bytes long. */
int unit; /* Unit width, in bytes. */
char cr[MAX_UNIT]; /* \r in encoding, 'unit' bytes long. */
char lf[MAX_UNIT]; /* \n in encoding, 'unit' bytes long. */
+ char space[MAX_UNIT]; /* ' ' in encoding, 'unit' bytes long. */
};
bool get_encoding_info (struct encoding_info *, const char *name);
};
bool get_encoding_info (struct encoding_info *, const char *name);