- HOST has been updated to use more modern syntax.
- - INCLUDE and INSERT have a new ENCODING subcommand.
+ - GET, INCLUDE, and INSERT have a new ENCODING subcommand.
- MISSING VALUES can now assign missing values to long string
variables.
/DROP=var_list
/KEEP=var_list
/RENAME=(src_names=target_names)@dots{}
+ /ENCODING='encoding'
@end display
@cmd{GET} clears the current dictionary and active dataset and
file on disk. Only the active dataset read from the file
is affected by these subcommands.
+PSPP tries to automatically detect the encoding of string data in the
+file. Sometimes, however, this does not work well,
+especially for files written by old versions of SPSS or PSPP. Specify
+the ENCODING subcommand with an IANA character set name as its string
+argument to override the default. The ENCODING subcommand is a PSPP
+extension.
+
@cmd{GET} does not cause the data to be read, only the dictionary. The data
is read later, when a procedure is executed.
fh_create_file (NULL, name, fh_default_properties () );
sri = xmalloc (sizeof (*sri));
- sri->reader = sfm_open_reader (fh, &sri->dict, &sri->opts);
+ sri->reader = sfm_open_reader (fh, NULL, &sri->dict, &sri->opts);
if ( NULL == sri->reader)
{
/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
/* Returns a casereader for HANDLE. On success, returns the new
casereader and stores the file's dictionary into *DICT. On
- failure, returns a null pointer. */
+ failure, returns a null pointer.
+
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding. */
struct casereader *
-any_reader_open (struct file_handle *handle, struct dictionary **dict)
+any_reader_open (struct file_handle *handle, const char *encoding,
+ struct dictionary **dict)
{
switch (fh_get_referent (handle))
{
if (result == IO_ERROR)
return NULL;
else if (result == YES)
- return sfm_open_reader (handle, dict, NULL);
+ return sfm_open_reader (handle, encoding, dict, NULL);
result = try_detect (fh_get_file_name (handle), pfm_detect);
if (result == IO_ERROR)
/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2010 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2010, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
struct file_handle;
struct dictionary;
bool any_reader_may_open (const char *file_name);
-struct casereader *any_reader_open (struct file_handle *,
+struct casereader *any_reader_open (struct file_handle *, const char *encoding,
struct dictionary **);
#endif /* any-reader.h */
/* Opens the system file designated by file handle FH for reading. Reads the
system file's dictionary into *DICT.
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding.
+
If INFO is non-null, then it receives additional info about the system file,
which the caller must eventually free with sfm_read_info_destroy() when it
is no longer needed. */
struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
- struct sfm_read_info *infop)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+ struct dictionary **dictp, struct sfm_read_info *infop)
{
struct sfm_reader *volatile r = NULL;
struct sfm_read_info info;
First, figure out the correct character encoding, because this determines
how the rest of the header data is to be interpreted. */
- dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER],
- extensions[EXT_ENCODING]));
+ dict = dict_create (encoding
+ ? encoding
+ : choose_encoding (r, &header, extensions[EXT_INTEGER],
+ extensions[EXT_ENCODING]));
r->encoding = dict_get_encoding (dict);
/* These records don't use variables at all. */
struct dictionary;
struct file_handle;
-struct casereader *sfm_open_reader (struct file_handle *,
+struct casereader *sfm_open_reader (struct file_handle *, const char *encoding,
struct dictionary **,
struct sfm_read_info *);
bool sfm_detect (FILE *);
bool compress; /* 1=compressed, 0=not compressed. */
casenumber case_cnt; /* Number of cases written so far. */
+ uint8_t space; /* ' ' in the file's character encoding. */
/* Compression buffering.
sfm_open_writer (struct file_handle *fh, struct dictionary *d,
struct sfm_write_options opts)
{
+ struct encoding_info encoding_info;
struct sfm_writer *w;
mode_t mode;
int i;
goto error;
}
+ get_encoding_info (&encoding_info, dict_get_encoding (d));
+ w->space = encoding_info.space[0];
+
/* Write the file header. */
write_header (w, d);
size_t n_mrsets;
size_t i;
+ if (is_encoding_ebcdic_compatible (encoding))
+ {
+ /* FIXME. */
+ return;
+ }
+
n_mrsets = dict_get_n_mrsets (dict);
if (n_mrsets == 0)
return;
assert (w->data_cnt < 8);
assert (size <= 8);
- memset (w->data[w->data_cnt], ' ', 8);
+ memset (w->data[w->data_cnt], w->space, 8);
memcpy (w->data[w->data_cnt], data, size);
w->data_cnt++;
}
size_t pad_bytes = width - data_bytes;
write_bytes (w, string, data_bytes);
while (pad_bytes-- > 0)
- putc (' ', w->file);
+ putc (w->space, w->file);
}
/* Recodes null-terminated UTF-8 encoded STRING into ENCODING, and writes the
write_spaces (struct sfm_writer *w, size_t n)
{
while (n-- > 0)
- putc (' ', w->file);
+ putc (w->space, w->file);
}
/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
if (file->handle == NULL)
goto error;
- file->reader = any_reader_open (file->handle, &file->dict);
+ file->reader = any_reader_open (file->handle, NULL, &file->dict);
if (file->reader == NULL)
goto error;
}
/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
struct file_handle *fh = NULL;
struct dictionary *dict = NULL;
struct case_map *map = NULL;
+ char *encoding = NULL;
for (;;)
{
if (fh == NULL)
goto error;
}
+ else if (command == GET_CMD && lex_match_id (lexer, "ENCODING"))
+ {
+ lex_match (lexer, T_EQUALS);
+
+ if (!lex_force_string (lexer))
+ goto error;
+
+ free (encoding);
+ encoding = ss_xstrdup (lex_tokss (lexer));
+
+ lex_get (lexer);
+ }
else if (command == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
{
lex_match (lexer, T_EQUALS);
goto error;
}
- reader = any_reader_open (fh, &dict);
+ reader = any_reader_open (fh, encoding, &dict);
if (reader == NULL)
goto error;
dataset_set_source (ds, reader);
fh_unref (fh);
+ free (encoding);
return CMD_SUCCESS;
error:
casereader_destroy (reader);
if (dict != NULL)
dict_destroy (dict);
+ free (encoding);
return CMD_CASCADING_FAILURE;
}
/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
if (!handle)
return CMD_FAILURE;
- reader = any_reader_open (handle, &dict);
+ reader = any_reader_open (handle, NULL, &dict);
fh_unref (handle);
if (dict == NULL)
return CMD_FAILURE;
if (!h)
return CMD_FAILURE;
- reader = sfm_open_reader (h, &d, &info);
+ reader = sfm_open_reader (h, NULL, &d, &info);
if (!reader)
{
fh_unref (h);
"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
"abcdefghijklmnopqrstuvwxyz{|}~");
- struct substring out, cr, lf;
+ struct substring out, cr, lf, space;
bool ok;
memset (e, 0, sizeof *e);
cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
- ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
+ space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
+ ok = (cr.length >= 1
+ && cr.length <= MAX_UNIT
+ && cr.length == lf.length
+ && cr.length == space.length);
if (!ok)
{
fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
ss_dealloc (&cr);
ss_dealloc (&lf);
+ ss_dealloc (&space);
ss_alloc_substring (&cr, ss_cstr ("\r"));
ss_alloc_substring (&lf, ss_cstr ("\n"));
+ ss_alloc_substring (&space, ss_cstr (" "));
}
e->unit = cr.length;
memcpy (e->cr, cr.string, e->unit);
memcpy (e->lf, lf.string, e->unit);
+ memcpy (e->space, space.string, e->unit);
ss_dealloc (&cr);
ss_dealloc (&lf);
+ ss_dealloc (&space);
out = recode_substring_pool ("UTF-8", name, in, NULL);
e->is_ascii_compatible = ss_equals (in, out);
int unit; /* Unit width, in bytes. */
char cr[MAX_UNIT]; /* \r in encoding, 'unit' bytes long. */
char lf[MAX_UNIT]; /* \n in encoding, 'unit' bytes long. */
+ char space[MAX_UNIT]; /* ' ' in encoding, 'unit' bytes long. */
};
bool get_encoding_info (struct encoding_info *, const char *name);