Also, warn when opening a system file that does not identify its encoding.
* REGRESSION now recognises /STATISTICS=CI(x) which causes confidence
intervals for the coefficients to be printed.
- * The SYSFILE INFO command now accepts an ENCODING subcommand to
- specify the character encoding of string data in the system file.
-
* PSPPIRE graphical user interface improvements:
- File|Open now allows an encoding to be selected for opening
- File|Display Data File Information|External File... now allows an
encoding to be selected.
- * System files that contain duplicate variable names may now be read
- successfully (bug #41475).
+ * System file related improvements:
+
+ - With ENCODING="DETECT", SYSFILE INFO can now help the user to
+ pick an encoding for reading a system file that does not identify
+ its own encoding.
+
+ - SYSFILE INFO now accepts an ENCODING subcommand to specify the
+ character encoding of string data in the system file.
+
+ - System files that contain duplicate variable names may now be
+ read successfully (bug #41475).
Changes from 0.8.1 to 0.8.2:
file. Sometimes, however, this does not work well,
especially for files written by old versions of SPSS or @pspp{}. Specify
the @subcmd{ENCODING} subcommand with an @acronym{IANA} character set name as its string
-argument to override the default. The @subcmd{ENCODING} subcommand is a @pspp{}
-extension.
+argument to override the default. Use @cmd{SYSFILE INFO} to analyze
+the encodings that might be valid for a system file. The
+@subcmd{ENCODING} subcommand is a @pspp{} extension.
@cmd{GET} does not cause the data to be read, only the dictionary. The data
is read later, when a procedure is executed.
the file. Sometimes, however, this does not work well, especially for
files written by old versions of SPSS or @pspp{}. Specify the
@subcmd{ENCODING} subcommand with an @acronym{IANA} character set name
-as its string argument to override the default. The @subcmd{ENCODING}
-subcommand is a @pspp{} extension.
+as its string argument to override the default, or specify
+@code{ENCODING='DETECT'} to analyze and report possibly valid
+encodings for the system file. The @subcmd{ENCODING} subcommand is a
+@pspp{} extension.
@cmd{SYSFILE INFO} does not affect the current active dataset.
{
off_t pos;
int width;
- char name[8];
+ char name[9];
int print_format;
int write_format;
int missing_value_code;
return NULL;
}
+struct get_strings_aux
+ { /* Growable set of parallel arrays that add_string__() appends to. */
+ struct pool *pool; /* Pool used for every allocation made through this struct. */
+ char **titles; /* titles[i] is the caller-supplied description of strings[i]. */
+ char **strings; /* strings[i] is a pool-allocated copy of the collected string. */
+ bool *ids; /* ids[i] is the ID flag that was passed along with strings[i]. */
+ size_t allocated; /* Number of elements each of the three arrays has room for. */
+ size_t n; /* Number of elements currently in use. */
+ };
+
+/* Appends one entry to AUX.  STRING is copied into AUX's pool; TITLE is stored
+   as given (not copied), and ID is recorded alongside them.  All three
+   parallel arrays are enlarged together when they are full. */
+static void
+add_string__ (struct get_strings_aux *aux,
+              const char *string, bool id, char *title)
+{
+  size_t slot;
+
+  if (aux->allocated <= aux->n)
+    {
+      size_t capacity = 2 * (aux->allocated + 1);
+
+      aux->allocated = capacity;
+      aux->titles = pool_realloc (aux->pool, aux->titles,
+                                  capacity * sizeof *aux->titles);
+      aux->strings = pool_realloc (aux->pool, aux->strings,
+                                   capacity * sizeof *aux->strings);
+      aux->ids = pool_realloc (aux->pool, aux->ids,
+                               capacity * sizeof *aux->ids);
+    }
+
+  slot = aux->n;
+  aux->titles[slot] = title;
+  aux->strings[slot] = pool_strdup (aux->pool, string);
+  aux->ids[slot] = id;
+  aux->n = slot + 1;
+}
+
+/* Adds STRING to AUX with its ID flag set to false.  The entry's title is
+   built from printf-style format TITLE and the arguments that follow it, and
+   is allocated from AUX's pool. */
+static void PRINTF_FORMAT (3, 4)
+add_string (struct get_strings_aux *aux,
+            const char *string, const char *title, ...)
+{
+  char *formatted_title;
+  va_list ap;
+
+  va_start (ap, title);
+  formatted_title = pool_vasprintf (aux->pool, title, ap);
+  va_end (ap);
+
+  add_string__ (aux, string, false, formatted_title);
+}
+
+/* Adds ID to AUX with its ID flag set to true.  The entry's title is built
+   from printf-style format TITLE and the arguments that follow it, and is
+   allocated from AUX's pool. */
+static void PRINTF_FORMAT (3, 4)
+add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
+{
+  char *formatted_title;
+  va_list ap;
+
+  va_start (ap, title);
+  formatted_title = pool_vasprintf (aux->pool, title, ap);
+  va_end (ap);
+
+  add_string__ (aux, id, true, formatted_title);
+}
+
+/* Retrieves significant string data from R in its raw format, to allow the
+ caller to try to detect the encoding in use.
+
+ Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
+ and *STRINGSP to an array of N elements allocated from POOL. For each I in
+ 0...N-1, string (*TITLESP)[I] describes (*STRINGSP)[I], which is copied
+ byte-for-byte from R without recoding. (*IDSP)[I] is true if
+ (*STRINGSP)[I] was collected as an identifier (variable and MRSET names),
+ false if it was collected as free-form text.
+
+ Strings are collected in this order: variable names, variable labels,
+ value labels, the four header fields, the extra product info extension
+ record (if present), document lines, and finally multiple response set
+ names, labels, and counted values. */
+size_t
+sfm_get_strings (const struct sfm_reader *r, struct pool *pool,
+ char ***titlesp, bool **idsp, char ***stringsp)
+{
+ const struct sfm_mrset *mrset;
+ struct get_strings_aux aux;
+ size_t var_idx;
+ size_t i, j, k;
+
+ aux.pool = pool;
+ aux.titles = NULL;
+ aux.strings = NULL;
+ aux.ids = NULL;
+ aux.allocated = 0;
+ aux.n = 0;
+
+ var_idx = 0; /* Counts only the records that pass the width test below. */
+ for (i = 0; i < r->n_vars; i++)
+ if (r->vars[i].width != -1) /* NOTE(review): width -1 presumably marks a continuation record -- confirm. */
+ add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
+
+ var_idx = 0; /* Restart so labels get the same numbers as their variables. */
+ for (i = 0; i < r->n_vars; i++)
+ if (r->vars[i].width != -1)
+ {
+ var_idx++;
+ if (r->vars[i].label)
+ add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
+ var_idx);
+ }
+
+ k = 0; /* Running index across all value label records. */
+ for (i = 0; i < r->n_labels; i++)
+ for (j = 0; j < r->labels[i].n_labels; j++)
+ add_string (&aux, r->labels[i].labels[j].label,
+ _("Value Label %zu"), k++); /* NOTE(review): numbered from 0, unlike the 1-based titles elsewhere in this function. */
+
+ add_string (&aux, r->header.creation_date, _("Creation Date"));
+ add_string (&aux, r->header.creation_time, _("Creation Time"));
+ add_string (&aux, r->header.eye_catcher, _("Product"));
+ add_string (&aux, r->header.file_label, _("File Label"));
+
+ if (r->extensions[EXT_PRODUCT_INFO])
+ add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
+ _("Extra Product Info"));
+
+ if (r->document)
+ {
+ size_t i; /* Shadows the outer I; only used for the document lines. */
+
+ for (i = 0; i < r->document->n_lines; i++)
+ {
+ char line[81];
+
+ /* Each document line occupies exactly 80 bytes with no terminator, so
+ copy it out and null-terminate it before handing it on. */
+ memcpy (line, r->document->documents + i * 80, 80);
+ line[80] = '\0';
+
+ add_string (&aux, line, _("Document Line %zu"), i + 1);
+ }
+ }
+
+ for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
+ {
+ size_t mrset_idx = mrset - r->mrsets + 1; /* 1-based position of MRSET. */
+
+ add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
+ if (mrset->label[0])
+ add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
+
+ /* Skip the variables because they ought to be duplicates. */
+
+ if (mrset->counted)
+ add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
+ mrset_idx);
+ }
+
+ /* NOTE(review): nothing below is collected by this function; the following
+ comments look like placeholders for record types still to be handled --
+ confirm. */
+ /* data file attributes */
+ /* variable attributes */
+ /* long var map */
+ /* long string value labels */
+ /* long string missing values */
+
+ *titlesp = aux.titles;
+ *idsp = aux.ids;
+ *stringsp = aux.strings;
+ return aux.n;
+}
+
/* Decodes the dictionary read from R, saving it into *DICT. Character
strings in R are decoded using ENCODING, or an encoding obtained from R if
ENCODING is null, or the locale encoding if R specifies no encoding.
{
encoding = sfm_get_encoding (r);
if (encoding == NULL)
- encoding = locale_charset ();
+ {
+ sys_warn (r, -1, _("This system file does not indicate its own "
+ "character encoding. Using default encoding "
+ "%s. For best results, specify an encoding "
+ "explicitly. Use SYSFILE INFO with "
+ "ENCODING=\"DETECT\" to analyze the possible "
+ "encodings."),
+ locale_charset ());
+ encoding = locale_charset ();
+ }
}
dict = dict_create (encoding);
|| !read_int (r, &record->missing_value_code)
|| !read_int (r, &record->print_format)
|| !read_int (r, &record->write_format)
- || !read_bytes (r, record->name, sizeof record->name))
+ || !read_string (r, record->name, sizeof record->name))
return false;
if (has_variable_label == 1)
size_t i;
name = recode_string_pool ("UTF-8", dict_encoding,
- rec->name, 8, r->pool);
+ rec->name, -1, r->pool);
name[strcspn (name, " ")] = '\0';
if (!dict_id_is_valid (dict, name, false)
/* Obtaining information about an sfm_reader before decoding it. */
const char *sfm_get_encoding (const struct sfm_reader *);
+size_t sfm_get_strings (const struct sfm_reader *, struct pool *pool,
+ char ***labels, bool **ids, char ***values);
/* Decoding a system file's dictionary and obtaining a casereader. */
struct casereader *sfm_decode (struct sfm_reader *, const char *encoding,
#include <config.h>
#include <ctype.h>
+#include <errno.h>
#include <float.h>
#include <stdlib.h>
#include "language/lexer/lexer.h"
#include "language/lexer/variable-parser.h"
#include "libpspp/array.h"
+#include "libpspp/hash-functions.h"
+#include "libpspp/i18n.h"
#include "libpspp/message.h"
#include "libpspp/misc.h"
+#include "libpspp/pool.h"
#include "libpspp/string-array.h"
#include "output/tab.h"
+#include "output/text-item.h"
+#include "gl/localcharset.h"
#include "gl/minmax.h"
#include "gl/xalloc.h"
static int describe_variable (const struct variable *v, struct tab_table *t,
int r, int pc, int flags);
+static void report_encodings (const struct file_handle *,
+ const struct sfm_reader *);
+
/* SYSFILE INFO utility. */
int
cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
if (sfm_reader == NULL)
goto error;
+ if (encoding && !strcasecmp (encoding, "detect"))
+ {
+ report_encodings (h, sfm_reader);
+ fh_unref (h);
+ return CMD_SUCCESS;
+ }
+
reader = sfm_decode (sfm_reader, encoding, &d, &info);
if (!reader)
goto error;
: info.compression == SFM_COMP_SIMPLE ? "SAV"
: "ZSAV");
- tab_text (t, 0, r, TAB_LEFT, _("Charset:"));
+ tab_text (t, 0, r, TAB_LEFT, _("Encoding:"));
tab_text (t, 1, r++, TAB_LEFT, dict_get_encoding (d));
tab_submit (t);
free (vl);
}
+\f
+/* Encoding analysis. */
+
+/* Candidate encodings tried by report_encodings(), in the order tried.
+
+ This list of encodings is taken from http://encoding.spec.whatwg.org/, as
+ retrieved February 2014. Encodings not supported by glibc and encodings
+ relevant only to HTML have been removed.
+
+ The index of a name in this array is also used as its bit position in the
+ 64-bit "encodings" bitmap of struct encoding, so the array must never grow
+ beyond 64 elements. */
+static const char *encoding_names[] = {
+ "utf-8",
+ "windows-1252",
+ "iso-8859-2",
+ "iso-8859-3",
+ "iso-8859-4",
+ "iso-8859-5",
+ "iso-8859-6",
+ "iso-8859-7",
+ "iso-8859-8",
+ "iso-8859-10",
+ "iso-8859-13",
+ "iso-8859-14",
+ "iso-8859-16",
+ "macintosh",
+ "windows-874",
+ "windows-1250",
+ "windows-1251",
+ "windows-1253",
+ "windows-1254",
+ "windows-1255",
+ "windows-1256",
+ "windows-1257",
+ "windows-1258",
+ "koi8-r",
+ "koi8-u",
+ "ibm866",
+ "gb18030",
+ "big5",
+ "euc-jp",
+ "iso-2022-jp",
+ "shift_jis",
+ "euc-kr",
+};
+#define N_ENCODING_NAMES (sizeof encoding_names / sizeof *encoding_names)
+
+struct encoding
+ { /* One distinct UTF-8 interpretation of the strings under analysis. */
+ uint64_t encodings; /* Bitmap: bit I is set if encoding_names[I] yields this interpretation. */
+ char **utf8_strings; /* The strings as recoded to UTF-8, one element per input string. */
+ unsigned int hash; /* Hash over all of utf8_strings, compared first when looking for duplicates. */
+ };
+
+/* Recodes each of the N elements of STRINGS from ENCODING to UTF-8, trimming
+   trailing spaces from each result.  Returns an array of N null-terminated
+   UTF-8 strings allocated from POOL.
+
+   Returns a null pointer if any string cannot be recoded without error, or if
+   any string whose IDS element is true does not recode to a plausible
+   identifier. */
+static char **
+recode_strings (struct pool *pool,
+                char **strings, bool *ids, size_t n,
+                const char *encoding)
+{
+  char **result = pool_alloc (pool, n * sizeof *result);
+  size_t idx;
+
+  for (idx = 0; idx < n; idx++)
+    {
+      struct substring converted;
+
+      /* Any invalid input byte disqualifies ENCODING outright. */
+      if (recode_pedantically ("UTF-8", encoding, ss_cstr (strings[idx]),
+                               pool, &converted))
+        return NULL;
+
+      ss_rtrim (&converted, ss_cstr (" "));
+      converted.string[converted.length] = '\0';
+
+      if (ids[idx] && !id_is_plausible (converted.string, false))
+        return NULL;
+
+      result[idx] = converted.string;
+    }
+
+  return result;
+}
+
+/* Searches the N_ENCODINGS elements of ENCODINGS for one whose UTF-8 strings
+   are identical to the N_STRINGS elements of UTF8_STRINGS.  HASH must be the
+   hash of UTF8_STRINGS computed the same way as the "hash" member of each
+   element of ENCODINGS.
+
+   Returns the first matching element, or a null pointer if there is none. */
+static struct encoding *
+find_duplicate_encoding (struct encoding *encodings, size_t n_encodings,
+                         char **utf8_strings, size_t n_strings,
+                         unsigned int hash)
+{
+  struct encoding *e;
+
+  for (e = encodings; e < &encodings[n_encodings]; e++)
+    {
+      /* size_t, not int, so that the index has the same type as N_STRINGS
+         and the comparison below is not a signed/unsigned mix. */
+      size_t i;
+
+      /* Cheap rejection before comparing every string. */
+      if (e->hash != hash)
+        continue;
+
+      for (i = 0; i < n_strings; i++)
+        if (strcmp (utf8_strings[i], e->utf8_strings[i]))
+          break;
+
+      /* The loop ran to completion only if every string matched. */
+      if (i == n_strings)
+        return e;
+    }
+
+  return NULL;
+}
+
+/* Returns true if string number STRING_IDX is byte-for-byte the same in all
+   N_ENCODINGS elements of ENCODINGS, false if any of them differs from the
+   one in ENCODINGS[0]. */
+static bool
+all_equal (const struct encoding *encodings, size_t n_encodings,
+           size_t string_idx)
+{
+  const char *reference = encodings[0].utf8_strings[string_idx];
+  size_t e = 1;
+
+  while (e < n_encodings)
+    {
+      if (strcmp (reference, encodings[e].utf8_strings[string_idx]) != 0)
+        return false;
+      e++;
+    }
+
+  return true;
+}
+
+/* Returns the length, in bytes, of the longest prefix of string number
+   STRING_IDX that is shared by all N_ENCODINGS elements of ENCODINGS, shortened
+   so that it is either empty or ends just after a space. */
+static int
+equal_prefix (const struct encoding *encodings, size_t n_encodings,
+              size_t string_idx)
+{
+  const char *first = encodings[0].utf8_strings[string_idx];
+  size_t common = strlen (first);
+  size_t e;
+
+  for (e = 1; e < n_encodings; e++)
+    {
+      const char *other = encodings[e].utf8_strings[string_idx];
+      size_t k = 0;
+
+      /* A shorter OTHER stops the scan at its null terminator, which cannot
+         match any of the first COMMON bytes of FIRST. */
+      while (k < common && first[k] == other[k])
+        k++;
+
+      common = k;
+      if (common == 0)
+        return 0;
+    }
+
+  /* Back up to a word boundary. */
+  while (common > 0 && first[common - 1] != ' ')
+    common--;
+  return common;
+}
+
+/* Returns the length, in bytes, of the longest suffix of string number
+   STRING_IDX that is shared by all N_ENCODINGS elements of ENCODINGS, shortened
+   so that it is either empty or begins with a space. */
+static int
+equal_suffix (const struct encoding *encodings, size_t n_encodings,
+              size_t string_idx)
+{
+  const char *first = encodings[0].utf8_strings[string_idx];
+  size_t first_len = strlen (first);
+  size_t common = first_len;
+  size_t e;
+
+  for (e = 1; e < n_encodings; e++)
+    {
+      const char *other = encodings[e].utf8_strings[string_idx];
+      size_t other_len = strlen (other);
+      size_t k = 0;
+
+      /* The shared suffix cannot be longer than the shorter string. */
+      if (common > other_len)
+        common = other_len;
+
+      /* Compare backward from the end of each string. */
+      while (k < common
+             && first[first_len - k - 1] == other[other_len - k - 1])
+        k++;
+
+      common = k;
+      if (common == 0)
+        return 0;
+    }
+
+  /* Advance to a word boundary. */
+  while (common > 0 && first[first_len - common] != ' ')
+    common--;
+  return common;
+}
+
+/* Analyzes the strings in system file R, which was opened through file handle
+   H, and reports which of the encodings in encoding_names[] can decode all of
+   them.
+
+   Output consists of:
+
+   - A table that lists the usable encodings.  Encodings that decode every
+     string to identical UTF-8 text share a row.
+
+   - If the rows disagree about any string, a second table that shows, for
+     each such string, how each row interprets it.  Text that all the
+     interpretations share at the start or end of a string is replaced by
+     "...".
+
+   If no encoding can decode the strings, only a warning is issued. */
+static void
+report_encodings (const struct file_handle *h, const struct sfm_reader *r)
+{
+  char **titles;
+  char **strings;
+  bool *ids;
+  struct encoding encodings[N_ENCODING_NAMES];
+  size_t n_encodings, n_strings, n_unique_strings;
+  size_t i, j;
+  struct tab_table *t;
+  struct text_item *text;
+  struct pool *pool;
+  size_t row;
+
+  pool = pool_create ();
+  n_strings = sfm_get_strings (r, pool, &titles, &ids, &strings);
+
+  /* Try every candidate encoding, merging those that yield identical text. */
+  n_encodings = 0;
+  for (i = 0; i < N_ENCODING_NAMES; i++)
+    {
+      char **utf8_strings;
+      struct encoding *e;
+      unsigned int hash;
+
+      utf8_strings = recode_strings (pool, strings, ids, n_strings,
+                                     encoding_names[i]);
+      if (!utf8_strings)
+        continue;
+
+      /* Hash utf8_strings. */
+      hash = 0;
+      for (j = 0; j < n_strings; j++)
+        hash = hash_string (utf8_strings[j], hash);
+
+      /* If there's a duplicate encoding, just mark it. */
+      e = find_duplicate_encoding (encodings, n_encodings,
+                                   utf8_strings, n_strings, hash);
+      if (e)
+        {
+          e->encodings |= UINT64_C (1) << i;
+          continue;
+        }
+
+      e = &encodings[n_encodings++];
+      e->encodings = UINT64_C (1) << i;
+      e->utf8_strings = utf8_strings;
+      e->hash = hash;
+    }
+  if (!n_encodings)
+    {
+      msg (SW, _("No valid encodings found."));
+      pool_destroy (pool);
+      return;
+    }
+
+  text = text_item_create_format (
+    TEXT_ITEM_PARAGRAPH,
+    _("The following table lists the encodings that can successfully read %s, "
+      "by specifying the encoding name on the GET command's ENCODING "
+      "subcommand.  Encodings that yield identical text are listed "
+      "together."), fh_get_name (h));
+  text_item_submit (text);
+
+  t = tab_create (2, n_encodings + 1);
+  tab_title (t, _("Usable encodings for %s."), fh_get_name (h));
+  tab_headers (t, 1, 0, 1, 0);
+  tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, 1, n_encodings);
+  tab_hline (t, TAL_1, 0, 1, 1);
+  tab_text (t, 0, 0, TAB_RIGHT, "#");
+  tab_text (t, 1, 0, TAB_LEFT, _("Encodings"));
+  for (i = 0; i < n_encodings; i++)
+    {
+      struct string s;
+
+      /* Only bits below N_ENCODING_NAMES are ever set, and only those bits
+         have a name to print. */
+      ds_init_empty (&s);
+      for (j = 0; j < N_ENCODING_NAMES; j++)
+        if (encodings[i].encodings & (UINT64_C (1) << j))
+          ds_put_format (&s, "%s, ", encoding_names[j]);
+      ds_chomp (&s, ss_cstr (", "));
+
+      /* I is a size_t, so it must be formatted with %zu, not %d. */
+      tab_text_format (t, 0, i + 1, TAB_RIGHT, "%zu", i + 1);
+      tab_text (t, 1, i + 1, TAB_LEFT, ds_cstr (&s));
+      ds_destroy (&s);
+    }
+  tab_submit (t);
+
+  /* Count the strings that the usable encodings disagree about. */
+  n_unique_strings = 0;
+  for (i = 0; i < n_strings; i++)
+    if (!all_equal (encodings, n_encodings, i))
+      n_unique_strings++;
+  if (!n_unique_strings)
+    {
+      pool_destroy (pool);
+      return;
+    }
+
+  text = text_item_create_format (
+    TEXT_ITEM_PARAGRAPH,
+    _("The following table lists text strings in the file dictionary that "
+      "the encodings above interpret differently, along with those "
+      "interpretations."));
+  text_item_submit (text);
+
+  t = tab_create (3, (n_encodings * n_unique_strings) + 1);
+  tab_title (t, _("%s encoded text strings."), fh_get_name (h));
+  tab_headers (t, 1, 0, 1, 0);
+  tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, 2, n_encodings * n_unique_strings);
+  tab_hline (t, TAL_1, 0, 2, 1);
+
+  tab_text (t, 0, 0, TAB_LEFT, _("Purpose"));
+  tab_text (t, 1, 0, TAB_RIGHT, "#");
+  tab_text (t, 2, 0, TAB_LEFT, _("Text"));
+
+  row = 1;
+  for (i = 0; i < n_strings; i++)
+    if (!all_equal (encodings, n_encodings, i))
+      {
+        int prefix = equal_prefix (encodings, n_encodings, i);
+        int suffix = equal_suffix (encodings, n_encodings, i);
+
+        tab_joint_text (t, 0, row, 0, row + n_encodings - 1,
+                        TAB_LEFT, titles[i]);
+        tab_hline (t, TAL_1, 0, 2, row);
+        for (j = 0; j < n_encodings; j++)
+          {
+            const char *s = encodings[j].utf8_strings[i] + prefix;
+
+            tab_text_format (t, 1, row, TAB_RIGHT, "%zu", j + 1);
+            if (prefix || suffix)
+              {
+                size_t s_len = strlen (s);
+                size_t len;
+                struct string entry;
+
+                /* The common prefix and common suffix are computed
+                   independently, so in a short string they can overlap.
+                   Clamp instead of letting the subtraction wrap around. */
+                len = s_len >= (size_t) suffix ? s_len - suffix : 0;
+
+                ds_init_empty (&entry);
+                if (prefix)
+                  ds_put_cstr (&entry, "...");
+                ds_put_substring (&entry, ss_buffer (s, len));
+                if (suffix)
+                  ds_put_cstr (&entry, "...");
+                tab_text (t, 2, row, TAB_LEFT, ds_cstr (&entry));
+
+                /* Release ENTRY just as S is released right after tab_text()
+                   for the first table. */
+                ds_destroy (&entry);
+              }
+            else
+              tab_text (t, 2, row, TAB_LEFT, s);
+            row++;
+          }
+      }
+  tab_submit (t);
+
+  pool_destroy (pool);
+}
Returns the output length if successful, -1 if the output buffer is too
small. */
static ssize_t
-try_recode (iconv_t conv,
+try_recode (iconv_t conv, char fallbackchar,
const char *in, size_t inbytes,
char *out_, size_t outbytes)
{
- /* FIXME: Need to ensure that this char is valid in the target encoding */
- const char fallbackchar = '?';
char *out = out_;
int i;
{
case EINVAL:
if (outbytes < 2)
- return -1;
+ return -E2BIG;
+ if (!fallbackchar)
+ return -EINVAL;
*out++ = fallbackchar;
*out = '\0';
return out - out_;
case EILSEQ:
if (outbytes == 0)
- return -1;
+ return -E2BIG;
+ if (!fallbackchar)
+ return -EILSEQ;
*out++ = fallbackchar;
outbytes--;
if (inp)
break;
case E2BIG:
- return -1;
+ return -E2BIG;
default:
/* should never happen */
}
if (outbytes == 0)
- return -1;
+ return -E2BIG;
*out = '\0';
return out - out_;
return recode_string ("UTF-8", filename_encoding (), filename, -1);
}
+/* Recodes TEXT from encoding FROM to encoding TO, storing the result,
+   allocated from POOL, in *OUT.  A null TO or FROM means the default
+   encoding.
+
+   FALLBACKCHAR is passed on to try_recode().  It also decides what happens
+   when no converter from FROM to TO can be created: if it is nonzero, TEXT is
+   copied to *OUT unchanged and 0 is returned; if it is zero, EPROTO is
+   returned.
+
+   Returns 0 on success, otherwise a positive errno value. */
+static int
+recode_substring_pool__ (const char *to, const char *from,
+                         struct substring text, char fallbackchar,
+                         struct pool *pool, struct substring *out)
+{
+  iconv_t converter;
+  size_t capacity;
+
+  if (to == NULL)
+    to = default_encoding;
+  if (from == NULL)
+    from = default_encoding;
+
+  converter = create_iconv (to, from);
+  if (converter == (iconv_t) -1)
+    {
+      if (!fallbackchar)
+        return EPROTO;
+
+      out->string = pool_malloc (pool, text.length + 1);
+      out->length = text.length;
+      memcpy (out->string, text.string, text.length);
+      out->string[out->length] = '\0';
+      return 0;
+    }
+
+  /* Retry with a buffer twice as large each time the output does not fit.
+     The loop condition fails only if CAPACITY wraps around. */
+  capacity = text.length + 1;
+  while (capacity > text.length)
+    {
+      char *buffer = pool_malloc (pool, capacity);
+      ssize_t n_bytes = try_recode (converter, fallbackchar,
+                                    text.string, text.length,
+                                    buffer, capacity);
+
+      if (n_bytes >= 0)
+        {
+          *out = ss_buffer (buffer, n_bytes);
+          return 0;
+        }
+      pool_free (pool, buffer);
+
+      /* try_recode() reports failure as a negated errno value. */
+      if (n_bytes != -E2BIG)
+        return -n_bytes;
+
+      capacity *= 2;
+    }
+
+  NOT_REACHED ();
+}
+
/* Converts the string TEXT, which should be encoded in FROM-encoding, to a
dynamically allocated string in TO-encoding. Any characters which cannot be
converted will be represented by '?'.
recode_substring_pool (const char *to, const char *from,
struct substring text, struct pool *pool)
{
- size_t outbufferlength;
- iconv_t conv ;
-
- if (to == NULL)
- to = default_encoding;
-
- if (from == NULL)
- from = default_encoding;
-
- conv = create_iconv (to, from);
+ struct substring out;
- if ( (iconv_t) -1 == conv )
- {
- struct substring out;
+ recode_substring_pool__ (to, from, text, '?', pool, &out);
+ return out;
+}
- out.string = pool_malloc (pool, text.length + 1);
- out.length = text.length;
- memcpy (out.string, text.string, text.length);
- out.string[out.length] = '\0';
- return out;
- }
+/* Converts the string TEXT, which should be encoded in FROM-encoding, to a
+ dynamically allocated string in TO-encoding. On success, returns 0, and the
+ converted null-terminated string, allocated from POOL with pool_malloc(), is
+ stored in *OUT. On failure, returns a positive errno value.
- for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
- if ( outbufferlength > text.length)
- {
- char *output = pool_malloc (pool, outbufferlength);
- ssize_t output_len = try_recode (conv, text.string, text.length,
- output, outbufferlength);
- if (output_len >= 0)
- return ss_buffer (output, output_len);
- pool_free (pool, output);
- }
+ The function fails with an error if any part of the input string is not
+ valid in the declared input encoding. */
+int
+recode_pedantically (const char *to, const char *from,
+ struct substring text, struct pool *pool,
+ struct substring *out)
+{
+ int error;
- NOT_REACHED ();
+ error = recode_substring_pool__ (to, from, text, 0, pool, out);
+ if (error)
+ *out = ss_empty ();
+ return error;
}
-
+\f
void
i18n_init (void)
{
/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
const char *text, int length, struct pool *);
struct substring recode_substring_pool (const char *to, const char *from,
struct substring text, struct pool *);
+int recode_pedantically (const char *to, const char *from,
+ struct substring text, struct pool *,
+ struct substring *out);
size_t recode_string_len (const char *to, const char *from,
const char *text, int len);
\f
AT_BANNER([system file reader - negative])
+AT_SETUP([unspecified character encoding])
+AT_KEYWORDS([sack synthetic system file positive])
+AT_DATA([sys-file.sack], [dnl
+dnl File header.
+"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; dnl Layout code
+4; dnl Nominal case size
+0; dnl Not compressed
+0; dnl Not weighted
+0; dnl No cases.
+100.0; dnl Bias.
+"01 Jan 11"; "20:53:52";
+"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
+i8 0 *3;
+
+dnl Numeric variables.
+2; 0; 0; 0; 0x050800 *2; s8 "A";
+2; 0; 0; 0; 0x050800 *2; s8 "B";
+2; 0; 0; 0; 0x050800 *2; s8 "C";
+2; 0; 0; 0; 0x050800 *2; s8 "D";
+
+dnl Dictionary termination record.
+999; 0;
+])
+for variant in be le; do
+ AT_CHECK([sack --$variant sys-file.sack > sys-file.sav])
+ AT_DATA([sys-file.sps], [dnl
+GET 'sys-file.sav'.
+])
+ AT_CHECK([pspp -O format=csv sys-file.sps], [0], [stdout])
+ AT_CHECK([sed 's/default encoding.*For/default encoding. For/' stdout], [0], [dnl
+"warning: `sys-file.sav': This system file does not indicate its own character encoding. Using default encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=""DETECT"" to analyze the possible encodings."
+])
+done
+AT_CLEANUP
+
AT_SETUP([misplaced type 4 record])
AT_KEYWORDS([sack synthetic system file negative])
AT_DATA([sys-file.sack], [dnl
AT_SETUP([read very long strings written by SPSS 13])
AT_CHECK([cp $top_srcdir/tests/data/v13.sav .])
AT_DATA([sys-file.sps], [dnl
-GET FILE='v13.sav'.
+GET FILE='v13.sav' ENCODING='utf-8'.
DISPLAY VARIABLES.
LIST.
])
AT_SETUP([read very long strings written by SPSS 14])
AT_CHECK([cp $top_srcdir/tests/data/v14.sav .])
AT_DATA([sys-file.sps], [dnl
-GET FILE='v14.sav'.
+GET FILE='v14.sav' ENCODING='utf-8'.
DISPLAY VARIABLES.
LIST.
])
-e '/^Endian:,/d' \
-e '/^Integer Format:,/d' \
-e '/^Real Format:,/d' \
- -e '/^Charset:,/d' pspp.csv],
+ -e '/^Encoding:,/d' pspp.csv],
[0], [dnl
Table: Reading free-form data from INLINE.
Variable,Format