From: Ben Pfaff Date: Sun, 16 Feb 2014 22:59:54 +0000 (-0800) Subject: Make SYSFILE INFO able to analyze valid encodings for a system file. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e461291a6df7145e7a870a83f2f10b5839845898;p=pspp Make SYSFILE INFO able to analyze valid encodings for a system file. Also, warn when opening a system file that does not identify its encoding. --- diff --git a/NEWS b/NEWS index 57de7877b2..51dc42cd1c 100644 --- a/NEWS +++ b/NEWS @@ -9,9 +9,6 @@ Changes since 0.8.2: * REGRESSION now recognises /STATISTICS=CI(x) which causes confidence intervals for the coefficients to be printed. - * The SYSFILE INFO command now accepts an ENCODING subcommand to - specify the character encoding of string data in the system file. - * PSPPIRE graphical user interface improvements: - File|Open now allows an encoding to be selected for opening @@ -20,8 +17,17 @@ Changes since 0.8.2: - File|Display Data File Information|External File... now allows an encoding to be selected. - * System files that contain duplicate variable names may now be read - successfully (bug #41475). + * System file related improvements: + + - With ENCODING="DETECT", SYSFILE INFO can now help the user to + pick an encoding for reading a system file that does not identify + its own encoding. + + - SYSFILE INFO now accepts an ENCODING subcommand to specify the + character encoding of string data in the system file. + + - System files that contain duplicate variable names may now be + read successfully (bug #41475). Changes from 0.8.1 to 0.8.2: diff --git a/doc/files.texi b/doc/files.texi index 318c887265..889af5d2a9 100644 --- a/doc/files.texi +++ b/doc/files.texi @@ -179,8 +179,9 @@ is affected by these subcommands. file. Sometimes, however, this does not work well, especially for files written by old versions of SPSS or @pspp{}. Specify the @subcmd{ENCODING} subcommand with an @acronym{IANA} character set name as its string -argument to override the default. 
The @subcmd{ENCODING} subcommand is a @pspp{} -extension. +argument to override the default. Use @cmd{SYSFILE INFO} to analyze +the encodings that might be valid for a system file. The +@subcmd{ENCODING} subcommand is a @pspp{} extension. @cmd{GET} does not cause the data to be read, only the dictionary. The data is read later, when a procedure is executed. @@ -923,8 +924,10 @@ a system file and displays information on its dictionary. the file. Sometimes, however, this does not work well, especially for files written by old versions of SPSS or @pspp{}. Specify the @subcmd{ENCODING} subcommand with an @acronym{IANA} character set name -as its string argument to override the default. The @subcmd{ENCODING} -subcommand is a @pspp{} extension. +as its string argument to override the default, or specify +@code{ENCODING='DETECT'} to analyze and report possibly valid +encodings for the system file. The @subcmd{ENCODING} subcommand is a +@pspp{} extension. @cmd{SYSFILE INFO} does not affect the current active dataset. 
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 9fab76f89d..a93f75fa77 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -110,7 +110,7 @@ struct sfm_var_record { off_t pos; int width; - char name[8]; + char name[9]; int print_format; int write_format; int missing_value_code; @@ -567,6 +567,156 @@ sfm_get_encoding (const struct sfm_reader *r) return NULL; } +struct get_strings_aux + { + struct pool *pool; + char **titles; + char **strings; + bool *ids; + size_t allocated; + size_t n; + }; + +static void +add_string__ (struct get_strings_aux *aux, + const char *string, bool id, char *title) +{ + if (aux->n >= aux->allocated) + { + aux->allocated = 2 * (aux->allocated + 1); + aux->titles = pool_realloc (aux->pool, aux->titles, + aux->allocated * sizeof *aux->titles); + aux->strings = pool_realloc (aux->pool, aux->strings, + aux->allocated * sizeof *aux->strings); + aux->ids = pool_realloc (aux->pool, aux->ids, + aux->allocated * sizeof *aux->ids); + } + + aux->titles[aux->n] = title; + aux->strings[aux->n] = pool_strdup (aux->pool, string); + aux->ids[aux->n] = id; + aux->n++; +} + +static void PRINTF_FORMAT (3, 4) +add_string (struct get_strings_aux *aux, + const char *string, const char *title, ...) +{ + va_list args; + + va_start (args, title); + add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args)); + va_end (args); +} + +static void PRINTF_FORMAT (3, 4) +add_id (struct get_strings_aux *aux, const char *id, const char *title, ...) +{ + va_list args; + + va_start (args, title); + add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args)); + va_end (args); +} + +/* Retrieves significant string data from R in its raw format, to allow the + caller to try to detect the encoding in use. + + Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP, + and *STRINGSP to an array of N elements allocated from POOL. 
For each I in + 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in + whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must + be a valid PSPP language identifier, false if *STRINGSP[I] is free-form + text. */ +size_t +sfm_get_strings (const struct sfm_reader *r, struct pool *pool, + char ***titlesp, bool **idsp, char ***stringsp) +{ + const struct sfm_mrset *mrset; + struct get_strings_aux aux; + size_t var_idx; + size_t i, j, k; + + aux.pool = pool; + aux.titles = NULL; + aux.strings = NULL; + aux.ids = NULL; + aux.allocated = 0; + aux.n = 0; + + var_idx = 0; + for (i = 0; i < r->n_vars; i++) + if (r->vars[i].width != -1) + add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx); + + var_idx = 0; + for (i = 0; i < r->n_vars; i++) + if (r->vars[i].width != -1) + { + var_idx++; + if (r->vars[i].label) + add_string (&aux, r->vars[i].label, _("Variable %zu Label"), + var_idx); + } + + k = 0; + for (i = 0; i < r->n_labels; i++) + for (j = 0; j < r->labels[i].n_labels; j++) + add_string (&aux, r->labels[i].labels[j].label, + _("Value Label %zu"), k++); + + add_string (&aux, r->header.creation_date, _("Creation Date")); + add_string (&aux, r->header.creation_time, _("Creation Time")); + add_string (&aux, r->header.eye_catcher, _("Product")); + add_string (&aux, r->header.file_label, _("File Label")); + + if (r->extensions[EXT_PRODUCT_INFO]) + add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data, + _("Extra Product Info")); + + if (r->document) + { + size_t i; + + for (i = 0; i < r->document->n_lines; i++) + { + char line[81]; + + memcpy (line, r->document->documents + i * 80, 80); + line[80] = '\0'; + + add_string (&aux, line, _("Document Line %zu"), i + 1); + } + } + + for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++) + { + size_t mrset_idx = mrset - r->mrsets + 1; + + add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx); + if (mrset->label[0]) + add_string (&aux, mrset->label, _("MRSET %zu Label"), 
mrset_idx); + + /* Skip the variables because they ought to be duplicates. */ + + if (mrset->counted) + add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"), + mrset_idx); + } + + /* TODO: strings in the following records are not retrieved yet: */ + /* data file attributes */ + /* variable attributes */ + /* long var map */ + /* long string value labels */ + /* long string missing values */ + + *titlesp = aux.titles; + *idsp = aux.ids; + *stringsp = aux.strings; + return aux.n; +} + /* Decodes the dictionary read from R, saving it into into *DICT. Character strings in R are decoded using ENCODING, or an encoding obtained from R if ENCODING is null, or the locale encoding if R specifies no encoding. @@ -588,7 +738,16 @@ sfm_decode (struct sfm_reader *r, const char *encoding, { encoding = sfm_get_encoding (r); if (encoding == NULL) - encoding = locale_charset (); + { + sys_warn (r, -1, _("This system file does not indicate its own " + "character encoding. Using default encoding " + "%s. For best results, specify an encoding " + "explicitly. 
Use SYSFILE INFO with " + "ENCODING=\"DETECT\" to analyze the possible " + "encodings."), + locale_charset ()); + encoding = locale_charset (); + } } dict = dict_create (encoding); @@ -906,7 +1065,7 @@ read_variable_record (struct sfm_reader *r, struct sfm_var_record *record) || !read_int (r, &record->missing_value_code) || !read_int (r, &record->print_format) || !read_int (r, &record->write_format) - || !read_bytes (r, record->name, sizeof record->name)) + || !read_string (r, record->name, sizeof record->name)) return false; if (has_variable_label == 1) @@ -1242,7 +1401,7 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, size_t i; name = recode_string_pool ("UTF-8", dict_encoding, - rec->name, 8, r->pool); + rec->name, -1, r->pool); name[strcspn (name, " ")] = '\0'; if (!dict_id_is_valid (dict, name, false) diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h index 254e810a6c..849da670fc 100644 --- a/src/data/sys-file-reader.h +++ b/src/data/sys-file-reader.h @@ -50,6 +50,8 @@ bool sfm_close (struct sfm_reader *); /* Obtaining information about an sfm_reader before . */ const char *sfm_get_encoding (const struct sfm_reader *); +size_t sfm_get_strings (const struct sfm_reader *, struct pool *pool, + char ***labels, bool **ids, char ***values); /* Decoding a system file's dictionary and obtaining a casereader. 
*/ struct casereader *sfm_decode (struct sfm_reader *, const char *encoding, diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c index 6b74f75a66..05f9be91fb 100644 --- a/src/language/dictionary/sys-file-info.c +++ b/src/language/dictionary/sys-file-info.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,11 +37,16 @@ #include "language/lexer/lexer.h" #include "language/lexer/variable-parser.h" #include "libpspp/array.h" +#include "libpspp/hash-functions.h" +#include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/misc.h" +#include "libpspp/pool.h" #include "libpspp/string-array.h" #include "output/tab.h" +#include "output/text-item.h" +#include "gl/localcharset.h" #include "gl/minmax.h" #include "gl/xalloc.h" @@ -64,6 +70,9 @@ enum static int describe_variable (const struct variable *v, struct tab_table *t, int r, int pc, int flags); +static void report_encodings (const struct file_handle *, + const struct sfm_reader *); + /* SYSFILE INFO utility. */ int cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) @@ -118,6 +127,13 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) if (sfm_reader == NULL) goto error; + if (encoding && !strcasecmp (encoding, "detect")) + { + report_encodings (h, sfm_reader); + fh_unref (h); + return CMD_SUCCESS; + } + reader = sfm_decode (sfm_reader, encoding, &d, &info); if (!reader) goto error; @@ -191,7 +207,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) : info.compression == SFM_COMP_SIMPLE ? "SAV" : "ZSAV"); - tab_text (t, 0, r, TAB_LEFT, _("Charset:")); + tab_text (t, 0, r, TAB_LEFT, _("Encoding:")); tab_text (t, 1, r++, TAB_LEFT, dict_get_encoding (d)); tab_submit (t); @@ -745,3 +761,341 @@ display_vectors (const struct dictionary *dict, int sorted) free (vl); } + +/* Encoding analysis. */ + +/* This list of encodings is taken from http://encoding.spec.whatwg.org/, as + retrieved February 2014. 
Encodings not supported by glibc and encodings + relevant only to HTML have been removed. */ +static const char *encoding_names[] = { + "utf-8", + "windows-1252", + "iso-8859-2", + "iso-8859-3", + "iso-8859-4", + "iso-8859-5", + "iso-8859-6", + "iso-8859-7", + "iso-8859-8", + "iso-8859-10", + "iso-8859-13", + "iso-8859-14", + "iso-8859-16", + "macintosh", + "windows-874", + "windows-1250", + "windows-1251", + "windows-1253", + "windows-1254", + "windows-1255", + "windows-1256", + "windows-1257", + "windows-1258", + "koi8-r", + "koi8-u", + "ibm866", + "gb18030", + "big5", + "euc-jp", + "iso-2022-jp", + "shift_jis", + "euc-kr", +}; +#define N_ENCODING_NAMES (sizeof encoding_names / sizeof *encoding_names) + +struct encoding + { + uint64_t encodings; + char **utf8_strings; + unsigned int hash; + }; + +static char ** +recode_strings (struct pool *pool, + char **strings, bool *ids, size_t n, + const char *encoding) +{ + char **utf8_strings; + size_t i; + + utf8_strings = pool_alloc (pool, n * sizeof *utf8_strings); + for (i = 0; i < n; i++) + { + struct substring utf8; + int error; + + error = recode_pedantically ("UTF-8", encoding, ss_cstr (strings[i]), + pool, &utf8); + if (!error) + { + ss_rtrim (&utf8, ss_cstr (" ")); + utf8.string[utf8.length] = '\0'; + + if (ids[i] && !id_is_plausible (utf8.string, false)) + error = EINVAL; + } + + if (error) + return NULL; + + utf8_strings[i] = utf8.string; + } + + return utf8_strings; +} + +static struct encoding * +find_duplicate_encoding (struct encoding *encodings, size_t n_encodings, + char **utf8_strings, size_t n_strings, + unsigned int hash) +{ + struct encoding *e; + + for (e = encodings; e < &encodings[n_encodings]; e++) + { + int i; + + if (e->hash != hash) + goto next_encoding; + + for (i = 0; i < n_strings; i++) + if (strcmp (utf8_strings[i], e->utf8_strings[i])) + goto next_encoding; + + return e; + next_encoding:; + } + + return NULL; +} + +static bool +all_equal (const struct encoding *encodings, size_t 
n_encodings, + size_t string_idx) +{ + const char *s0; + size_t i; + + s0 = encodings[0].utf8_strings[string_idx]; + for (i = 1; i < n_encodings; i++) + if (strcmp (s0, encodings[i].utf8_strings[string_idx])) + return false; + + return true; +} + +static int +equal_prefix (const struct encoding *encodings, size_t n_encodings, + size_t string_idx) +{ + const char *s0; + size_t prefix; + size_t i; + + s0 = encodings[0].utf8_strings[string_idx]; + prefix = strlen (s0); + for (i = 1; i < n_encodings; i++) + { + const char *si = encodings[i].utf8_strings[string_idx]; + size_t j; + + for (j = 0; j < prefix; j++) + if (s0[j] != si[j]) + { + prefix = j; + if (!prefix) + return 0; + break; + } + } + + while (prefix > 0 && s0[prefix - 1] != ' ') + prefix--; + return prefix; +} + +static int +equal_suffix (const struct encoding *encodings, size_t n_encodings, + size_t string_idx) +{ + const char *s0; + size_t s0_len; + size_t suffix; + size_t i; + + s0 = encodings[0].utf8_strings[string_idx]; + s0_len = strlen (s0); + suffix = s0_len; + for (i = 1; i < n_encodings; i++) + { + const char *si = encodings[i].utf8_strings[string_idx]; + size_t si_len = strlen (si); + size_t j; + + if (si_len < suffix) + suffix = si_len; + for (j = 0; j < suffix; j++) + if (s0[s0_len - j - 1] != si[si_len - j - 1]) + { + suffix = j; + if (!suffix) + return 0; + break; + } + } + + while (suffix > 0 && s0[s0_len - suffix] != ' ') + suffix--; + return suffix; +} + +static void +report_encodings (const struct file_handle *h, const struct sfm_reader *r) +{ + char **titles; + char **strings; + bool *ids; + struct encoding encodings[N_ENCODING_NAMES]; + size_t n_encodings, n_strings, n_unique_strings; + size_t i, j; + struct tab_table *t; + struct text_item *text; + struct pool *pool; + size_t row; + + pool = pool_create (); + n_strings = sfm_get_strings (r, pool, &titles, &ids, &strings); + + n_encodings = 0; + for (i = 0; i < N_ENCODING_NAMES; i++) + { + char **utf8_strings; + struct encoding *e; + 
unsigned int hash; + + utf8_strings = recode_strings (pool, strings, ids, n_strings, + encoding_names[i]); + if (!utf8_strings) + continue; + + /* Hash utf8_strings. */ + hash = 0; + for (j = 0; j < n_strings; j++) + hash = hash_string (utf8_strings[j], hash); + + /* If there's a duplicate encoding, just mark it. */ + e = find_duplicate_encoding (encodings, n_encodings, + utf8_strings, n_strings, hash); + if (e) + { + e->encodings |= UINT64_C (1) << i; + continue; + } + + e = &encodings[n_encodings++]; + e->encodings = UINT64_C (1) << i; + e->utf8_strings = utf8_strings; + e->hash = hash; + } + if (!n_encodings) + { + msg (SW, _("No valid encodings found.")); + pool_destroy (pool); + return; + } + + text = text_item_create_format ( + TEXT_ITEM_PARAGRAPH, + _("The following table lists the encodings that can successfully read %s, " + "by specifying the encoding name on the GET command's ENCODING " + "subcommand. Encodings that yield identical text are listed " + "together."), fh_get_name (h)); + text_item_submit (text); + + t = tab_create (2, n_encodings + 1); + tab_title (t, _("Usable encodings for %s."), fh_get_name (h)); + tab_headers (t, 1, 0, 1, 0); + tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, 1, n_encodings); + tab_hline (t, TAL_1, 0, 1, 1); + tab_text (t, 0, 0, TAB_RIGHT, "#"); + tab_text (t, 1, 0, TAB_LEFT, _("Encodings")); + for (i = 0; i < n_encodings; i++) + { + struct string s; + + ds_init_empty (&s); + for (j = 0; j < 64; j++) + if (encodings[i].encodings & (UINT64_C (1) << j)) + ds_put_format (&s, "%s, ", encoding_names[j]); + ds_chomp (&s, ss_cstr (", ")); + + tab_text_format (t, 0, i + 1, TAB_RIGHT, "%d", i + 1); + tab_text (t, 1, i + 1, TAB_LEFT, ds_cstr (&s)); + ds_destroy (&s); + } + tab_submit (t); + + n_unique_strings = 0; + for (i = 0; i < n_strings; i++) + if (!all_equal (encodings, n_encodings, i)) + n_unique_strings++; + if (!n_unique_strings) + { + pool_destroy (pool); + return; + } + + text = text_item_create_format ( + TEXT_ITEM_PARAGRAPH, + 
_("The following table lists text strings in the file dictionary that " + "the encodings above interpret differently, along with those " + "interpretations.")); + text_item_submit (text); + + t = tab_create (3, (n_encodings * n_unique_strings) + 1); + tab_title (t, _("%s encoded text strings."), fh_get_name (h)); + tab_headers (t, 1, 0, 1, 0); + tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, 2, n_encodings * n_unique_strings); + tab_hline (t, TAL_1, 0, 2, 1); + + tab_text (t, 0, 0, TAB_LEFT, _("Purpose")); + tab_text (t, 1, 0, TAB_RIGHT, "#"); + tab_text (t, 2, 0, TAB_LEFT, _("Text")); + + row = 1; + for (i = 0; i < n_strings; i++) + if (!all_equal (encodings, n_encodings, i)) + { + int prefix = equal_prefix (encodings, n_encodings, i); + int suffix = equal_suffix (encodings, n_encodings, i); + + tab_joint_text (t, 0, row, 0, row + n_encodings - 1, + TAB_LEFT, titles[i]); + tab_hline (t, TAL_1, 0, 2, row); + for (j = 0; j < n_encodings; j++) + { + const char *s = encodings[j].utf8_strings[i] + prefix; + + tab_text_format (t, 1, row, TAB_RIGHT, "%d", j + 1); + if (prefix || suffix) + { + size_t len = strlen (s) - suffix; + struct string entry; + + ds_init_empty (&entry); + if (prefix) + ds_put_cstr (&entry, "..."); + ds_put_substring (&entry, ss_buffer (s, len)); + if (suffix) + ds_put_cstr (&entry, "..."); + tab_text (t, 2, row, TAB_LEFT, ds_cstr (&entry)); + } + else + tab_text (t, 2, row, TAB_LEFT, s); + row++; + } + } + tab_submit (t); + + pool_destroy (pool); +} diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 10b3927f9e..df7ae67da0 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -148,12 +148,10 @@ recode_string_len (const char *to, const char *from, Returns the output length if successful, -1 if the output buffer is too small. 
*/ static ssize_t -try_recode (iconv_t conv, +try_recode (iconv_t conv, char fallbackchar, const char *in, size_t inbytes, char *out_, size_t outbytes) { - /* FIXME: Need to ensure that this char is valid in the target encoding */ - const char fallbackchar = '?'; char *out = out_; int i; @@ -181,14 +179,18 @@ try_recode (iconv_t conv, { case EINVAL: if (outbytes < 2) - return -1; + return -E2BIG; + if (!fallbackchar) + return -EINVAL; *out++ = fallbackchar; *out = '\0'; return out - out_; case EILSEQ: if (outbytes == 0) - return -1; + return -E2BIG; + if (!fallbackchar) + return -EILSEQ; *out++ = fallbackchar; outbytes--; if (inp) @@ -199,7 +201,7 @@ try_recode (iconv_t conv, break; case E2BIG: - return -1; + return -E2BIG; default: /* should never happen */ @@ -211,7 +213,7 @@ try_recode (iconv_t conv, } if (outbytes == 0) - return -1; + return -E2BIG; *out = '\0'; return out - out_; @@ -518,6 +520,57 @@ filename_to_utf8 (const char *filename) return recode_string ("UTF-8", filename_encoding (), filename, -1); } +static int +recode_substring_pool__ (const char *to, const char *from, + struct substring text, char fallbackchar, + struct pool *pool, struct substring *out) +{ + size_t bufsize; + iconv_t conv ; + + if (to == NULL) + to = default_encoding; + + if (from == NULL) + from = default_encoding; + + conv = create_iconv (to, from); + + if ( (iconv_t) -1 == conv ) + { + if (fallbackchar) + { + out->string = pool_malloc (pool, text.length + 1); + out->length = text.length; + memcpy (out->string, text.string, text.length); + out->string[out->length] = '\0'; + return 0; + } + else + return EPROTO; + } + + for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2) + { + char *output = pool_malloc (pool, bufsize); + ssize_t retval; + + retval = try_recode (conv, fallbackchar, text.string, text.length, + output, bufsize); + if (retval >= 0) + { + *out = ss_buffer (output, retval); + return 0; + } + pool_free (pool, output); + + if (retval != -E2BIG) + return 
-retval; + } + + NOT_REACHED (); +} + /* Converts the string TEXT, which should be encoded in FROM-encoding, to a dynamically allocated string in TO-encoding. Any characters which cannot be converted will be represented by '?'. @@ -533,42 +586,32 @@ struct substring recode_substring_pool (const char *to, const char *from, struct substring text, struct pool *pool) { - size_t outbufferlength; - iconv_t conv ; - - if (to == NULL) - to = default_encoding; - - if (from == NULL) - from = default_encoding; - - conv = create_iconv (to, from); + struct substring out; - if ( (iconv_t) -1 == conv ) - { - struct substring out; + recode_substring_pool__ (to, from, text, '?', pool, &out); + return out; +} - out.string = pool_malloc (pool, text.length + 1); - out.length = text.length; - memcpy (out.string, text.string, text.length); - out.string[out.length] = '\0'; - return out; - } +/* Converts the string TEXT, which should be encoded in FROM-encoding, to a + dynamically allocated string in TO-encoding. On success, returns 0, and the + converted null-terminated string, allocated from POOL with pool_malloc(), is + stored in *OUT. On failure, returns a positive errno value. - for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 ) - if ( outbufferlength > text.length) - { - char *output = pool_malloc (pool, outbufferlength); - ssize_t output_len = try_recode (conv, text.string, text.length, - output, outbufferlength); - if (output_len >= 0) - return ss_buffer (output, output_len); - pool_free (pool, output); - } + The function fails with an error if any part of the input string is not + valid in the declared input encoding. 
*/ +int +recode_pedantically (const char *to, const char *from, + struct substring text, struct pool *pool, + struct substring *out) +{ + int error; - NOT_REACHED (); + error = recode_substring_pool__ (to, from, text, 0, pool, out); + if (error) + *out = ss_empty (); + return error; } - + void i18n_init (void) { diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index 6722b5cec9..54717bcaa9 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2011, 2012, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,6 +40,9 @@ char *recode_string_pool (const char *to, const char *from, const char *text, int length, struct pool *); struct substring recode_substring_pool (const char *to, const char *from, struct substring text, struct pool *); +int recode_pedantically (const char *to, const char *from, + struct substring text, struct pool *, + struct substring *out); size_t recode_string_len (const char *to, const char *from, const char *text, int len); diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index c4d16b3294..c63e751f23 100644 --- a/tests/data/sys-file-reader.at +++ b/tests/data/sys-file-reader.at @@ -1530,6 +1530,42 @@ AT_CLEANUP AT_BANNER([system file reader - negative]) +AT_SETUP([unspecified character encoding]) +AT_KEYWORDS([sack synthetic system file positive]) +AT_DATA([sys-file.sack], [dnl +dnl File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; dnl Layout code +4; dnl Nominal case size +0; dnl Not compressed +0; dnl Not weighted +0; dnl No cases. +100.0; dnl Bias. +"01 Jan 11"; "20:53:52"; +"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; +i8 0 *3; + +dnl Numeric variables. 
+2; 0; 0; 0; 0x050800 *2; s8 "A"; +2; 0; 0; 0; 0x050800 *2; s8 "B"; +2; 0; 0; 0; 0x050800 *2; s8 "C"; +2; 0; 0; 0; 0x050800 *2; s8 "D"; + +dnl Dictionary termination record. +999; 0; +]) +for variant in be le; do + AT_CHECK([sack --$variant sys-file.sack > sys-file.sav]) + AT_DATA([sys-file.sps], [dnl +GET 'sys-file.sav'. +]) + AT_CHECK([pspp -O format=csv sys-file.sps], [0], [stdout]) + AT_CHECK([sed 's/default encoding.*For/default encoding. For/' stdout], [0], [dnl +"warning: `sys-file.sav': This system file does not indicate its own character encoding. Using default encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=""DETECT"" to analyze the possible encodings." +]) +done +AT_CLEANUP + AT_SETUP([misplaced type 4 record]) AT_KEYWORDS([sack synthetic system file negative]) AT_DATA([sys-file.sack], [dnl diff --git a/tests/data/sys-file.at b/tests/data/sys-file.at index 033a7da9af..bd5671cb01 100644 --- a/tests/data/sys-file.at +++ b/tests/data/sys-file.at @@ -321,7 +321,7 @@ AT_BANNER([system files -- very long strings]) AT_SETUP([read very long strings written by SPSS 13]) AT_CHECK([cp $top_srcdir/tests/data/v13.sav .]) AT_DATA([sys-file.sps], [dnl -GET FILE='v13.sav'. +GET FILE='v13.sav' ENCODING='utf-8'. DISPLAY VARIABLES. LIST. ]) @@ -355,7 +355,7 @@ AT_CLEANUP AT_SETUP([read very long strings written by SPSS 14]) AT_CHECK([cp $top_srcdir/tests/data/v14.sav .]) AT_DATA([sys-file.sps], [dnl -GET FILE='v14.sav'. +GET FILE='v14.sav' ENCODING='utf-8'. DISPLAY VARIABLES. LIST. ]) diff --git a/tests/language/dictionary/sys-file-info.at b/tests/language/dictionary/sys-file-info.at index 6a5e4adfef..45eb3afbdf 100644 --- a/tests/language/dictionary/sys-file-info.at +++ b/tests/language/dictionary/sys-file-info.at @@ -18,7 +18,7 @@ AT_CHECK( -e '/^Endian:,/d' \ -e '/^Integer Format:,/d' \ -e '/^Real Format:,/d' \ - -e '/^Charset:,/d' pspp.csv], + -e '/^Encoding:,/d' pspp.csv], [0], [dnl Table: Reading free-form data from INLINE. 
Variable,Format