From: Ben Pfaff Date: Thu, 1 Mar 2012 06:43:22 +0000 (-0800) Subject: encoding-guesser: Fall back to windows-1252 when UTF-8 can't be right. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=d6c75296e5573a997c79a7af1195b6a619c0190c encoding-guesser: Fall back to windows-1252 when UTF-8 can't be right. Until now the encoding-guesser code has used UTF-8 as a fallback in situations where we can tell that the file is not valid UTF-8. In this kind of situation having a single-byte character set as a fallback makes more sense. This commit hard-codes windows-1252 as that fallback, since it is a widely encountered encoding (and compatible with ISO-8859-1 as well). John Darrington originally suggested this, if I recall correctly. The bug report that spurred this work was from Harry Thijssen. With this commit, PSPP properly reads his windows-1252 file when the system locale uses UTF-8 encoding. --- diff --git a/doc/utilities.texi b/doc/utilities.texi index 40648d430c..35dd393a78 100644 --- a/doc/utilities.texi +++ b/doc/utilities.texi @@ -313,14 +313,23 @@ are @code{ASCII} (United States), @code{ISO-8859-1} (western Europe), @code{EUC-JP} (Japan), and @code{windows-1252} (Windows). Not all systems support all character sets. -@item @code{Auto} @item @code{Auto,@var{encoding}} -Automatically detects whether a syntax file is encoded in -@var{encoding} or in a Unicode encoding such as UTF-8, UTF-16, or -UTF-32. The @var{encoding} may be an IANA character set name or -@code{Locale} (the default). Only ASCII compatible encodings can -automatically be distinguished from UTF-8 (the most common locale -encodings are all ASCII-compatible). +Automatically detects whether a syntax file is encoded in an Unicode +encoding such as UTF-8, UTF-16, or UTF-32. If it is not, then PSPP +generally assumes that the file is encoded in @var{encoding} (an IANA +character set name). However, if @var{encoding} is UTF-8, and the +syntax file is not valid UTF-8, PSPP instead assumes that the file +is encoded in @code{windows-1252}. + +For best results, @var{encoding} should be an ASCII-compatible +encoding (the most common locale encodings are all ASCII-compatible), +because encodings that are not ASCII compatible cannot be +automatically distinguished from UTF-8. + +@item @code{Auto} +@item @code{Auto,Locale} +Automatic detection, as above, with the default encoding taken from +the system locale or the setting on SET LOCALE. @end table When ENCODING is not specified, the default is taken from the diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c index 27e2cda5dc..bee29782f2 100644 --- a/src/libpspp/encoding-guesser.c +++ b/src/libpspp/encoding-guesser.c @@ -36,22 +36,26 @@ of information about encoding detection. */ -/* Parses and returns the fallback encoding from ENCODING, which must be in one - of the forms described at the top of encoding-guesser.h. The returned - string might be ENCODING itself or a suffix of it, or it might be a - statically allocated string. */ +/* Returns the encoding specified by ENCODING, which must be in one of the + forms described at the top of encoding-guesser.h. The returned string might + be ENCODING itself or a suffix of it, or it might be a statically allocated + string. */ const char * encoding_guess_parse_encoding (const char *encoding) { + const char *fallback; + if (encoding == NULL || !c_strcasecmp (encoding, "auto") || !c_strcasecmp (encoding, "auto,locale") || !c_strcasecmp (encoding, "locale")) - return locale_charset (); + fallback = locale_charset (); else if (!c_strncasecmp (encoding, "auto,", 5)) - return encoding + 5; + fallback = encoding + 5; else return encoding; + + return is_encoding_utf8 (fallback) ? "windows-1252" : fallback; } /* Returns true if ENCODING, which must be in one of the forms described at the @@ -267,16 +271,37 @@ const char * encoding_guess_tail_encoding (const char *encoding, const void *data, size_t n) { - return (encoding_guess_tail_is_utf8 (data, n) + return (encoding_guess_tail_is_utf8 (data, n) != 0 ? "UTF-8" : encoding_guess_parse_encoding (encoding)); } -/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false - for the fallback encoding. */ -bool +/* Returns an encoding guess based on ENCODING and the N bytes of text starting + at DATA. DATA should start with the first non-ASCII text character (as + determined by encoding_guess_is_ascii_text()) found in the input. + + The return value is: + + 0, if the encoding is definitely not UTF-8 (because the input contains + byte sequences that are not valid in UTF-8). + + 1, if the encoding appears to be UTF-8 (because the input contains valid + UTF-8 multibyte sequences). + + -1, if the input contains only ASCII characters. (This means that the + input may be treated as UTF-8, since ASCII is a subset of UTF-8.) + + See encoding-guesser.h for intended use of this function. + + N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than + that starting with the first non-ASCII text character. */ +int encoding_guess_tail_is_utf8 (const void *data, size_t n) { + /* If all the bytes are in the ASCII range, it's just ASCII. */ + if (encoding_guess_count_ascii (data, n) == n) + return -1; + return (n < ENCODING_GUESS_MIN ? u8_check (data, n) == NULL : is_all_utf8_text (data, n)); @@ -297,15 +322,7 @@ encoding_guess_whole_file (const char *encoding, const void *text, size_t size) guess = encoding_guess_head_encoding (encoding, text, size); if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding)) - { - size_t ofs = encoding_guess_count_ascii (text, size); - if (ofs < size) - return encoding_guess_tail_encoding (encoding, - (const char *) text + ofs, - size - ofs); - else - return encoding_guess_parse_encoding (encoding); - } + return encoding_guess_tail_encoding (encoding, text, size); else return guess; } diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h index 0a7d1f99a4..2e8cb9abbb 100644 --- a/src/libpspp/encoding-guesser.h +++ b/src/libpspp/encoding-guesser.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011 Free Software Foundation, Inc. + Copyright (C) 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -42,7 +42,9 @@ encoding"): Requests detection whether the input is encoded in UTF-8, UTF-16, UTF-32, or a few other easily identifiable charsets. When a particular character set cannot be recognized, the guesser falls back to - the encoding following the comma. UTF-8 detection works only for + the encoding following the comma. When the fallback encoding is UTF-8, + but the input is invalid UTF-8, then the windows-1252 encoding (closely + related to ISO 8859-1) is used instead. UTF-8 detection works only for ASCII-compatible character sets. - NULL or "Auto": As above, with the encoding used by the system locale as @@ -111,7 +113,7 @@ const char *encoding_guess_head_encoding (const char *encoding, /* Refining an initial ASCII coding guess using later non-ASCII bytes. */ static inline bool encoding_guess_is_ascii_text (uint8_t c); size_t encoding_guess_count_ascii (const void *, size_t); -bool encoding_guess_tail_is_utf8 (const void *, size_t); +int encoding_guess_tail_is_utf8 (const void *, size_t); const char *encoding_guess_tail_encoding (const char *encoding, const void *, size_t); diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 9658866056..c04dd5acaf 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -769,3 +769,17 @@ is_encoding_supported (const char *encoding) return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1 && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1); } + +/* Returns true if E is the name of a UTF-8 encoding. + + XXX Possibly we should test not E as a string but its properties via + iconv. */ +bool +is_encoding_utf8 (const char *e) +{ + return ((e[0] == 'u' || e[0] == 'U') + && (e[1] == 't' || e[1] == 'T') + && (e[2] == 'f' || e[2] == 'F') + && ((e[3] == '8' && e[4] == '\0') + || (e[3] == '-' && e[4] == '8' && e[5] == '\0'))); +} diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index 383ff12da5..d973a81b1f 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -142,4 +142,6 @@ bool is_encoding_ascii_compatible (const char *encoding); bool is_encoding_ebcdic_compatible (const char *encoding); bool is_encoding_supported (const char *encoding); +bool is_encoding_utf8 (const char *encoding); + #endif /* i18n.h */ diff --git a/src/libpspp/u8-istream.c b/src/libpspp/u8-istream.c index c11163435f..77c14133ea 100644 --- a/src/libpspp/u8-istream.c +++ b/src/libpspp/u8-istream.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,7 @@ #include "libpspp/cast.h" #include "libpspp/compiler.h" #include "libpspp/encoding-guesser.h" +#include "libpspp/i18n.h" #include "gl/c-strcase.h" #include "gl/localcharset.h" @@ -120,7 +121,7 @@ u8_istream_for_fd (const char *fromcode, int fd) goto error; encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length); - if (!strcmp (encoding, "UTF-8")) + if (is_encoding_utf8 (encoding)) is->state = S_UTF8; else { diff --git a/tests/libpspp/encoding-guesser.at b/tests/libpspp/encoding-guesser.at index a2b0aabd9d..e969a48aca 100644 --- a/tests/libpspp/encoding-guesser.at +++ b/tests/libpspp/encoding-guesser.at @@ -141,3 +141,11 @@ AT_CHECK([printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\3 [0], [UTF-8 ]) AT_CLEANUP + +AT_SETUP([windows-1252 as Auto,UTF-8]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([i18n-test supports_encodings windows-1252]) +AT_CHECK([printf 'entr\351e' | encoding-guesser-test Auto,UTF-8 32], [0], + [windows-1252 +]) +AT_CLEANUP