From: Ben Pfaff <blp@cs.stanford.edu>
Date: Thu, 1 Mar 2012 06:43:22 +0000 (-0800)
Subject: encoding-guesser: Fall back to windows-1252 when UTF-8 can't be right.
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=d6c75296e5573a997c79a7af1195b6a619c0190c

encoding-guesser: Fall back to windows-1252 when UTF-8 can't be right.

Until now the encoding-guesser code has used UTF-8 as a fallback in
situations where we can tell that the file is not valid UTF-8.  In
this kind of situation having a single-byte character set as a
fallback makes more sense.  This commit hard-codes windows-1252 as
that fallback, since it is a widely encountered encoding (and
compatible with ISO-8859-1 as well).

John Darrington originally suggested this, if I recall correctly.

The bug report that spurred this work was from Harry Thijssen.  With
this commit, PSPP properly reads his windows-1252 file when the
system locale uses UTF-8 encoding.
---

diff --git a/doc/utilities.texi b/doc/utilities.texi
index 40648d430c..35dd393a78 100644
--- a/doc/utilities.texi
+++ b/doc/utilities.texi
@@ -313,14 +313,23 @@ are @code{ASCII} (United States), @code{ISO-8859-1} (western Europe),
 @code{EUC-JP} (Japan), and @code{windows-1252} (Windows).  Not all
 systems support all character sets.
 
-@item @code{Auto}
 @item @code{Auto,@var{encoding}}
-Automatically detects whether a syntax file is encoded in
-@var{encoding} or in a Unicode encoding such as UTF-8, UTF-16, or
-UTF-32.  The @var{encoding} may be an IANA character set name or
-@code{Locale} (the default).  Only ASCII compatible encodings can
-automatically be distinguished from UTF-8 (the most common locale
-encodings are all ASCII-compatible).
+Automatically detects whether a syntax file is encoded in a Unicode
+encoding such as UTF-8, UTF-16, or UTF-32.  If it is not, then PSPP
+generally assumes that the file is encoded in @var{encoding} (an IANA
+character set name).  However, if @var{encoding} is UTF-8, and the
+syntax file is not valid UTF-8, PSPP instead assumes that the file
+is encoded in @code{windows-1252}.
+
+For best results, @var{encoding} should be an ASCII-compatible
+encoding (the most common locale encodings are all ASCII-compatible),
+because encodings that are not ASCII compatible cannot be
+automatically distinguished from UTF-8.
+
+@item @code{Auto}
+@item @code{Auto,Locale}
+Automatic detection, as above, with the default encoding taken from
+the system locale or the setting on SET LOCALE.
 @end table
 
 When ENCODING is not specified, the default is taken from the
diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c
index 27e2cda5dc..bee29782f2 100644
--- a/src/libpspp/encoding-guesser.c
+++ b/src/libpspp/encoding-guesser.c
@@ -36,22 +36,26 @@
    of information about encoding detection.
 */
 
-/* Parses and returns the fallback encoding from ENCODING, which must be in one
-   of the forms described at the top of encoding-guesser.h.  The returned
-   string might be ENCODING itself or a suffix of it, or it might be a
-   statically allocated string. */
+/* Returns the encoding specified by ENCODING, which must be in one of the
+   forms described at the top of encoding-guesser.h, except that a UTF-8
+   result from NULL, "Locale", or an "Auto" form becomes "windows-1252".  The
+   result might be ENCODING, a suffix of it, or a static string. */
 const char *
 encoding_guess_parse_encoding (const char *encoding)
 {
+  const char *fallback;
+
   if (encoding == NULL
       || !c_strcasecmp (encoding, "auto")
       || !c_strcasecmp (encoding, "auto,locale")
       || !c_strcasecmp (encoding, "locale"))
-    return locale_charset ();
+    fallback = locale_charset ();
   else if (!c_strncasecmp (encoding, "auto,", 5))
-    return encoding + 5;
+    fallback = encoding + 5;
   else
     return encoding;
+
+  return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
 }
 
 /* Returns true if ENCODING, which must be in one of the forms described at the
@@ -267,16 +271,37 @@ const char *
 encoding_guess_tail_encoding (const char *encoding,
                               const void *data, size_t n)
 {
-  return (encoding_guess_tail_is_utf8 (data, n)
+  return (encoding_guess_tail_is_utf8 (data, n) != 0
           ? "UTF-8"
           : encoding_guess_parse_encoding (encoding));
 }
 
-/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false
-   for the fallback encoding. */
-bool
+/* Guesses whether the N bytes of text starting at DATA are encoded in
+   UTF-8.  DATA should start with the first non-ASCII text character (as
+   determined by encoding_guess_is_ascii_text()) found in the input.
+
+   The return value is:
+
+       0, if the encoding is definitely not UTF-8 (because the input contains
+       byte sequences that are not valid in UTF-8).
+
+       1, if the encoding appears to be UTF-8 (because the input contains valid
+       UTF-8 multibyte sequences).
+
+       -1, if the input contains only ASCII characters.  (This means that the
+       input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
+
+   See encoding-guesser.h for intended use of this function.
+
+   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
+   that starting with the first non-ASCII text character. */
+int
 encoding_guess_tail_is_utf8 (const void *data, size_t n)
 {
+  /* If all the bytes are in the ASCII range, it's just ASCII. */
+  if (encoding_guess_count_ascii (data, n) == n)
+    return -1;
+
   return (n < ENCODING_GUESS_MIN
           ? u8_check (data, n) == NULL
           : is_all_utf8_text (data, n));
@@ -297,15 +322,7 @@ encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
 
   guess = encoding_guess_head_encoding (encoding, text, size);
   if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
-    {
-      size_t ofs = encoding_guess_count_ascii (text, size);
-      if (ofs < size)
-        return encoding_guess_tail_encoding (encoding,
-                                             (const char *) text + ofs,
-                                             size - ofs);
-      else
-        return encoding_guess_parse_encoding (encoding);
-    }
+    return encoding_guess_tail_encoding (encoding, text, size);
   else
     return guess;
 }
diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h
index 0a7d1f99a4..2e8cb9abbb 100644
--- a/src/libpspp/encoding-guesser.h
+++ b/src/libpspp/encoding-guesser.h
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -42,7 +42,9 @@
        encoding"): Requests detection whether the input is encoded in UTF-8,
        UTF-16, UTF-32, or a few other easily identifiable charsets.  When a
        particular character set cannot be recognized, the guesser falls back to
-       the encoding following the comma.  UTF-8 detection works only for
+       the encoding following the comma.  When the fallback encoding is UTF-8,
+       but the input is invalid UTF-8, then the windows-1252 encoding (closely
+       related to ISO 8859-1) is used instead.  UTF-8 detection works only for
        ASCII-compatible character sets.
 
      - NULL or "Auto": As above, with the encoding used by the system locale as
@@ -111,7 +113,7 @@ const char *encoding_guess_head_encoding (const char *encoding,
 /* Refining an initial ASCII coding guess using later non-ASCII bytes. */
 static inline bool encoding_guess_is_ascii_text (uint8_t c);
 size_t encoding_guess_count_ascii (const void *, size_t);
-bool encoding_guess_tail_is_utf8 (const void *, size_t);
+int encoding_guess_tail_is_utf8 (const void *, size_t);
 const char *encoding_guess_tail_encoding (const char *encoding,
                                           const void *, size_t);
 
diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c
index 9658866056..c04dd5acaf 100644
--- a/src/libpspp/i18n.c
+++ b/src/libpspp/i18n.c
@@ -769,3 +769,17 @@ is_encoding_supported (const char *encoding)
   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 }
+
+/* Returns true if E is "UTF-8" or "UTF8", compared case-insensitively.
+
+   XXX Possibly we should test not E as a string but its properties via
+   iconv. */
+bool
+is_encoding_utf8 (const char *e)
+{
+  return ((e[0] == 'u' || e[0] == 'U')
+          && (e[1] == 't' || e[1] == 'T')
+          && (e[2] == 'f' || e[2] == 'F')
+          && ((e[3] == '8' && e[4] == '\0')
+              || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
+}
diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h
index 383ff12da5..d973a81b1f 100644
--- a/src/libpspp/i18n.h
+++ b/src/libpspp/i18n.h
@@ -142,4 +142,6 @@ bool is_encoding_ascii_compatible (const char *encoding);
 bool is_encoding_ebcdic_compatible (const char *encoding);
 bool is_encoding_supported (const char *encoding);
 
+bool is_encoding_utf8 (const char *encoding);
+
 #endif /* i18n.h */
diff --git a/src/libpspp/u8-istream.c b/src/libpspp/u8-istream.c
index c11163435f..77c14133ea 100644
--- a/src/libpspp/u8-istream.c
+++ b/src/libpspp/u8-istream.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -34,6 +34,7 @@
 #include "libpspp/cast.h"
 #include "libpspp/compiler.h"
 #include "libpspp/encoding-guesser.h"
+#include "libpspp/i18n.h"
 
 #include "gl/c-strcase.h"
 #include "gl/localcharset.h"
@@ -120,7 +121,7 @@ u8_istream_for_fd (const char *fromcode, int fd)
     goto error;
 
   encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length);
-  if (!strcmp (encoding, "UTF-8"))
+  if (is_encoding_utf8 (encoding))
     is->state = S_UTF8;
   else
     {
diff --git a/tests/libpspp/encoding-guesser.at b/tests/libpspp/encoding-guesser.at
index a2b0aabd9d..e969a48aca 100644
--- a/tests/libpspp/encoding-guesser.at
+++ b/tests/libpspp/encoding-guesser.at
@@ -141,3 +141,11 @@ AT_CHECK([printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\3
   [0], [UTF-8
 ])
 AT_CLEANUP
+
+AT_SETUP([windows-1252 as Auto,UTF-8])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([i18n-test supports_encodings windows-1252])
+AT_CHECK([printf 'entr\351e' | encoding-guesser-test Auto,UTF-8 32], [0],
+  [windows-1252
+])
+AT_CLEANUP