X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flibpspp%2Fencoding-guesser.c;h=bee29782f2a5e582d4cc7c463c4b8d269de17799;hb=ddf35b11f8525281dd9e524a5d322a80bf56a24f;hp=9042e93a2fd41db02012d19f206c378748adc7c8;hpb=f3668539947d5baed813a4f8436d6cf36abeedd2;p=pspp

diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c
index 9042e93a2f..bee29782f2 100644
--- a/src/libpspp/encoding-guesser.c
+++ b/src/libpspp/encoding-guesser.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -36,22 +36,26 @@
    of information about encoding detection.
 */
 
-/* Parses and returns the fallback encoding from ENCODING, which must be in one
-   of the forms described at the top of encoding-guesser.h.  The returned
-   string might be ENCODING itself or a suffix of it, or it might be a
-   statically allocated string. */
+/* Returns the encoding specified by ENCODING, which must be in one of the
+   forms described at the top of encoding-guesser.h.  The returned string might
+   be ENCODING itself or a suffix of it, or it might be a statically allocated
+   string. */
 const char *
 encoding_guess_parse_encoding (const char *encoding)
 {
+  const char *fallback;
+
   if (encoding == NULL
       || !c_strcasecmp (encoding, "auto")
       || !c_strcasecmp (encoding, "auto,locale")
       || !c_strcasecmp (encoding, "locale"))
-    return locale_charset ();
+    fallback = locale_charset ();
   else if (!c_strncasecmp (encoding, "auto,", 5))
-    return encoding + 5;
+    fallback = encoding + 5;
   else
     return encoding;
+
+  return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
 }
 
 /* Returns true if ENCODING, which must be in one of the forms described at the
@@ -88,7 +92,6 @@ static uint32_t
 get_le32 (const uint8_t *data)
 {
   return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
-
 }
 
 static const char *
@@ -250,10 +253,6 @@ encoding_guess_head_encoding (const char *encoding,
       || !encoding_guess_tail_is_utf8 (data, n))
     return fallback_encoding;
 
-  if (!c_strcasecmp (fallback_encoding, "UTF-8")
-      || !c_strcasecmp (fallback_encoding, "UTF8"))
-    return "UTF-8";
-
   return "ASCII";
 }
 
@@ -272,18 +271,58 @@ const char *
 encoding_guess_tail_encoding (const char *encoding,
                               const void *data, size_t n)
 {
-  return (encoding_guess_tail_is_utf8 (data, n)
+  return (encoding_guess_tail_is_utf8 (data, n) != 0
           ? "UTF-8"
           : encoding_guess_parse_encoding (encoding));
 }
 
-/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false
-   for the fallback encoding. */
-bool
+/* Returns a guess at whether the N bytes of text starting at DATA are encoded
+   in UTF-8.  DATA should start with the first non-ASCII text character (as
+   determined by encoding_guess_is_ascii_text()) found in the input.
+
+   The return value is:
+
+       0, if the encoding is definitely not UTF-8 (because the input contains
+       byte sequences that are not valid in UTF-8).
+
+       1, if the encoding appears to be UTF-8 (because the input contains valid
+       UTF-8 multibyte sequences).
+
+       -1, if the input contains only ASCII characters.  (This means that the
+       input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
+
+   See encoding-guesser.h for intended use of this function.
+
+   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
+   that starting with the first non-ASCII text character. */
+int
 encoding_guess_tail_is_utf8 (const void *data, size_t n)
 {
+  /* If all the bytes are in the ASCII range, it's just ASCII. */
+  if (encoding_guess_count_ascii (data, n) == n)
+    return -1;
+
   return (n < ENCODING_GUESS_MIN
           ? u8_check (data, n) == NULL
           : is_all_utf8_text (data, n));
 }
 
+/* Attempts to guess the encoding of a text file based on ENCODING, an encoding
+   name in one of the forms described at the top of encoding-guesser.h, and the
+   SIZE bytes in TEXT, which contains the entire contents of the file.  Returns
+   the guessed encoding, which might be ENCODING itself or a suffix of it or a
+   statically allocated string.
+
+   Encoding autodetection only takes place if ENCODING actually specifies
+   autodetection.  See encoding-guesser.h for details. */
+const char *
+encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
+{
+  const char *guess;
+
+  guess = encoding_guess_head_encoding (encoding, text, size);
+  if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
+    return encoding_guess_tail_encoding (encoding, text, size);
+  else
+    return guess;
+}