Fixed many warnings

[pspp] / src / libpspp / encoding-guesser.c
diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c

index 9042e93a2fd41db02012d19f206c378748adc7c8..bee29782f2a5e582d4cc7c463c4b8d269de17799 100644 (file)
--- a/src/libpspp/encoding-guesser.c
+++ b/src/libpspp/encoding-guesser.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -36,22 +36,26 @@
     of information about encoding detection.
  */
  
-/* Parses and returns the fallback encoding from ENCODING, which must be in one
-   of the forms described at the top of encoding-guesser.h.  The returned
-   string might be ENCODING itself or a suffix of it, or it might be a
-   statically allocated string. */
+/* Returns the encoding specified by ENCODING, which must be in one of the
+   forms described at the top of encoding-guesser.h.  The returned string might
+   be ENCODING itself or a suffix of it, or it might be a statically allocated
+   string. */
  const char *
  encoding_guess_parse_encoding (const char *encoding)
  {
+  const char *fallback;
+
    if (encoding == NULL
        || !c_strcasecmp (encoding, "auto")
        || !c_strcasecmp (encoding, "auto,locale")
        || !c_strcasecmp (encoding, "locale"))
-    return locale_charset ();
+    fallback = locale_charset ();
    else if (!c_strncasecmp (encoding, "auto,", 5))
-    return encoding + 5;
+    fallback = encoding + 5;
    else
      return encoding;
+
+  return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
  }
  
  /* Returns true if ENCODING, which must be in one of the forms described at the
@@ -88,7 +92,6 @@ static uint32_t
  get_le32 (const uint8_t *data)
  {
    return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
-
  }
  
  static const char *
@@ -250,10 +253,6 @@ encoding_guess_head_encoding (const char *encoding,
        || !encoding_guess_tail_is_utf8 (data, n))
      return fallback_encoding;
  
-  if (!c_strcasecmp (fallback_encoding, "UTF-8")
-      || !c_strcasecmp (fallback_encoding, "UTF8"))
-    return "UTF-8";
-
    return "ASCII";
  }
  
@@ -272,18 +271,58 @@ const char *
  encoding_guess_tail_encoding (const char *encoding,
                                const void *data, size_t n)
  {
-  return (encoding_guess_tail_is_utf8 (data, n)
+  return (encoding_guess_tail_is_utf8 (data, n) != 0
            ? "UTF-8"
            : encoding_guess_parse_encoding (encoding));
  }
  
-/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false
-   for the fallback encoding. */
-bool
+/* Returns an encoding guess based on the N bytes of text starting at DATA.
+   DATA should start with the first non-ASCII text character (as determined by
+   encoding_guess_is_ascii_text()) found in the input.
+
+   The return value is:
+
+       0, if the encoding is definitely not UTF-8 (because the input contains
+       byte sequences that are not valid in UTF-8).
+
+       1, if the encoding appears to be UTF-8 (because the input contains valid
+       UTF-8 multibyte sequences).
+
+       -1, if the input contains only ASCII characters.  (This means that the
+       input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
+
+   See encoding-guesser.h for intended use of this function.
+
+   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
+   that starting with the first non-ASCII text character. */
+int
  encoding_guess_tail_is_utf8 (const void *data, size_t n)
  {
+  /* If all the bytes are in the ASCII range, it's just ASCII. */
+  if (encoding_guess_count_ascii (data, n) == n)
+    return -1;
+
    return (n < ENCODING_GUESS_MIN
            ? u8_check (data, n) == NULL
            : is_all_utf8_text (data, n));
  }
  
+/* Attempts to guess the encoding of a text file based on ENCODING, an encoding
+   name in one of the forms described at the top of encoding-guesser.h, and the
+   SIZE bytes in TEXT, which contains the entire contents of the file.  Returns
+   the guessed encoding, which might be ENCODING itself or a suffix of it or a
+   statically allocated string.
+
+   Encoding autodetection only takes place if ENCODING actually specifies
+   autodetection.  See encoding-guesser.h for details. */
+const char *
+encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
+{
+  const char *guess;
+
+  guess = encoding_guess_head_encoding (encoding, text, size);
+  if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
+    return encoding_guess_tail_encoding (encoding, text, size);
+  else
+    return guess;
+}