encoding-guesser: Avoid reading uninitialized data for zero-length files.

[pspp] / src / libpspp / encoding-guesser.c
diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c

index b55f24a1de442db23f2cc603f60d7333853bcaab..2f3438ff811ad954359261686871cd061db7bc48 100644 (file)
--- a/src/libpspp/encoding-guesser.c
+++ b/src/libpspp/encoding-guesser.c
@@ -43,19 +43,15 @@
  const char *
  encoding_guess_parse_encoding (const char *encoding)
  {
-  const char *fallback;
-
    if (encoding == NULL
        || !c_strcasecmp (encoding, "auto")
        || !c_strcasecmp (encoding, "auto,locale")
        || !c_strcasecmp (encoding, "locale"))
-    fallback = locale_charset ();
+    return locale_charset ();
    else if (!c_strncasecmp (encoding, "auto,", 5))
-    fallback = encoding + 5;
+    return encoding + 5;
    else
      return encoding;
-
-  return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
  }
  
  /* Returns true if ENCODING, which must be in one of the forms described at the
@@ -193,28 +189,34 @@ is_utf8_bom (const uint8_t *data, size_t n)
    return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf;
  }
  
+static bool
+is_bom_length (size_t n, size_t w)
+{
+  return n >= ENCODING_GUESS_MIN || (n && n % w == 0);
+}
+
  static bool
  is_utf16le_bom (const uint8_t *data, size_t n)
  {
-  return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff;
+  return is_bom_length (n, 2) && get_le16 (data) == 0xfeff;
  }
  
  static bool
  is_utf16be_bom (const uint8_t *data, size_t n)
  {
-  return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff;
+  return is_bom_length (n, 2) && get_be16 (data) == 0xfeff;
  }
  
  static bool
  is_utf32le_bom (const uint8_t *data, size_t n)
  {
-  return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff;
+  return is_bom_length (n, 4) && get_le32 (data) == 0xfeff;
  }
  
  static bool
  is_utf32be_bom (const uint8_t *data, size_t n)
  {
-  return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff;
+  return is_bom_length (n, 4) && get_be32 (data) == 0xfeff;
  }
  
  /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
@@ -277,10 +279,25 @@ encoding_guess_head_encoding (const char *encoding,
    if (is_utf32 (data, n, get_le32))
      return "UTF-32LE";
  
-  if (!is_encoding_ascii_compatible (fallback_encoding)
-      || !encoding_guess_tail_is_utf8 (data, n))
+  /* We've tried all the "giveaways" that make the encoding obvious.  That
+     rules out, incidentally, all the encodings with multibyte units
+     (e.g. UTF-16, UTF-32).  Our remaining goal is to try to distinguish UTF-8
+     from some ASCII-based fallback encoding. */
+
+  /* If the fallback encoding isn't ASCII compatible, give up. */
+  if (!is_encoding_ascii_compatible (fallback_encoding))
      return fallback_encoding;
  
+  /* If the data we have clearly is not UTF-8, give up. */
+  if (!encoding_guess_tail_is_utf8 (data, n))
+    {
+      /* If the fallback encoding is UTF-8, fall back on something else. */
+      if (is_encoding_utf8 (fallback_encoding))
+        return "windows-1252";
+
+      return fallback_encoding;
+    }
+
    return "ASCII";
  }
  
@@ -333,9 +350,21 @@ const char *
  encoding_guess_tail_encoding (const char *encoding,
                                const void *data, size_t n)
  {
-  return (encoding_guess_tail_is_utf8 (data, n) != 0
-          ? "UTF-8"
-          : encoding_guess_parse_encoding (encoding));
+
+  if (encoding_guess_tail_is_utf8 (data, n) != 0)
+    return "UTF-8";
+  else
+    {
+      /* The data is not UTF-8. */
+      const char *fallback_encoding = encoding_guess_parse_encoding (encoding);
+
+      /* If the fallback encoding is UTF-8, fall back on something else. */
+      if (is_encoding_utf8 (fallback_encoding))
+        return "windows-1252";
+
+      return fallback_encoding;
+    }
+
  }
  
  /* Returns an encoding guess based on ENCODING and the N bytes of text starting