encoding-guesser: New library to guess the encoding of a text file.

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 20 Mar 2011 16:43:42 +0000 (09:43 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 20 Mar 2011 16:43:42 +0000 (09:43 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 20 Mar 2011 16:43:42 +0000 (09:43 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 20 Mar 2011 16:43:42 +0000 (09:43 -0700)
diff --git a/Smake b/Smake

index 7efa2cfb1b022ae9afee853678e43e45f9cad669..3a3235c067380f39b3ec6ae43aaa441cb90fdd90 100644 (file)
--- a/Smake
+++ b/Smake
@@ -70,10 +70,11 @@ GNULIB_MODULES = \
         sys_stat \
         tempname \
         trunc \
-       unistd \
         unictype/property-id-continue \
         unictype/property-id-start \
         unigbrk/uc-is-grapheme-break \
+       unistd \
+       unistr/u8-check \
         unistr/u8-cpy \
         unistr/u8-mbtouc \
         unistr/u8-strlen \
diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk

index 823bbb35bd8091aae9466a49214990b8ac3680f6..5cf660ace5f417cbfdd551265e62d5630a137e07 100644 (file)
--- a/src/libpspp/automake.mk
+++ b/src/libpspp/automake.mk
@@ -20,6 +20,8 @@ src_libpspp_libpspp_la_SOURCES = \
         src/libpspp/copyleft.h \
         src/libpspp/deque.c \
         src/libpspp/deque.h \
+       src/libpspp/encoding-guesser.c \
+       src/libpspp/encoding-guesser.h \
         src/libpspp/ext-array.c \
         src/libpspp/ext-array.h \
         src/libpspp/float-format.c \
diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c

new file mode 100644 (file)

index 0000000..9042e93
--- /dev/null
+++ b/src/libpspp/encoding-guesser.c
@@ -0,0 +1,289 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/encoding-guesser.h"
+
+#include <errno.h>
+#include <iconv.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistr.h>
+
+#include "libpspp/cast.h"
+#include "libpspp/i18n.h"
+
+#include "gl/localcharset.h"
+#include "gl/c-strcase.h"
+
+/* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
+   of information about encoding detection.
+*/
+
+/* Extracts the fallback encoding named by ENCODING, which must use one of the
+   forms documented at the top of encoding-guesser.h.  The result is ENCODING
+   itself, a suffix of ENCODING, or the string that locale_charset() returns
+   when ENCODING is NULL, "auto", "auto,locale", or "locale". */
+const char *
+encoding_guess_parse_encoding (const char *encoding)
+{
+  bool use_locale = (encoding == NULL
+                     || c_strcasecmp (encoding, "auto") == 0
+                     || c_strcasecmp (encoding, "auto,locale") == 0
+                     || c_strcasecmp (encoding, "locale") == 0);
+
+  if (use_locale)
+    return locale_charset ();
+
+  return c_strncasecmp (encoding, "auto,", 5) == 0 ? encoding + 5 : encoding;
+}
+
+/* Tests whether ENCODING, given in one of the forms listed at the top of
+   encoding-guesser.h, requests autodetection: that is, whether it is NULL,
+   "auto", or begins with "auto," (compared case-insensitively). */
+bool
+encoding_guess_encoding_is_auto (const char *encoding)
+{
+  if (encoding == NULL || c_strncasecmp (encoding, "auto", 4) != 0)
+    return encoding == NULL;
+  return encoding[4] == ',' || encoding[4] == '\0';
+}
+
+/* Assemble an integer from the bytes at DATA.  The 32-bit readers widen to
+   uint32_t before shifting, since shifting into int's sign bit is undefined. */
+static uint16_t
+get_be16 (const uint8_t *data)
+{
+  return (data[0] << 8) | data[1];
+}
+
+static uint16_t
+get_le16 (const uint8_t *data)
+{
+  return (data[1] << 8) | data[0];
+}
+
+static uint32_t
+get_be32 (const uint8_t *data)
+{
+  return ((uint32_t) get_be16 (data) << 16) | get_be16 (data + 2);
+}
+
+static uint32_t
+get_le32 (const uint8_t *data)
+{
+  return ((uint32_t) get_le16 (data + 2) << 16) | get_le16 (data);
+}
+
+/* Guesses whether the N bytes at DATA are UTF-16 without a byte order mark by
+   counting null bytes at even and odd offsets.  Returns "UTF-16LE" or
+   "UTF-16BE", or NULL if no guess is made (e.g. a 16-bit unit is all zeros). */
+static const char *
+guess_utf16 (const uint8_t *data, size_t n)
+{
+  size_t n_even = 0, n_odd = 0;
+  size_t i;
+
+  if (n % 2 != 0 && n < ENCODING_GUESS_MIN)
+    return NULL;
+
+  for (i = 0; i + 1 < n; i += 2)
+    {
+      bool even_null = data[i] == 0;
+      bool odd_null = data[i + 1] == 0;
+
+      if (even_null && odd_null)
+        return NULL;
+      n_even += even_null;
+      n_odd += odd_null;
+    }
+
+  if (n_odd > n_even)
+    return "UTF-16LE";
+  return n_even > 0 ? "UTF-16BE" : NULL;
+}
+
+/* Returns true if every complete 32-bit unit in the N bytes at DATA, decoded
+   with GET_U32, lies in the range 0x09...0x10ffff, false otherwise.  Input
+   shorter than ENCODING_GUESS_MIN must be a multiple of 4 bytes long. */
+static bool
+is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
+{
+  const uint8_t *end = data + n / 4 * 4;
+
+  if (n % 4 != 0 && n < ENCODING_GUESS_MIN)
+    return false;
+
+  for (; data < end; data += 4)
+    {
+      uint32_t uc = get_u32 (data);
+      if (!(uc >= 0x09 && uc <= 0x10ffff))
+        return false;
+    }
+  return true;
+}
+
+/* Returns the length of the longest prefix of the N bytes at S_ that consists
+   only of ASCII text bytes, as judged by encoding_guess_is_ascii_text(). */
+size_t
+encoding_guess_count_ascii (const void *s_, size_t n)
+{
+  const uint8_t *bytes = s_;
+  size_t n_ascii = 0;
+
+  while (n_ascii < n && encoding_guess_is_ascii_text (bytes[n_ascii]))
+    n_ascii++;
+
+  return n_ascii;
+}
+
+/* Returns true if the N bytes at S_ consist entirely of ASCII text bytes (as
+   judged by encoding_guess_is_ascii_text()) and multibyte sequences that
+   u8_mbtoucr() accepts, false otherwise.  A trailing sequence that
+   u8_mbtoucr() reports as incomplete (-2) is accepted rather than rejected. */
+static bool
+is_all_utf8_text (const void *s_, size_t n)
+{
+  const uint8_t *p = s_;
+  const uint8_t *end = p + n;
+
+  while (p < end)
+    {
+      ucs4_t uc;
+      int mblen;
+
+      if (*p < 0x80)
+        {
+          if (!encoding_guess_is_ascii_text (*p))
+            return false;
+          p++;
+          continue;
+        }
+
+      mblen = u8_mbtoucr (&uc, p, end - p);
+      if (mblen < 0)
+        return mblen == -2;
+      p += mblen;
+    }
+  return true;
+}
+
+/* Attempts to guess the encoding of a text file based on ENCODING, an encoding
+   name in one of the forms described at the top of encoding-guesser.h, and
+   DATA, which contains the first N bytes of the file.  Returns the guessed
+   encoding, which might be ENCODING itself or a suffix of it or a statically
+   allocated string.  Autodetection only takes place if ENCODING actually
+   specifies it; otherwise the fallback encoding is returned unexamined.
+
+   UTF-8 cannot be distinguished from other ASCII-based encodings until a
+   non-ASCII text character is encountered.  If ENCODING specifies
+   autodetection and this function returns "ASCII", then the client should
+   process the input until it encounters a non-ASCII character (as returned by
+   encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
+   to make a final encoding guess.  See encoding-guesser.h for details.
+
+   N must be at least ENCODING_GUESS_MIN unless the file is shorter. */
+const char *
+encoding_guess_head_encoding (const char *encoding,
+                              const void *data_, size_t n)
+{
+  const uint8_t *data = data_;
+  const char *fallback_encoding;
+  const char *guess;
+
+  fallback_encoding = encoding_guess_parse_encoding (encoding);
+  if (!encoding_guess_encoding_is_auto (encoding))
+    return fallback_encoding;
+
+  if (n == 0)
+    return fallback_encoding;
+
+  /* Test for byte order marks.  UTF-32 comes before UTF-16 because the
+     little-endian UTF-32 mark begins with the little-endian UTF-16 mark. */
+  if ((n >= ENCODING_GUESS_MIN || n % 4 == 0)
+      && (get_be32 (data) == 0xfeff || get_le32 (data) == 0xfeff))
+    return "UTF-32";
+
+  if (n >= 4)
+    {
+      uint32_t x = get_be32 (data);
+      if (x == 0x84319533)
+        return "GB-18030";
+      else if (x == 0xdd736673)
+        return "UTF-EBCDIC";
+    }
+
+  if ((n >= ENCODING_GUESS_MIN || n % 2 == 0)
+      && (get_be16 (data) == 0xfeff || get_le16 (data) == 0xfeff))
+    return "UTF-16";
+
+  if (n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf)
+    return "UTF-8";
+
+  /* No byte order mark was found, so fall back to content heuristics. */
+  guess = guess_utf16 (data, n);
+  if (guess != NULL)
+    return guess;
+
+  if (is_utf32 (data, n, get_be32))
+    return "UTF-32BE";
+  if (is_utf32 (data, n, get_le32))
+    return "UTF-32LE";
+
+  if (!is_encoding_ascii_compatible (fallback_encoding)
+      || !encoding_guess_tail_is_utf8 (data, n))
+    return fallback_encoding;
+
+  if (!c_strcasecmp (fallback_encoding, "UTF-8")
+      || !c_strcasecmp (fallback_encoding, "UTF8"))
+    return "UTF-8";
+
+  return "ASCII";
+}
+
+/* Makes an encoding guess from ENCODING and the N bytes of text at DATA.
+   DATA should begin at the first byte of the input that is not ASCII text (as
+   determined by encoding_guess_is_ascii_text()).
+
+   Returns "UTF-8" if that data passes encoding_guess_tail_is_utf8(),
+   otherwise the fallback encoding for ENCODING.
+
+   See encoding-guesser.h for the intended use of this function.
+
+   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
+   that starting with the first non-ASCII text character. */
+const char *
+encoding_guess_tail_encoding (const char *encoding,
+                              const void *data, size_t n)
+{
+  if (encoding_guess_tail_is_utf8 (data, n))
+    return "UTF-8";
+  return encoding_guess_parse_encoding (encoding);
+}
+
+/* Like encoding_guess_tail_encoding(), except that the result is a boolean:
+   true where that function would say UTF-8, false for the fallback. */
+bool
+encoding_guess_tail_is_utf8 (const void *data, size_t n)
+{
+  if (n >= ENCODING_GUESS_MIN)
+    return is_all_utf8_text (data, n);
+  return u8_check (data, n) == NULL;
+}
+
diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h

new file mode 100644 (file)

index 0000000..2ec2fee
--- /dev/null
+++ b/src/libpspp/encoding-guesser.h
@@ -0,0 +1,126 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef LIBPSPP_ENCODING_GUESSER_H
+#define LIBPSPP_ENCODING_GUESSER_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* A library for autodetecting the encoding of a text file.
+
+   Naming Encodings
+   ----------------
+
+   The encoding guesser starts with an encoding name in one of various
+   different forms.  Some of the forms do not actually do any autodetection.
+   The encoding guesser will return the specified encoding without looking at
+   any file data:
+
+     - A valid IANA or system encoding name: These are returned as-is.
+
+     - "Locale": Translated to the encoding used by the system locale, as
+       returned by locale_charset().
+
+   The remaining forms that do perform autodetection are:
+
+     - "Auto," followed by a valid IANA or system encoding name (the "fallback
+       encoding"): Detects whether the input is encoded in UTF-8,
+       UTF-16, UTF-32, or a few other easily identifiable charsets.  When a
+       particular character set cannot be recognized, the guesser falls back to
+       the encoding following the comma.  UTF-8 detection works only for
+       ASCII-compatible character sets.
+
+     - NULL or "Auto": As above, with the encoding used by the system locale as
+       the fallback encoding.
+
+   The above are suggested capitalizations but encoding names are not
+   case-sensitive.
+
+   The encoding_guess_parse_encoding() and encoding_guess_encoding_is_auto()
+   functions work with encoding names in these forms.
+
+   Usage
+   -----
+
+   1. Call encoding_guess_head_encoding() with several bytes from the start of
+      the text file.  Feed in at least ENCODING_GUESS_MIN bytes, unless the
+      file is shorter than that, but as many more as are conveniently
+      available.  ENCODING_GUESS_SUGGESTED is a reasonable amount.
+
+      encoding_guess_head_encoding() returns its best guess at the file's
+      encoding.  Ordinarily it returns a final guess that the client can use to
+      interpret the file, and you're all done.  However, if it returns "ASCII"
+      and the original encoding name requests autodetection (which you can find
+      out by calling encoding_guess_encoding_is_auto()), then proceed to the
+      next step.
+
+   2. The encoding guesser is confident that the stream uses an ASCII
+      compatible encoding, either UTF-8 or the fallback encoding.  The client
+      may safely read and process the stream up to the first non-ASCII
+      character.  If the stream continues to be ASCII all the way to its end,
+      then we're done.
+
+      The encoding guesser provides a pair of functions to detect non-ASCII
+      characters: encoding_guess_is_ascii_text() for single characters and
+      encoding_guess_count_ascii() as a convenient wrapper for whole buffers.
+
+   3. Otherwise, the stream contains some non-ASCII data at some point.  Now
+      the client should gather several bytes starting at this point, at least
+      ENCODING_GUESS_MIN, unless the file ends before that, but as many more as
+      are conveniently available.  ENCODING_GUESS_SUGGESTED is a reasonable
+      amount.
+
+      The client should pass these bytes to encoding_guess_tail_encoding(),
+      which returns a best and final guess at the file's encoding, which is
+      either UTF-8 or the fallback encoding.  Another alternative is
+      encoding_guess_tail_is_utf8(), which guesses the same way but has a
+      different form of return value.
+*/
+
+/* Minimum number of bytes for use in autodetection.
+   You should only pass fewer bytes to the autodetection routines if the file
+   is actually shorter than this. */
+#define ENCODING_GUESS_MIN              16
+
+/* Suggested minimum buffer size to use for autodetection. */
+#define ENCODING_GUESS_SUGGESTED        1024
+
+/* Parsing encoding names. */
+const char *encoding_guess_parse_encoding (const char *encoding);
+bool encoding_guess_encoding_is_auto (const char *encoding);
+
+/* Making an initial coding guess based on the start of a file. */
+const char *encoding_guess_head_encoding (const char *encoding,
+                                          const void *, size_t);
+
+/* Refining an initial ASCII coding guess using later non-ASCII bytes. */
+static inline bool encoding_guess_is_ascii_text (uint8_t c);
+size_t encoding_guess_count_ascii (const void *, size_t);
+bool encoding_guess_tail_is_utf8 (const void *, size_t);
+const char *encoding_guess_tail_encoding (const char *encoding,
+                                          const void *, size_t);
+
+/* Returns true if C is a byte in the printable ASCII range 0x20...0x7e or in
+   the control range 0x09...0x0d (tab through carriage return), else false. */
+static inline bool
+encoding_guess_is_ascii_text (uint8_t c)
+{
+  return (c >= 0x09 && c <= 0x0d) || (c >= 0x20 && c <= 0x7e);
+}
+
+#endif /* libpspp/encoding-guesser.h */
diff --git a/tests/automake.mk b/tests/automake.mk

index 639af7e14a0dc2a4ba5dc727089a0f282f266375..7ef7d423fabf2febea9bb051d715498987db0479 100644 (file)
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -7,6 +7,7 @@ check_PROGRAMS += \
         tests/language/lexer/command-name-test \
         tests/libpspp/abt-test \
         tests/libpspp/bt-test \
+       tests/libpspp/encoding-guesser-test \
         tests/libpspp/heap-test \
         tests/libpspp/hmap-test \
         tests/libpspp/hmapx-test \
@@ -50,6 +51,10 @@ tests_libpspp_llx_test_SOURCES = \
  tests_libpspp_llx_test_LDADD = gl/libgl.la $(LIBINTL)
  tests_libpspp_llx_test_CFLAGS = $(AM_CFLAGS)
  
+tests_libpspp_encoding_guesser_test_SOURCES = \
+       tests/libpspp/encoding-guesser-test.c
+tests_libpspp_encoding_guesser_test_LDADD = src/libpspp/libpspp.la gl/libgl.la
+
  tests_libpspp_heap_test_SOURCES = \
         src/libpspp/heap.c \
         src/libpspp/pool.c \
@@ -313,6 +318,7 @@ TESTSUITE_AT = \
         tests/language/xforms/select-if.at \
         tests/libpspp/abt.at \
         tests/libpspp/bt.at \
+       tests/libpspp/encoding-guesser.at \
         tests/libpspp/float-format.at \
         tests/libpspp/heap.at \
         tests/libpspp/hmap.at \
diff --git a/tests/libpspp/encoding-guesser-test.c b/tests/libpspp/encoding-guesser-test.c

new file mode 100644 (file)

index 0000000..a20607e
--- /dev/null
+++ b/tests/libpspp/encoding-guesser-test.c
@@ -0,0 +1,102 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/encoding-guesser.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libpspp/i18n.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/xalloc.h"
+
+static void
+usage (void)
+{
+  printf ("usage: %s [OTHER_ENCODING] [BUFSIZE] < INPUT\n"
+          "where OTHER_ENCODING is the fallback encoding (default taken\n"
+          "                     from the current locale)\n"
+          "  and BUFSIZE is the buffer size (default %d)\n",
+          program_name, ENCODING_GUESS_MIN);
+  exit (0);     /* Help goes to stdout and the program exits with status 0. */
+}
+
+/* Reads standard input and prints the encoding guessed for it.  The optional
+   arguments are the fallback encoding and the buffer size; see usage(). */
+int
+main (int argc, char *argv[])
+{
+  const char *encoding = NULL;
+  const char *guess;
+  char *buffer;
+  int bufsize = 0;
+  size_t n;
+  int i;
+
+  set_program_name (argv[0]);
+  i18n_init ();
+
+  for (i = 1; i < argc; i++)
+    {
+      const char *arg = argv[i];
+      /* isdigit() is undefined for negative values, so convert first. */
+      int is_number = isdigit ((unsigned char) arg[0]);
+
+      if (!strcmp (arg, "--help"))
+        usage ();
+      else if (is_number && bufsize == 0)
+        {
+          bufsize = atoi (arg);
+          if (bufsize < ENCODING_GUESS_MIN)
+            error (1, 0, "buffer size %s is less than minimum size %d",
+                   arg, ENCODING_GUESS_MIN);
+        }
+      else if (!is_number && encoding == NULL)
+        encoding = arg;
+      else
+        error (1, 0, "bad syntax; use `%s --help' for help", program_name);
+    }
+
+  if (bufsize == 0)
+    bufsize = ENCODING_GUESS_MIN;
+
+  buffer = xmalloc (bufsize);
+  n = fread (buffer, 1, bufsize, stdin);
+  guess = encoding_guess_head_encoding (encoding, buffer, n);
+  if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
+    while (n > 0)
+      {
+        size_t n_ascii = encoding_guess_count_ascii (buffer, n);
+        if (n == n_ascii)
+          n = fread (buffer, 1, bufsize, stdin);
+        else
+          {
+            memmove (buffer, buffer + n_ascii, n - n_ascii);
+            n -= n_ascii;
+            n += fread (buffer + n, 1, bufsize - n, stdin);
+            guess = encoding_guess_tail_encoding (encoding, buffer, n);
+            break;
+          }
+      }
+  puts (guess);
+  return 0;
+}
diff --git a/tests/libpspp/encoding-guesser.at b/tests/libpspp/encoding-guesser.at

new file mode 100644 (file)

index 0000000..d63dc37
--- /dev/null
+++ b/tests/libpspp/encoding-guesser.at
@@ -0,0 +1,143 @@
+AT_BANNER([encoding guesser])
+
+AT_SETUP([ASCII])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([echo string | encoding-guesser-test Auto,ISO-8859-1], [0], [ASCII
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-8])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' | encoding-guesser-test Auto,ISO-8859-1], [0], [UTF-8
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-8 starting with ASCII])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\346\227\245\346\234\254\350\252\236\n' | encoding-guesser-test Auto,ISO-8859-1 32], [0], [UTF-8
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16 with big-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\376\377' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-16
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16 with little-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\377\376' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-16
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16BE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-16BE
+])
+AT_CLEANUP
+
+dnl Unicode U+XX00 characters are confusing in UTF-16 because they look
+dnl likely to be of the opposite endianness, so this tests for proper handling.
+AT_SETUP([UTF-16BE starting with U+0100])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\1\0\0e\0n\0t\0r\0\351\0e\0\n' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-16BE
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16LE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf 'e\0n\0t\0r\0\351\0e\0\n\0' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-16LE
+])
+AT_CLEANUP
+
+dnl Unicode U+XX00 characters are confusing in UTF-16 because they look
+dnl likely to be of the opposite endianness, so this tests for proper handling.
+AT_SETUP([UTF-16LE starting with U+0100])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0\1e\0n\0t\0r\0\351\0e\0\n\0' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-16LE
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32 with big-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0\0\376\377' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-32
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32 with little-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\377\376\0\0' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-32
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32BE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0\0\0e\0\0\0n\0\0\0t\0\0\0r\0\0\0\351\0\0\0e\0\0\0\n' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-32BE
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32LE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf 'e\0\0\0n\0\0\0t\0\0\0r\0\0\0\351\0\0\0e\0\0\0\n\0\0\0' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-32LE
+])
+AT_CLEANUP
+
+AT_SETUP([ISO-8859-1])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf 'entr\351e\n' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [ISO-8859-1
+])
+AT_CLEANUP
+
+AT_SETUP([GB-18030 with byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\204\061\225\063' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [GB-18030
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-EBCDIC with byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\335\163\146\163' | encoding-guesser-test Auto,ISO-8859-1], 
+  [0], [UTF-EBCDIC
+])
+AT_CLEANUP
+
+AT_SETUP([EUC-JP as Auto,EUC-JP])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | encoding-guesser-test Auto,EUC-JP],
+  [0], [EUC-JP
+])
+AT_CLEANUP
+
+AT_SETUP([EUC-JP starting with ASCII as Auto,EUC-JP])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | encoding-guesser-test Auto,EUC-JP 32],
+  [0], [EUC-JP
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-8 with character split across input buffers])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n' | encoding-guesser-test Auto,ISO-8859-1 32],
+  [0], [UTF-8
+])
+AT_CLEANUP
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 20 Mar 2011 16:43:42 +0000 (09:43 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 20 Mar 2011 16:43:42 +0000 (09:43 -0700)
Smake		patch \| blob \| history
src/libpspp/automake.mk		patch \| blob \| history
src/libpspp/encoding-guesser.c	[new file with mode: 0644]	patch \| blob
src/libpspp/encoding-guesser.h	[new file with mode: 0644]	patch \| blob
tests/automake.mk		patch \| blob \| history
tests/libpspp/encoding-guesser-test.c	[new file with mode: 0644]	patch \| blob
tests/libpspp/encoding-guesser.at	[new file with mode: 0644]	patch \| blob