From f3668539947d5baed813a4f8436d6cf36abeedd2 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 20 Mar 2011 09:43:42 -0700 Subject: [PATCH] encoding-guesser: New library to guess the encoding of a text file. This will be used by other new libraries in upcoming commits. --- Smake | 3 +- src/libpspp/automake.mk | 2 + src/libpspp/encoding-guesser.c | 289 ++++++++++++++++++++++++++ src/libpspp/encoding-guesser.h | 126 +++++++++++ tests/automake.mk | 6 + tests/libpspp/encoding-guesser-test.c | 102 +++++++++ tests/libpspp/encoding-guesser.at | 143 +++++++++++++ 7 files changed, 670 insertions(+), 1 deletion(-) create mode 100644 src/libpspp/encoding-guesser.c create mode 100644 src/libpspp/encoding-guesser.h create mode 100644 tests/libpspp/encoding-guesser-test.c create mode 100644 tests/libpspp/encoding-guesser.at diff --git a/Smake b/Smake index 7efa2cfb..3a3235c0 100644 --- a/Smake +++ b/Smake @@ -70,10 +70,11 @@ GNULIB_MODULES = \ sys_stat \ tempname \ trunc \ - unistd \ unictype/property-id-continue \ unictype/property-id-start \ unigbrk/uc-is-grapheme-break \ + unistd \ + unistr/u8-check \ unistr/u8-cpy \ unistr/u8-mbtouc \ unistr/u8-strlen \ diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk index 823bbb35..5cf660ac 100644 --- a/src/libpspp/automake.mk +++ b/src/libpspp/automake.mk @@ -20,6 +20,8 @@ src_libpspp_libpspp_la_SOURCES = \ src/libpspp/copyleft.h \ src/libpspp/deque.c \ src/libpspp/deque.h \ + src/libpspp/encoding-guesser.c \ + src/libpspp/encoding-guesser.h \ src/libpspp/ext-array.c \ src/libpspp/ext-array.h \ src/libpspp/float-format.c \ diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c new file mode 100644 index 00000000..9042e93a --- /dev/null +++ b/src/libpspp/encoding-guesser.c @@ -0,0 +1,289 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. 
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see . */
+
+#include
+
+#include "libpspp/encoding-guesser.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "libpspp/cast.h"
+#include "libpspp/i18n.h"
+
+#include "gl/localcharset.h"
+#include "gl/c-strcase.h"
+
+/* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
+   of information about encoding detection.
+*/
+
+/* Extracts the fallback encoding from ENCODING, which must be in one of the
+   forms described at the top of encoding-guesser.h:
+
+     - NULL, "Auto", "Locale", or "Auto,Locale" yield the value of
+       locale_charset().
+
+     - "Auto,NAME" yields NAME, that is, whatever follows the comma.
+
+     - Anything else is treated as an encoding name and returned unchanged.
+
+   All comparisons ignore case.  The returned string might be ENCODING itself
+   or a suffix of it, or it might be a statically allocated string. */
+const char *
+encoding_guess_parse_encoding (const char *encoding)
+{
+  /* No name at all means "use the locale". */
+  if (encoding == NULL)
+    return locale_charset ();
+
+  /* The spellings that explicitly or implicitly select the locale. */
+  if (c_strcasecmp (encoding, "auto") == 0
+      || c_strcasecmp (encoding, "locale") == 0
+      || c_strcasecmp (encoding, "auto,locale") == 0)
+    return locale_charset ();
+
+  /* "Auto,NAME": skip the five-character "auto," prefix. */
+  if (c_strncasecmp (encoding, "auto,", 5) == 0)
+    return encoding + 5;
+
+  return encoding;
+}
+
+/* Returns true if ENCODING, which must be in one of the forms described at the
+   top of encoding-guesser.h, is one that performs encoding autodetection,
+   false otherwise.
*/ +bool +encoding_guess_encoding_is_auto (const char *encoding) +{ + return (encoding == NULL + || (!c_strncasecmp (encoding, "auto", 4) + && (encoding[4] == ',' || encoding[4] == '\0'))); +} + +static uint16_t +get_be16 (const uint8_t *data) +{ + return (data[0] << 8) | data[1]; +} + +static uint16_t +get_le16 (const uint8_t *data) +{ + return (data[1] << 8) | data[0]; +} + +static uint32_t +get_be32 (const uint8_t *data) +{ + return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3]; + +} + +static uint32_t +get_le32 (const uint8_t *data) +{ + return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0]; + +} + +static const char * +guess_utf16 (const uint8_t *data, size_t n) +{ + size_t even_nulls, odd_nulls; + + if (n < ENCODING_GUESS_MIN && n % 2 != 0) + return NULL; + + even_nulls = odd_nulls = 0; + while (n >= 2) + { + even_nulls += data[0] == 0; + odd_nulls += data[1] == 0; + if (data[0] == 0 && data[1] == 0) + return NULL; + + data += 2; + n -= 2; + } + + if (odd_nulls > even_nulls) + return "UTF-16LE"; + else if (even_nulls > 0) + return "UTF-16BE"; + else + return NULL; +} + +static bool +is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *)) +{ + if (n < ENCODING_GUESS_MIN && n % 4 != 0) + return false; + + while (n >= 4) + { + uint32_t uc = get_u32 (data); + + if (uc < 0x09 || uc > 0x10ffff) + return false; + + data += 4; + n -= 4; + } + + return true; +} + +/* Counts and returns the number of bytes, but no more than N, starting at S + that are ASCII text characters. 
*/ +size_t +encoding_guess_count_ascii (const void *s_, size_t n) +{ + const uint8_t *s = s_; + size_t ofs; + + for (ofs = 0; ofs < n; ofs++) + if (!encoding_guess_is_ascii_text (s[ofs])) + break; + return ofs; +} + +static bool +is_all_utf8_text (const void *s_, size_t n) +{ + const uint8_t *s = s_; + size_t ofs; + + ofs = 0; + while (ofs < n) + { + uint8_t c = s[ofs]; + if (c < 0x80) + { + if (!encoding_guess_is_ascii_text (c)) + return false; + ofs++; + } + else + { + ucs4_t uc; + int mblen; + + mblen = u8_mbtoucr (&uc, s + ofs, n - ofs); + if (mblen < 0) + return mblen == -2; + + ofs += mblen; + } + } + return true; +} + +/* Attempts to guess the encoding of a text file based on ENCODING, an encoding + name in one of the forms described at the top of encoding-guesser.h, and + DATA, which contains the first N bytes of the file. Returns the guessed + encoding, which might be ENCODING itself or a suffix of it or a statically + allocated string. + + Encoding autodetection only takes place if ENCODING actually specifies + autodetection. See encoding-guesser.h for details. + + UTF-8 cannot be distinguished from other ASCII-based encodings until a + non-ASCII text character is encountered. If ENCODING specifies + autodetection and this function returns "ASCII", then the client should + process the input until it encounters an non-ASCII character (as returned by + encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding() + to make a final encoding guess. See encoding-guesser.h for details. + + N must be at least ENCODING_GUESS_MIN, unless the file is shorter than + that. 
 */
+const char *
+encoding_guess_head_encoding (const char *encoding,
+                              const void *data_, size_t n)
+{
+  const uint8_t *data = data_;
+  const char *fallback_encoding;
+  const char *guess;
+
+  /* An encoding name that does not request autodetection is returned
+     without looking at DATA. */
+  fallback_encoding = encoding_guess_parse_encoding (encoding);
+  if (!encoding_guess_encoding_is_auto (encoding))
+    return fallback_encoding;
+
+  /* Nothing to examine. */
+  if (n == 0)
+    return fallback_encoding;
+
+  /* UTF-32 byte order mark (U+FEFF) in either byte order.  N is nonzero here,
+     so the length test guarantees that at least four bytes are available.
+     This must precede the UTF-16 test below because the little-endian UTF-32
+     mark (ff fe 00 00) begins with the little-endian UTF-16 mark (ff fe). */
+  if ((n >= ENCODING_GUESS_MIN || n % 4 == 0)
+      && (get_be32 (data) == 0xfeff || get_le32 (data) == 0xfeff))
+    return "UTF-32";
+
+  /* Four-byte signatures that identify GB-18030 and UTF-EBCDIC. */
+  if (n >= 4)
+    {
+      uint32_t x = get_be32 (data);
+      if (x == 0x84319533)
+        return "GB-18030";
+      else if (x == 0xdd736673)
+        return "UTF-EBCDIC";
+    }
+
+  /* UTF-16 byte order mark in either byte order.  As above, the length test
+     guarantees that at least two bytes are available. */
+  if ((n >= ENCODING_GUESS_MIN || n % 2 == 0)
+      && (get_be16 (data) == 0xfeff || get_le16 (data) == 0xfeff))
+    return "UTF-16";
+
+  /* UTF-8 encoding of U+FEFF (ef bb bf). */
+  if (n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf)
+    return "UTF-8";
+
+  /* No signature found, so fall back on heuristics: first the distribution
+     of zero bytes for UTF-16, then the range of 32-bit values for UTF-32. */
+  guess = guess_utf16 (data, n);
+  if (guess != NULL)
+    return guess;
+
+  if (is_utf32 (data, n, get_be32))
+    return "UTF-32BE";
+  if (is_utf32 (data, n, get_le32))
+    return "UTF-32LE";
+
+  /* UTF-8 is only considered when the fallback is ASCII-compatible, and only
+     while DATA still passes the UTF-8 text check. */
+  if (!is_encoding_ascii_compatible (fallback_encoding)
+      || !encoding_guess_tail_is_utf8 (data, n))
+    return fallback_encoding;
+
+  /* If the fallback is itself UTF-8 then there is nothing left to decide. */
+  if (!c_strcasecmp (fallback_encoding, "UTF-8")
+      || !c_strcasecmp (fallback_encoding, "UTF8"))
+    return "UTF-8";
+
+  /* Still undecided between UTF-8 and the fallback: the caller is expected to
+     finish the job with encoding_guess_tail_encoding(). */
+  return "ASCII";
+}
+
+/* Returns an encoding guess based on ENCODING and the N bytes of text starting
+   at DATA. DATA should start with the first non-ASCII text character (as
+   determined by encoding_guess_is_ascii_text()) found in the input.
+
+   The return value will either be "UTF-8" or the fallback encoding for
+   ENCODING.
+
+   See encoding-guesser.h for intended use of this function.
+
+   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
+   that starting with the first non-ASCII text character.
*/ +const char * +encoding_guess_tail_encoding (const char *encoding, + const void *data, size_t n) +{ + return (encoding_guess_tail_is_utf8 (data, n) + ? "UTF-8" + : encoding_guess_parse_encoding (encoding)); +} + +/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false + for the fallback encoding. */ +bool +encoding_guess_tail_is_utf8 (const void *data, size_t n) +{ + return (n < ENCODING_GUESS_MIN + ? u8_check (data, n) == NULL + : is_all_utf8_text (data, n)); +} + diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h new file mode 100644 index 00000000..2ec2fee2 --- /dev/null +++ b/src/libpspp/encoding-guesser.h @@ -0,0 +1,126 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef LIBPSPP_ENCODING_GUESSER_H +#define LIBPSPP_ENCODING_GUESSER_H 1 + +#include +#include +#include + +/* A library for autodetecting the encoding of a text file. + + Naming Encodings + ---------------- + + The encoding guesser starts with an encoding name in one of various + different forms. Some of the forms do not actually do any autodetection. + The encoding guesser will return the specified encoding without looking at + any file data: + + - A valid IANA or system encoding name: These are returned as-is. 
+ + - "Locale": Translated to the encoding used by the system locale, as + returned by locale_charset(). + + The remaining forms that do perform autodetection are: + + - "Auto," followed by a valid IANA or system encoding name (the "fallback + encoding"): Requests detection whether the input is encoded in UTF-8, + UTF-16, UTF-32, or a few other easily identifiable charsets. When a + particular character set cannot be recognized, the guesser falls back to + the encoding following the comma. UTF-8 detection works only for + ASCII-compatible character sets. + + - NULL or "Auto": As above, with the encoding used by the system locale as + the fallback encoding. + + The above are suggested capitalizations but encoding names are not + case-sensitive. + + The encoding_guess_parse_encoding() and encoding_guess_encoding_is_auto() + functions work with encoding names in these forms. + + Usage + ----- + + 1. Call encoding_guess_head_encoding() with several bytes from the start of + the text file. Feed in at least ENCODING_GUESS_MIN bytes, unless the + file is shorter than that, but as many more as are conveniently + available. ENCODING_GUESS_SUGGESTED is a reasonable amount. + + encoding_guess_head_encoding() returns its best guess at the file's + encoding. Ordinarily it returns a final guess that the client can use to + interpret the file, and you're all done. However, if it returns "ASCII" + and the original encoding name requests autodetection (which you can find + out by calling encoding_guess_encoding_is_auto()), then proceed to the + next step. + + 2. The encoding guesser is confident that the stream uses an ASCII + compatible encoding, either UTF-8 or the fallback encoding. The client + may safely read and process the stream up to the first non-ASCII + character. If the stream continues to be ASCII all the way to its end, + then we're done. 
+ + The encoding guesser provides a pair of functions to detect non-ASCII + characters: encoding_guess_is_ascii_text() for single characters and + encoding_guess_count_ascii() as a convenient wrapper for whole buffers. + + 3. Otherwise, the stream contains some non-ASCII data at some point. Now + the client should gather several bytes starting at this point, at least + ENCODING_GUESS_MIN, unless the file ends before that, but as many more as + are conveniently available. ENCODING_GUESS_SUGGESTED is a reasonable + amount. + + The client should pass these bytes to encoding_guess_tail_encoding(), + which returns a best and final guess at the file's encoding, which is + either UTF-8 or the fallback encoding. Another alternative is + encoding_guess_tail_is_utf8(), which guesses the same way but has a + different form of return value. +*/ + +/* Minimum number of bytes for use in autodetection. + You should only pass fewer bytes to the autodetection routines if the file + is actually shorter than this. */ +#define ENCODING_GUESS_MIN 16 + +/* Suggested minimum buffer size to use for autodetection. */ +#define ENCODING_GUESS_SUGGESTED 1024 + +/* Parsing encoding names. */ +const char *encoding_guess_parse_encoding (const char *encoding); +bool encoding_guess_encoding_is_auto (const char *encoding); + +/* Making an initial coding guess based on the start of a file. */ +const char *encoding_guess_head_encoding (const char *encoding, + const void *, size_t); + +/* Refining an initial ASCII coding guess using later non-ASCII bytes. */ +static inline bool encoding_guess_is_ascii_text (uint8_t c); +size_t encoding_guess_count_ascii (const void *, size_t); +bool encoding_guess_tail_is_utf8 (const void *, size_t); +const char *encoding_guess_tail_encoding (const char *encoding, + const void *, size_t); + +/* Returns true if C is a byte that might appear in an ASCII text file, + false otherwise. 
 */ +static inline bool +encoding_guess_is_ascii_text (uint8_t c) +{ + /* 0x20...0x7e are the printable ASCII characters; 0x09...0x0d are the control characters HT, LF, VT, FF, and CR. */ + return (c >= 0x20 && c < 0x7f) || (c >= 0x09 && c < 0x0e); +} + +#endif /* libpspp/encoding-guesser.h */ diff --git a/tests/automake.mk b/tests/automake.mk index 639af7e1..7ef7d423 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -7,6 +7,7 @@ check_PROGRAMS += \ tests/language/lexer/command-name-test \ tests/libpspp/abt-test \ tests/libpspp/bt-test \ + tests/libpspp/encoding-guesser-test \ tests/libpspp/heap-test \ tests/libpspp/hmap-test \ tests/libpspp/hmapx-test \ @@ -50,6 +51,10 @@ tests_libpspp_llx_test_SOURCES = \ tests_libpspp_llx_test_LDADD = gl/libgl.la $(LIBINTL) tests_libpspp_llx_test_CFLAGS = $(AM_CFLAGS) +tests_libpspp_encoding_guesser_test_SOURCES = \ + tests/libpspp/encoding-guesser-test.c +tests_libpspp_encoding_guesser_test_LDADD = src/libpspp/libpspp.la gl/libgl.la + tests_libpspp_heap_test_SOURCES = \ src/libpspp/heap.c \ src/libpspp/pool.c \ @@ -313,6 +318,7 @@ TESTSUITE_AT = \ tests/language/xforms/select-if.at \ tests/libpspp/abt.at \ tests/libpspp/bt.at \ + tests/libpspp/encoding-guesser.at \ tests/libpspp/float-format.at \ tests/libpspp/heap.at \ tests/libpspp/hmap.at \ diff --git a/tests/libpspp/encoding-guesser-test.c b/tests/libpspp/encoding-guesser-test.c new file mode 100644 index 00000000..a20607e1 --- /dev/null +++ b/tests/libpspp/encoding-guesser-test.c @@ -0,0 +1,102 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see . */
+
+#include
+
+#include "libpspp/encoding-guesser.h"
+
+#include
+#include
+#include
+#include
+
+#include "libpspp/i18n.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/xalloc.h"
+
+/* Prints a usage message on stdout and exits successfully. */
+static void
+usage (void)
+{
+  printf ("usage: %s [OTHER_ENCODING] [BUFSIZE] < INPUT\n"
+          "where OTHER_ENCODING is the fallback encoding (default taken\n"
+          " from the current locale)\n"
+          " and BUFSIZE is the buffer size (default %d)\n",
+          program_name, ENCODING_GUESS_MIN);
+  exit (0);
+}
+
+/* Test driver for the encoding guesser.
+
+   Reads text from stdin in chunks of BUFSIZE bytes, guesses its encoding the
+   way that the "Usage" section of encoding-guesser.h describes, and prints
+   the guess on stdout.
+
+   An argument that starts with a digit is taken as BUFSIZE and any other
+   argument as the encoding name; each may be given at most once. */
+int
+main (int argc, char *argv[])
+{
+  const char *encoding, *guess;
+  char *buffer;
+  int bufsize;
+  size_t n;
+  int i;
+
+  set_program_name (argv[0]);
+
+  i18n_init ();
+
+  encoding = NULL;
+  bufsize = 0;
+  for (i = 1; i < argc; i++)
+    {
+      const char *arg = argv[i];
+
+      /* The <ctype.h> functions are only defined for EOF and values
+         representable as unsigned char, so a plain char, which may be
+         negative, has to be converted before it is passed to isdigit(). */
+      if (!strcmp (arg, "--help"))
+        usage ();
+      else if (isdigit ((unsigned char) arg[0]) && bufsize == 0)
+        {
+          bufsize = atoi (arg);
+          if (bufsize < ENCODING_GUESS_MIN)
+            error (1, 0, "buffer size %s is less than minimum size %d",
+                   arg, ENCODING_GUESS_MIN);
+        }
+      else if (!isdigit ((unsigned char) arg[0]) && encoding == NULL)
+        encoding = arg;
+      else
+        error (1, 0, "bad syntax; use `%s --help' for help", program_name);
+    }
+
+  if (bufsize == 0)
+    bufsize = ENCODING_GUESS_MIN;
+
+  buffer = xmalloc (bufsize);
+
+  /* Step 1: guess from the head of the input. */
+  n = fread (buffer, 1, bufsize, stdin);
+  guess = encoding_guess_head_encoding (encoding, buffer, n);
+
+  /* Steps 2 and 3: an "ASCII" guess under autodetection is provisional.  Skip
+     forward to the first non-ASCII byte, if there is one, and let the tail
+     guesser decide between UTF-8 and the fallback encoding. */
+  if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
+    while (n > 0)
+      {
+        size_t n_ascii = encoding_guess_count_ascii (buffer, n);
+        if (n == n_ascii)
+          n = fread (buffer, 1, bufsize, stdin);
+        else
+          {
+            /* Move the non-ASCII tail to the front of the buffer and top the
+               buffer up so the tail guesser sees as many bytes as possible. */
+            memmove (buffer, buffer + n_ascii, n - n_ascii);
+            n -= n_ascii;
+            n += fread (buffer + n, 1, bufsize - n, stdin);
+
+            guess = encoding_guess_tail_encoding (encoding, buffer, n);
+            break;
+          }
+      }
+  puts (guess);
+
+  free (buffer);
+
+  return 0;
+}
diff --git a/tests/libpspp/encoding-guesser.at
b/tests/libpspp/encoding-guesser.at new file mode 100644 index 00000000..d63dc37e --- /dev/null +++ b/tests/libpspp/encoding-guesser.at @@ -0,0 +1,143 @@ +AT_BANNER([encoding guesser]) + +AT_SETUP([ASCII]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([echo string | encoding-guesser-test Auto,ISO-8859-1], [0], [ASCII +]) +AT_CLEANUP + +AT_SETUP([UTF-8]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings ISO-8859-1]) +AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' | encoding-guesser-test Auto,ISO-8859-1], [0], [UTF-8 +]) +AT_CLEANUP + +AT_SETUP([UTF-8 starting with ASCII]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings ISO-8859-1]) +AT_CHECK([printf 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\346\227\245\346\234\254\350\252\236\n' | encoding-guesser-test Auto,ISO-8859-1 32], [0], [UTF-8 +]) +AT_CLEANUP + +AT_SETUP([UTF-16 with big-endian byte order mark]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\376\377' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-16 +]) +AT_CLEANUP + +AT_SETUP([UTF-16 with little-endian byte order mark]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\377\376' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-16 +]) +AT_CLEANUP + +AT_SETUP([UTF-16BE]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-16BE +]) +AT_CLEANUP + +dnl Unicode U+XX00 characters are confusing in UTF-16 because they look +dnl likely to be of the opposite endianness, so this tests for proper handling. 
+AT_SETUP([UTF-16BE starting with U+0100]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\1\0\0e\0n\0t\0r\0\351\0e\0\n' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-16BE +]) +AT_CLEANUP + +AT_SETUP([UTF-16LE]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf 'e\0n\0t\0r\0\351\0e\0\n\0' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-16LE +]) +AT_CLEANUP + +dnl Unicode U+XX00 characters are confusing in UTF-16 because they look +dnl likely to be of the opposite endianness, so this tests for proper handling. +AT_SETUP([UTF-16LE starting with U+0100]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\0\1e\0n\0t\0r\0\351\0e\0\n\0' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-16LE +]) +AT_CLEANUP + +AT_SETUP([UTF-32 with big-endian byte order mark]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\0\0\376\377' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-32 +]) +AT_CLEANUP + +AT_SETUP([UTF-32 with little-endian byte order mark]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\377\376\0\0' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-32 +]) +AT_CLEANUP + +AT_SETUP([UTF-32BE]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf '\0\0\0e\0\0\0n\0\0\0t\0\0\0r\0\0\0\351\0\0\0e\0\0\0\n' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-32BE +]) +AT_CLEANUP + +AT_SETUP([UTF-32LE]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([printf 'e\0\0\0n\0\0\0t\0\0\0r\0\0\0\351\0\0\0e\0\0\0\n\0\0\0' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-32LE +]) +AT_CLEANUP + +AT_SETUP([ISO-8859-1]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings ISO-8859-1]) +AT_CHECK([printf 'entr\351e\n' | encoding-guesser-test Auto,ISO-8859-1], + [0], [ISO-8859-1 +]) +AT_CLEANUP + +AT_SETUP([GB-18030 with byte order mark]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings ISO-8859-1]) +AT_CHECK([printf '\204\061\225\063' | encoding-guesser-test Auto,ISO-8859-1], + [0], [GB-18030 +]) +AT_CLEANUP + 
+AT_SETUP([UTF-EBCDIC with byte order mark]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings ISO-8859-1]) +AT_CHECK([printf '\335\163\146\163' | encoding-guesser-test Auto,ISO-8859-1], + [0], [UTF-EBCDIC +]) +AT_CLEANUP + +AT_SETUP([EUC-JP as Auto,EUC-JP]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings EUC-JP]) +AT_CHECK([printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | encoding-guesser-test Auto,EUC-JP], + [0], [EUC-JP +]) +AT_CLEANUP + +AT_SETUP([EUC-JP starting with ASCII as Auto,EUC-JP]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings EUC-JP]) +AT_CHECK([printf 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | encoding-guesser-test Auto,EUC-JP 32], + [0], [EUC-JP +]) +AT_CLEANUP + +AT_SETUP([UTF-8 with character split across input buffers]) +AT_KEYWORDS([encoding guesser]) +AT_CHECK([supports_encodings ISO-8859-1]) +AT_CHECK([printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n' | encoding-guesser-test Auto,ISO-8859-1 32], + [0], [UTF-8 +]) +AT_CLEANUP -- 2.30.2