From d3e294c031bb767336435d2f0048994103fcd47a Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 19 Mar 2011 16:34:53 -0700 Subject: [PATCH] u8-istream: New library for reading a text file and recoding to UTF-8. This new library will be used in an upcoming commit. --- src/libpspp/automake.mk | 2 + src/libpspp/u8-istream.c | 475 ++++++++++++++++++++++++++++++++ src/libpspp/u8-istream.h | 45 +++ tests/automake.mk | 5 + tests/libpspp/u8-istream-test.c | 126 +++++++++ tests/libpspp/u8-istream.at | 142 ++++++++++ 6 files changed, 795 insertions(+) create mode 100644 src/libpspp/u8-istream.c create mode 100644 src/libpspp/u8-istream.h create mode 100644 tests/libpspp/u8-istream-test.c create mode 100644 tests/libpspp/u8-istream.at diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk index 5cf660ac..a7c92830 100644 --- a/src/libpspp/automake.mk +++ b/src/libpspp/automake.mk @@ -88,6 +88,8 @@ src_libpspp_libpspp_la_SOURCES = \ src/libpspp/temp-file.h \ src/libpspp/tower.c \ src/libpspp/tower.h \ + src/libpspp/u8-istream.c \ + src/libpspp/u8-istream.h \ src/libpspp/version.h \ src/libpspp/zip-writer.c \ src/libpspp/zip-writer.h diff --git a/src/libpspp/u8-istream.c b/src/libpspp/u8-istream.c new file mode 100644 index 00000000..6d4d7707 --- /dev/null +++ b/src/libpspp/u8-istream.c @@ -0,0 +1,475 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include "u8-istream.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <iconv.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <unistr.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/compiler.h"
+#include "libpspp/encoding-guesser.h"
+
+#include "gl/c-strcase.h"
+#include "gl/localcharset.h"
+#include "gl/minmax.h"
+
+/* How the encoding of the input is currently being handled. */
+enum u8_istream_state
+  {
+    S_AUTO,                     /* Stream encoding not yet known. */
+    S_UTF8,                     /* Stream encoding is known to be UTF-8. */
+    S_CONVERT                   /* Stream encoding is known but not UTF-8. */
+  };
+
+struct u8_istream
+  {
+    int fd;                     /* File descriptor that input is read from. */
+    iconv_t converter;          /* iconv descriptor, or (iconv_t) -1 if none. */
+    enum u8_istream_state state;
+
+    /* Input buffer. */
+    char *buffer;               /* U8_ISTREAM_BUFFER_SIZE bytes of storage. */
+    char *head;                 /* First unconsumed byte within BUFFER. */
+    size_t length;              /* Number of unconsumed bytes at HEAD. */
+
+    /* Pending output: a converted character that did not fit in the caller's
+       buffer, or a U+FFFD substitute for an invalid input byte.  It is
+       delivered before any further input is converted. */
+    char outbuf[4];
+    size_t outlen;              /* Number of bytes in use in OUTBUF. */
+  };
+
+static ssize_t fill_buffer (struct u8_istream *);
+
+/* Opens FILENAME, which is encoded in FROMCODE, for reading as an UTF-8
+   stream, passing FLAGS to the open() function.  Returns a new u8_istream if
+   successful, otherwise returns NULL and sets errno to an appropriate value.
+
+   FLAGS must not include O_CREAT, because no file mode is passed to open().
+
+   The accepted forms for FROMCODE are listed at the top of
+   encoding-guesser.h. */
+struct u8_istream *
+u8_istream_for_file (const char *fromcode, const char *filename, int flags)
+{
+  struct u8_istream *is;
+  int fd;
+
+  assert (!(flags & O_CREAT));
+
+  fd = open (filename, flags);
+  if (fd < 0)
+    return NULL;
+
+  is = u8_istream_for_fd (fromcode, fd);
+  if (is == NULL)
+    {
+      /* close() may change errno, so preserve the value that explains why
+         the stream could not be created. */
+      int save_errno = errno;
+      close (fd);
+      errno = save_errno;
+    }
+
+  return is;
+}
+
+/* Creates and returns a new u8_istream that reads its input from FD.  Returns
+   a new u8_istream if successful, otherwise returns NULL and sets errno to an
+   appropriate value.
+
+   The accepted forms for FROMCODE are listed at the top of
+   encoding-guesser.h.
*/
+struct u8_istream *
+u8_istream_for_fd (const char *fromcode, int fd)
+{
+  struct u8_istream *is;
+  const char *encoding;
+
+  is = malloc (sizeof *is);
+  if (is == NULL)
+    return NULL;
+
+  is->fd = fd;
+  is->converter = (iconv_t) -1;
+  is->buffer = malloc (U8_ISTREAM_BUFFER_SIZE);
+  if (is->buffer == NULL)
+    goto error;
+  is->head = is->buffer;
+  is->length = 0;
+  is->outlen = 0;
+
+  /* Read the head of the input so that its encoding can be guessed. */
+  if (fill_buffer (is) < 0)
+    goto error;
+
+  encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length);
+  if (!strcmp (encoding, "UTF-8"))
+    is->state = S_UTF8;
+  else
+    {
+      if (encoding_guess_encoding_is_auto (fromcode)
+          && !strcmp (encoding, "ASCII"))
+        {
+          /* Only ASCII so far, so the real encoding is still undecided.
+             read_auto() may later switch to S_CONVERT, so the converter must
+             be for the encoding parsed from FROMCODE (presumably the
+             fallback that FROMCODE names), not for "ASCII". */
+          is->state = S_AUTO;
+          encoding = encoding_guess_parse_encoding (fromcode);
+        }
+      else
+        is->state = S_CONVERT;
+
+      /* Convert from the encoding selected above.  Always converting from
+         encoding_guess_parse_encoding (fromcode) would throw away whatever
+         encoding_guess_head_encoding() detected in the head of the input. */
+      is->converter = iconv_open ("UTF-8", encoding);
+      if (is->converter == (iconv_t) -1)
+        goto error;
+    }
+
+  return is;
+
+error:
+  {
+    /* Don't let cleanup clobber the errno that describes the failure. */
+    int save_errno = errno;
+    u8_istream_free (is);
+    errno = save_errno;
+  }
+  return NULL;
+}
+
+/* Closes IS and its underlying file descriptor and frees all associated
+   resources.  Returns the return value from close(), or 0 if IS is null. */
+int
+u8_istream_close (struct u8_istream *is)
+{
+  if (is != NULL)
+    {
+      /* Fetch the descriptor first: IS is gone after u8_istream_free(). */
+      int fd = is->fd;
+      u8_istream_free (is);
+      return close (fd);
+    }
+  return 0;
+}
+
+/* Frees IS and associated resources, but does not close the underlying file
+   descriptor.  (Thus, the client must close the file descriptor when it is no
+   longer needed.)  Does nothing if IS is null. */
+void
+u8_istream_free (struct u8_istream *is)
+{
+  if (is != NULL)
+    {
+      if (is->converter != (iconv_t) -1)
+        iconv_close (is->converter);
+      free (is->buffer);
+      free (is);
+    }
+}
+
+/* Drops one byte from the head of IS's input buffer and queues the UTF-8
+   encoding of U+FFFD REPLACEMENT CHARACTER in IS's pending output in its
+   place.  The pending output must be empty on entry. */
+static void
+substitute_invalid_input_byte (struct u8_istream *is)
+{
+  assert (is->outlen == 0);
+  is->head++;
+  is->length--;
+  is->outlen = u8_uctomb (CHAR_CAST (uint8_t *, is->outbuf),
+                          0xfffd, sizeof is->outbuf);
+}
+
+/* Moves the unconsumed input in IS to the start of its buffer and then tries
+   to read() enough from the file descriptor to fill the rest of the buffer.
+   Returns the value returned by read(): the number of bytes added, 0 at end
+   of file (or if the buffer was already full), or -1 on error. */
+static ssize_t
+fill_buffer (struct u8_istream *is)
+{
+  ssize_t n;
+
+  /* Move any unused bytes to the beginning of the input buffer. */
+  if (is->length > 0 && is->buffer != is->head)
+    memmove (is->buffer, is->head, is->length);
+  is->head = is->buffer;
+
+  /* Read more input. */
+  n = read (is->fd, is->buffer + is->length,
+            U8_ISTREAM_BUFFER_SIZE - is->length);
+  if (n > 0)
+    is->length += n;
+  return n;
+}
+
+/* Implements u8_istream_read() while IS is in S_AUTO state.
+
+   Bytes that encoding_guess_count_ascii() accepts are copied to BUFFER
+   unchanged.  At the first byte that it does not accept, the state changes to
+   S_UTF8 or S_CONVERT depending on encoding_guess_tail_is_utf8(), and the
+   read continues in the new state if nothing has been produced yet.
+
+   Returns the number of bytes stored in BUFFER, or, if none were stored, 0 at
+   end of file or -1 on a read error. */
+static ssize_t
+read_auto (struct u8_istream *is, char *buffer, size_t size)
+{
+  size_t original_size = size;
+  ssize_t retval = 0;
+
+  while (size > 0)
+    {
+      if (is->length > 0)
+        {
+          size_t n_ascii;
+
+          n_ascii = encoding_guess_count_ascii (is->head,
+                                                MIN (is->length, size));
+
+          memcpy (buffer, is->head, n_ascii);
+          buffer += n_ascii;
+          size -= n_ascii;
+
+          is->head += n_ascii;
+          is->length -= n_ascii;
+
+          if (size == 0)
+            break;
+        }
+
+      if (is->length == 0)
+        {
+          retval = fill_buffer (is);
+          if (retval > 0)
+            continue;
+          else
+            break;
+        }
+
+      /* is->head points to a byte that isn't a printable ASCII character.
+         Fill up the buffer and check for UTF-8.
+
+         The result of this fill_buffer() is deliberately ignored: the guess
+         is made from whatever input is available, and a read error
+         presumably recurs on the next read. */
+      fill_buffer (is);
+      is->state = (encoding_guess_tail_is_utf8 (is->head, is->length)
+                   ? S_UTF8 : S_CONVERT);
+      if (size == original_size)
+        return u8_istream_read (is, buffer, size);
+      break;
+    }
+
+  /* Report end of file or a read error only if no data was produced;
+     otherwise deliver the data first. */
+  return original_size > size ? original_size - size : retval;
+}
+
+/* Conversion callback for read_convert() that uses iconv().  Returns 0 if
+   iconv() succeeded, otherwise the errno value that it set. */
+static int
+convert_iconv (iconv_t converter,
+               char **inbufp, size_t *inbytesleft,
+               char **outbufp, size_t *outbytesleft)
+{
+  size_t n = iconv (converter, inbufp, inbytesleft, outbufp, outbytesleft);
+  return n == SIZE_MAX ? errno : 0;
+}
+
+/* Conversion callback for read_convert() for input that is already UTF-8.
+   CONVERTER is unused.
+
+   Copies as many whole, valid characters as fit from *INBUFP to *OUTBUFP and
+   advances the pointers and counts past them, as iconv() would.  Returns 0 if
+   all of the input was consumed, E2BIG if the output space ran out, EILSEQ
+   at an invalid sequence, or EINVAL at an incomplete sequence at the end of
+   the input. */
+static int
+convert_utf8 (iconv_t converter UNUSED,
+              char **inbufp, size_t *inbytesleft,
+              char **outbufp, size_t *outbytesleft)
+{
+  const uint8_t *in = CHAR_CAST (const uint8_t *, *inbufp);
+  size_t n = MIN (*inbytesleft, *outbytesleft);
+  size_t ofs = 0;
+  int error;
+
+  for (;;)
+    {
+      ucs4_t uc;
+      int mblen;
+
+      if (ofs >= n)
+        {
+          error = ofs < *inbytesleft ? E2BIG : 0;
+          break;
+        }
+
+      /* This decode is limited to N bytes, so U+FFFD here may only mean that
+         the character was cut off by the output space limit.  Decode again
+         against all of the remaining input to find out what really
+         happened. */
+      mblen = u8_mbtouc (&uc, in + ofs, n - ofs);
+      if (uc == 0xfffd)
+        {
+          int retval = u8_mbtoucr (&uc, in + ofs, *inbytesleft - ofs);
+          if (retval == mblen)
+            {
+              /* There's an actual U+FFFD in the input stream.  Carry on. */
+            }
+          else
+            {
+              error = (retval == -1 ? EILSEQ
+                       : retval == -2 ? EINVAL
+                       : E2BIG);
+              break;
+            }
+        }
+
+      ofs += mblen;
+    }
+
+  if (ofs > 0)
+    {
+      memcpy (*outbufp, *inbufp, ofs);
+      *inbufp += ofs;
+      *inbytesleft -= ofs;
+      *outbufp += ofs;
+      *outbytesleft -= ofs;
+    }
+
+  return error;
+}
+
+/* Implements u8_istream_read() for the S_UTF8 and S_CONVERT states, using
+   CONVERT (convert_utf8() or convert_iconv()) to transform buffered input
+   into UTF-8 output.
+
+   Returns the number of bytes stored in BUFFER, or, if none were stored, 0 at
+   end of file or -1 on error. */
+static ssize_t
+read_convert (struct u8_istream *is,
+              int (*convert) (iconv_t converter,
+                              char **inbufp, size_t *inbytesleft,
+                              char **outbufp, size_t *outbytesleft),
+              char *buffer, size_t size)
+{
+  size_t original_size = size;
+
+  while (size > 0)
+    {
+      ssize_t n_read;
+
+      /* Deliver pending output before converting anything else. */
+      if (is->outlen > 0)
+        {
+          size_t n = MIN (size, is->outlen);
+
+          memcpy (buffer, is->outbuf, n);
+          is->outlen -= n;
+          if (is->outlen > 0)
+            memmove (is->outbuf, is->outbuf + n, is->outlen);
+
+          buffer += n;
+          size -= n;
+
+          if (size == 0)
+            break;
+        }
+
+      if (is->length)
+        {
+          int error = convert (is->converter,
+                               &is->head, &is->length,
+                               &buffer, &size);
+          if (size == 0)
+            break;
+
+          switch (error)
+            {
+            case 0:
+              /* Converted all of the input into output, possibly with space
+                 for output left over.
+
+                 Read more input. */
+              break;
+
+            case EILSEQ:
+              substitute_invalid_input_byte (is);
+              continue;
+
+            case EINVAL:
+              /* Incomplete byte sequence at end of input.  Read more
+                 input. */
+              break;
+
+            default:
+              /* A real error of some kind (ENOMEM?). */
+              return -1;
+
+            case E2BIG:
+              /* Ran out of room for output.
+                 Convert into outbuf and copy from there instead. */
+              {
+                char *outptr = is->outbuf;
+                size_t outleft = sizeof is->outbuf;
+
+                error = convert (is->converter,
+                                 &is->head, &is->length,
+                                 &outptr, &outleft);
+                is->outlen = outptr - is->outbuf;
+                if (is->outlen > 0)
+                  continue;
+
+                switch (error)
+                  {
+                  case EILSEQ:
+                    substitute_invalid_input_byte (is);
+                    continue;
+
+                  case E2BIG:
+                  case EINVAL:
+                    continue;
+
+                  default:
+                    /* A real error of some kind (ENOMEM?). */
+                    return -1;
+                  }
+              }
+            }
+        }
+
+      /* At this point any input left over is an incomplete sequence. */
+      assert (is->length <= MB_LEN_MAX);
+      n_read = fill_buffer (is);
+      if (n_read <= 0)
+        {
+          if (original_size != size)
+            {
+              /* We produced some output so don't report EOF or error yet. */
+              break;
+            }
+          else if (n_read == 0 && is->length != 0)
+            {
+              /* Incomplete byte sequence at end of file. */
+              substitute_invalid_input_byte (is);
+            }
+          else
+            {
+              /* Propagate end-of-file or error to caller. */
+              return n_read;
+            }
+        }
+    }
+
+  return original_size - size;
+}
+
+/* Reads up to SIZE bytes of UTF-8 text from IS into BUFFER.  Returns the
+   number of bytes read if successful, 0 at end of file, or -1 if an error
+   occurred before any data could be read.  Upon error, sets errno to an
+   appropriate value. */
+ssize_t
+u8_istream_read (struct u8_istream *is, char *buffer, size_t size)
+{
+  switch (is->state)
+    {
+    case S_CONVERT:
+      return read_convert (is, convert_iconv, buffer, size);
+
+    case S_AUTO:
+      return read_auto (is, buffer, size);
+
+    case S_UTF8:
+      return read_convert (is, convert_utf8, buffer, size);
+    }
+
+  NOT_REACHED ();
+}
+
+/* Returns the file descriptor underlying IS. */
+int
+u8_istream_fileno (const struct u8_istream *is)
+{
+  return is->fd;
+}
+
+/* Test functions.
+
+   These functions are probably useful only for white-box testing. */
+
+/* Returns true if the encoding of the file being read by IS is not yet
+   known. */
+bool
+u8_istream_is_auto (const struct u8_istream *is)
+{
+  return is->state == S_AUTO;
+}
+
+/* Returns true if the encoding of the file being read by IS has been
+   determined to be UTF-8.
*/
+bool
+u8_istream_is_utf8 (const struct u8_istream *is)
+{
+  return is->state == S_UTF8;
+}
diff --git a/src/libpspp/u8-istream.h b/src/libpspp/u8-istream.h
new file mode 100644
index 00000000..3e2acee4
--- /dev/null
+++ b/src/libpspp/u8-istream.h
@@ -0,0 +1,45 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef LIBPSPP_U8_ISTREAM_H
+#define LIBPSPP_U8_ISTREAM_H 1
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+/* u8_istream.
+
+   Reads a text file and reencodes its contents into UTF-8, with optional
+   automatic encoding detection.
+*/
+
+#define U8_ISTREAM_BUFFER_SIZE 4096     /* Input buffer size, in bytes. */
+
+struct u8_istream *u8_istream_for_fd (const char *fromcode, int fd);
+struct u8_istream *u8_istream_for_file (const char *fromcode,
+                                        const char *filename, int flags);
+
+int u8_istream_close (struct u8_istream *is);
+void u8_istream_free (struct u8_istream *is);
+
+ssize_t u8_istream_read (struct u8_istream *is, char *buffer, size_t size);
+
+int u8_istream_fileno (const struct u8_istream *is);
+
+bool u8_istream_is_auto (const struct u8_istream *is);
+bool u8_istream_is_utf8 (const struct u8_istream *is);
+
+#endif /* libpspp/u8-istream.h */
diff --git a/tests/automake.mk b/tests/automake.mk index 7ef7d423..0b4a825c 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -24,6 +24,7 @@ check_PROGRAMS += \ tests/libpspp/string-set-test \ tests/libpspp/stringi-set-test \ tests/libpspp/tower-test \ + tests/libpspp/u8-istream-test \ tests/output/render-test check-programs: $(check_PROGRAMS) @@ -161,6 +162,9 @@ tests_libpspp_tower_test_SOURCES = \ tests_libpspp_tower_test_LDADD = gl/libgl.la $(LIBINTL) tests_libpspp_tower_test_CPPFLAGS = $(AM_CPPFLAGS) -DASSERT_LEVEL=10 +tests_libpspp_u8_istream_test_SOURCES = tests/libpspp/u8-istream-test.c +tests_libpspp_u8_istream_test_LDADD = src/libpspp/libpspp.la gl/libgl.la + tests_libpspp_sparse_array_test_SOURCES = \ src/libpspp/sparse-array.c \ src/libpspp/pool.c \ @@ -336,6 +340,7 @@ TESTSUITE_AT = \ tests/libpspp/string-set.at \ tests/libpspp/stringi-set.at \ tests/libpspp/tower.at \ + tests/libpspp/u8-istream.at \ tests/math/moments.at \ tests/math/randist.at \ tests/output/charts.at \ diff --git a/tests/libpspp/u8-istream-test.c b/tests/libpspp/u8-istream-test.c new file mode 100644 index 00000000..ab1b717e --- /dev/null +++ b/tests/libpspp/u8-istream-test.c @@ -0,0 +1,126 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. 
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include "libpspp/u8-istream.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libpspp/i18n.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/xalloc.h"
+
+/* Prints a usage message on stdout and exits successfully. */
+static void
+usage (void)
+{
+  printf ("usage: %s COMMAND [ARG]...\n"
+          "The available commands are:\n"
+          " help\n"
+          " print this usage message\n"
+          " buffer-size\n"
+          " print the buffer size, in bytes, on stdout\n"
+          " read FILE ENCODING [OUTBUF]\n"
+          " read FILE encoded in ENCODING (with output buffer size\n"
+          " OUTBUF) and print it on stdout in UTF-8\n",
+          program_name);
+  exit (0);
+}
+
+/* Parses ARG as an output buffer size in bytes and returns it.  Exits with
+   an error message unless ARG is a positive decimal integer.
+
+   (atoi() would silently turn junk into 0, which makes u8_istream_read()
+   return 0 immediately so that the input looks empty, and would accept a
+   negative size.) */
+static size_t
+parse_outbufsize (const char *arg)
+{
+  char *tail;
+  long int value;
+
+  errno = 0;
+  value = strtol (arg, &tail, 10);
+  if (tail == arg || *tail != '\0' || errno == ERANGE || value <= 0)
+    error (1, 0, "invalid output buffer size `%s'; use `%s help' for help",
+           arg, program_name);
+  return value;
+}
+
+/* Implements the "read" command: reads the file named by argv[2] ("-" for
+   stdin), whose encoding is argv[3] (an empty string is passed on as a null
+   encoding), in chunks of argv[4] bytes (default 4096), and copies the UTF-8
+   result to stdout.  Before and after the data, prints "Auto mode" or
+   "UTF-8 mode" if the stream is in the corresponding state. */
+static void
+cmd_read (int argc, char *argv[])
+{
+  struct u8_istream *is;
+  const char *encoding;
+  const char *filename;
+  size_t outbufsize;
+  char *buffer;
+
+  if (argc < 4 || argc > 5)
+    error (1, 0, "bad syntax for `%s' command; use `%s help' for help",
+           argv[1], program_name);
+
+  outbufsize = argc > 4 ? parse_outbufsize (argv[4]) : 4096;
+  buffer = xmalloc (outbufsize);
+
+  filename = argv[2];
+  encoding = *argv[3] ? argv[3] : NULL;
+
+  is = (!strcmp(filename, "-")
+        ? u8_istream_for_fd (encoding, STDIN_FILENO)
+        : u8_istream_for_file (encoding, filename, O_RDONLY));
+  if (is == NULL)
+    error (1, errno, "u8_istream_open failed");
+
+  if (u8_istream_is_auto (is))
+    printf ("Auto mode\n");
+  else if (u8_istream_is_utf8 (is))
+    printf ("UTF-8 mode\n");
+
+  for (;;)
+    {
+      ssize_t n;
+
+      n = u8_istream_read (is, buffer, outbufsize);
+      if (n > 0)
+        {
+          if (fwrite (buffer, 1, n, stdout) != (size_t) n)
+            error (1, errno, "write to stdout failed");
+        }
+      else if (n < 0)
+        error (1, errno, "u8_istream_read failed");
+      else
+        break;
+    }
+
+  if (u8_istream_is_auto (is))
+    printf ("Auto mode\n");
+  else if (u8_istream_is_utf8 (is))
+    printf ("UTF-8 mode\n");
+
+  /* stdin was not opened by the stream, so don't close it. */
+  if (!strcmp(filename, "-"))
+    u8_istream_free (is);
+  else
+    {
+      if (u8_istream_close (is) != 0)
+        error (1, errno, "u8_istream_close failed");
+    }
+
+  free (buffer);
+}
+
+/* Dispatches on the command named by argv[1]; see usage(). */
+int
+main (int argc, char *argv[])
+{
+  set_program_name (argv[0]);
+  i18n_init ();
+
+  if (argc < 2)
+    error (1, 0, "missing command name; use `%s help' for help", program_name);
+  else if (!strcmp(argv[1], "help") || !strcmp(argv[1], "--help"))
+    usage ();
+  else if (!strcmp(argv[1], "buffer-size"))
+    printf ("%d\n", U8_ISTREAM_BUFFER_SIZE);
+  else if (!strcmp(argv[1], "read"))
+    cmd_read (argc, argv);
+  else
+    error (1, 0, "unknown command `%s'; use `%s help' for help",
+           argv[1], program_name);
+
+  return 0;
+}
diff --git a/tests/libpspp/u8-istream.at b/tests/libpspp/u8-istream.at new file mode 100644 index 00000000..2d8baa47 --- /dev/null +++ b/tests/libpspp/u8-istream.at @@ -0,0 +1,142 @@ +AT_BANNER([u8_istream]) + +AT_SETUP([read ASCII]) +AT_KEYWORDS([u8_istream]) +AT_CHECK([supports_encodings ASCII]) +AT_CHECK([echo string | u8-istream-test read - ASCII], [0], [string +]) +AT_CLEANUP + +AT_SETUP([read UTF-8]) +AT_KEYWORDS([u8_istream]) +AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' | u8-istream-test read - UTF-8], [0], [dnl +UTF-8 mode +日本語 +UTF-8 mode +]) +AT_CLEANUP + +AT_SETUP([read EUC-JP]) +AT_KEYWORDS([u8_istream]) +AT_CHECK([supports_encodings EUC-JP]) +AT_CHECK([printf 
'\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | u8-istream-test read - EUC-JP],
+  [0],
+  [ぁ あ ぃ い ぅ う ぇ え ぉ お
+])
+AT_CLEANUP
+
+dnl The input is (buffer size - 16) ASCII bytes followed by ten 3-byte UTF-8
+dnl characters, presumably so that one character straddles the end of the
+dnl first full input buffer.
+dnl
+dnl NOTE(review): the commands here are identical to those of "read UTF-8
+dnl with character split across input and output buffers" (both pass a
+dnl 16-byte output buffer); confirm whether this test was meant to omit the
+dnl OUTBUF argument.
+AT_SETUP([read UTF-8 with character split across input buffers])
+AT_KEYWORDS([u8_istream])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n') > input
+(echo "UTF-8 mode"
+ cat input
+ echo "UTF-8 mode") > expout
+AT_CHECK([u8-istream-test read input UTF-8 16], [0], [expout])
+AT_CLEANUP
+
+dnl Ten 3-byte characters read through a 16-byte output buffer, so a
+dnl character cannot fit whole at the end of the first output chunk.
+AT_SETUP([read UTF-8 with character split across output buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n' | u8-istream-test read - UTF-8 16], [0], [dnl
+UTF-8 mode
+ぁあぃいぅうぇえぉお
+UTF-8 mode
+])
+AT_CLEANUP
+
+dnl Combines the padded input file with a 16-byte output buffer.
+AT_SETUP([read UTF-8 with character split across input and output buffers])
+AT_KEYWORDS([u8_istream])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n') > input
+(echo "UTF-8 mode"
+ cat input
+ echo "UTF-8 mode") > expout
+AT_CHECK([u8-istream-test read input UTF-8 16], [0], [expout])
+AT_CLEANUP
+
+dnl Same padding as the UTF-8 case, but the tail is EUC-JP (2-byte characters
+dnl separated by spaces) and the default output buffer size is used.  The
+dnl expected output is the same text in UTF-8; no mode lines are expected.
+AT_SETUP([read EUC-JP with character split across input buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 '
+ printf '\244\250 \244\251 \244\252\n') > input
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\343\201\201\040\343\201\202\040\343\201\203\040\343\201\204\040'
+ printf '\343\201\205\040\343\201\206\040\343\201\207\040\343\201\210\040'
+ printf '\343\201\211\040\343\201\212\n') > expout
+AT_CHECK([u8-istream-test read input EUC-JP], [0], [expout])
+AT_CLEANUP
+
+dnl EUC-JP input converted through a 16-byte output buffer.
+AT_SETUP([read EUC-JP with character split across output buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf '\244\241\244\242\244\243\244\244\244\245\244\246\244\247\244\250\244\251\244\252\n' | u8-istream-test read - EUC-JP 16],
+  [0],
+  [ぁあぃいぅうぇえぉお
+])
+AT_CLEANUP
+
+dnl Padded EUC-JP input file read through a 16-byte output buffer.
+AT_SETUP([read EUC-JP with character split across input and output buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf 'xyz\244\241\244\242\244\243\244\244\244\245\244\246\244\247\244\250'
+ printf '\244\251\244\252\n') > input
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\170\171\172\343\201\201\343\201\202\343\201\203\343\201\204\343'
+ printf '\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201'
+ printf '\212\n') > expout
+AT_CHECK([u8-istream-test read input EUC-JP 16], [0], [expout])
+AT_CLEANUP
+
+dnl Pure ASCII input: "Auto mode" is expected both before and after the data.
+AT_SETUP([read ASCII as Auto])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([echo string | u8-istream-test read - Auto], [0], [dnl
+Auto mode
+string
+Auto mode
+])
+AT_CLEANUP
+
+dnl Starts in Auto mode and is expected to end in UTF-8 mode after reading
+dnl the non-ASCII character.
+AT_SETUP([read UTF-8 as Auto])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([printf 'entr\303\251e\n' | u8-istream-test read - Auto], [0], [dnl
+Auto mode
+entrée
+UTF-8 mode
+])
+AT_CLEANUP
+
+dnl 2.5 buffers of ASCII precede the first non-ASCII byte.  Only the leading
+dnl "Auto mode" line is expected, because the test program prints no mode line
+dnl for a stream that is neither in Auto nor in UTF-8 mode.
+AT_SETUP([read ISO-8859-1 as Auto,ISO-8859-1])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings ISO-8859-1])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x int($buffer_size * 2.5)"; printf 'entr\351e\n') > input
+(echo "Auto mode"
+ $PERL -e "print 'x' x int($buffer_size * 2.5)"
+ printf 'entr\303\251e\n') > expout
+AT_CHECK([u8-istream-test read input Auto,ISO-8859-1], [0], [expout])
+AT_CLEANUP
+
+dnl UTF-16BE is not ASCII compatible so this doesn't start out in Auto mode.
+AT_SETUP([read UTF-16BE as Auto,UTF-16BE])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings UTF-16BE])
+AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | u8-istream-test read - Auto,UTF-16BE],
+  [0], [dnl
+entrée
+])
+AT_CLEANUP
+
+dnl The head of the input already contains non-ASCII bytes, and no mode line
+dnl is expected in the output.
+AT_SETUP([read EUC-JP as Auto,EUC-JP])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf 'entr\217\253\261e\n' | u8-istream-test read - Auto,EUC-JP],
+  [0], [entrée
+])
+AT_CLEANUP
+
-- 
2.30.2