u8-istream: New library for reading a text file and recoding to UTF-8.

author Ben Pfaff <blp@cs.stanford.edu>

Sat, 19 Mar 2011 23:34:53 +0000 (16:34 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 20 Mar 2011 16:43:44 +0000 (09:43 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 19 Mar 2011 23:34:53 +0000 (16:34 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 20 Mar 2011 16:43:44 +0000 (09:43 -0700)
diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk

index 5cf660ace5f417cbfdd551265e62d5630a137e07..a7c92830ff710013fa2f19c30da87742980ed3ac 100644 (file)
--- a/src/libpspp/automake.mk
+++ b/src/libpspp/automake.mk
@@ -88,6 +88,8 @@ src_libpspp_libpspp_la_SOURCES = \
         src/libpspp/temp-file.h \
         src/libpspp/tower.c \
         src/libpspp/tower.h \
+       src/libpspp/u8-istream.c \
+       src/libpspp/u8-istream.h \
         src/libpspp/version.h \
         src/libpspp/zip-writer.c \
         src/libpspp/zip-writer.h
diff --git a/src/libpspp/u8-istream.c b/src/libpspp/u8-istream.c

new file mode 100644 (file)

index 0000000..6d4d770
--- /dev/null
+++ b/src/libpspp/u8-istream.c
@@ -0,0 +1,475 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "u8-istream.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <iconv.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <unistr.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/compiler.h"
+#include "libpspp/encoding-guesser.h"
+
+#include "gl/c-strcase.h"
+#include "gl/localcharset.h"
+#include "gl/minmax.h"
+
+/* How a u8_istream currently interprets its input.  The state selects the
+   reader that u8_istream_read() dispatches to. */
+enum u8_istream_state
+  {
+    S_AUTO,                     /* Stream encoding not yet known. */
+    S_UTF8,                     /* Stream encoding is known to be UTF-8. */
+    S_CONVERT                   /* Stream encoding is known but not UTF-8. */
+  };
+
+/* An input stream that reads bytes from a file descriptor and hands them to
+   the client as UTF-8. */
+struct u8_istream
+  {
+    int fd;                     /* File descriptor that input is read from. */
+    iconv_t converter;          /* iconv descriptor, or (iconv_t) -1 if none. */
+    enum u8_istream_state state; /* Selects the reader used for this stream. */
+
+    char *buffer;               /* Input buffer, U8_ISTREAM_BUFFER_SIZE bytes. */
+    char *head;                 /* First unconsumed input byte in 'buffer'. */
+    size_t length;              /* Number of unconsumed bytes at 'head'. */
+
+    char outbuf[4];             /* Output that did not fit the client buffer. */
+    size_t outlen;              /* Number of bytes pending in 'outbuf'. */
+  };
+
+static ssize_t fill_buffer (struct u8_istream *);
+
+/* Opens FILENAME with open(), passing FLAGS (which must not include O_CREAT),
+   and wraps the resulting descriptor in a new u8_istream that recodes the
+   file's contents from FROMCODE to UTF-8.  Returns the new stream on success.
+   On failure, returns NULL with errno set to an appropriate value and leaves
+   no file descriptor open.
+
+   The accepted forms for FROMCODE are listed at the top of
+   encoding-guesser.h. */
+struct u8_istream *
+u8_istream_for_file (const char *fromcode, const char *filename, int flags)
+{
+  struct u8_istream *stream;
+  int descriptor;
+  int saved_errno;
+
+  assert (!(flags & O_CREAT));
+
+  descriptor = open (filename, flags);
+  if (descriptor < 0)
+    return NULL;
+
+  stream = u8_istream_for_fd (fromcode, descriptor);
+  if (stream != NULL)
+    return stream;
+
+  /* Stream creation failed.  Close the descriptor we opened, taking care
+     that close() does not overwrite the errno that describes the failure. */
+  saved_errno = errno;
+  close (descriptor);
+  errno = saved_errno;
+
+  return NULL;
+}
+
+/* Creates and returns a new u8_istream that reads its input from FD.  Returns
+   a new u8_istream if successful, otherwise returns NULL and sets errno to an
+   appropriate value.  On failure FD is left open, so that closing it remains
+   the caller's responsibility.
+
+   The first bufferful of input is read immediately so that the encoding can
+   be guessed from the head of the file.
+
+   The accepted forms for FROMCODE are listed at the top of
+   encoding-guesser.h. */
+struct u8_istream *
+u8_istream_for_fd (const char *fromcode, int fd)
+{
+  struct u8_istream *is;
+  const char *encoding;
+  int save_errno;
+
+  is = malloc (sizeof *is);
+  if (is == NULL)
+    return NULL;
+
+  is->fd = fd;
+  is->converter = (iconv_t) -1;
+  is->buffer = malloc (U8_ISTREAM_BUFFER_SIZE);
+  if (is->buffer == NULL)
+    goto error;
+  is->head = is->buffer;
+  is->length = 0;
+  is->outlen = 0;
+
+  if (fill_buffer (is) < 0)
+    goto error;
+
+  encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length);
+  if (!strcmp (encoding, "UTF-8"))
+    is->state = S_UTF8;
+  else
+    {
+      if (encoding_guess_encoding_is_auto (fromcode)
+          && !strcmp (encoding, "ASCII"))
+        {
+          /* Only ASCII so far, so the real encoding is still undecided.
+             Prepare a converter for the fallback encoding named by FROMCODE,
+             in case later input turns out not to be UTF-8. */
+          is->state = S_AUTO;
+          encoding = encoding_guess_parse_encoding (fromcode);
+        }
+      else
+        {
+          /* The head of the file settled the encoding.  Convert from the
+             encoding that was actually detected, which need not be the
+             fallback named by FROMCODE. */
+          is->state = S_CONVERT;
+        }
+
+      is->converter = iconv_open ("UTF-8", encoding);
+      if (is->converter == (iconv_t) -1)
+        goto error;
+    }
+
+  return is;
+
+error:
+  /* Cleanup may call functions that modify errno, so preserve the value that
+     describes the actual failure. */
+  save_errno = errno;
+  u8_istream_free (is);
+  errno = save_errno;
+  return NULL;
+}
+
+/* Destroys IS, releasing all of its resources, and then closes the file
+   descriptor that it was reading.  Returns the value returned by close().
+   Does nothing and returns 0 if IS is null. */
+int
+u8_istream_close (struct u8_istream *is)
+{
+  int fd;
+
+  if (is == NULL)
+    return 0;
+
+  /* Fetch the descriptor before IS is freed. */
+  fd = is->fd;
+  u8_istream_free (is);
+
+  return close (fd);
+}
+
+/* Destroys IS: closes its iconv converter, if it has one, and frees its input
+   buffer and the stream itself.  The underlying file descriptor is NOT
+   closed, so the client remains responsible for closing it.  Does nothing if
+   IS is null. */
+void
+u8_istream_free (struct u8_istream *is)
+{
+  if (is == NULL)
+    return;
+
+  if (is->converter != (iconv_t) -1)
+    iconv_close (is->converter);
+
+  free (is->buffer);
+  free (is);
+}
+
+/* Discards the input byte at IS's head and queues U+FFFD REPLACEMENT CHARACTER,
+   encoded in UTF-8, in IS's pending-output buffer in its place.  The
+   pending-output buffer must be empty on entry. */
+static void
+substitute_invalid_input_byte (struct u8_istream *is)
+{
+  assert (is->outlen == 0);
+
+  /* Skip the offending input byte. */
+  is->head += 1;
+  is->length -= 1;
+
+  /* Emit the replacement character instead. */
+  is->outlen = u8_uctomb (CHAR_CAST (uint8_t *, is->outbuf), 0xfffd,
+                          sizeof is->outbuf);
+}
+
+/* Tops up IS's input buffer from its file descriptor.  Unconsumed bytes are
+   first shifted to the start of the buffer, then a single read() call tries
+   to fill the remaining space.  Returns the value returned by read(): the
+   number of bytes added, 0 at end of file (or if the buffer was already
+   full), or a negative value on error. */
+static ssize_t
+fill_buffer (struct u8_istream *is)
+{
+  ssize_t n_read;
+
+  /* Compact: slide leftover input down to the start of the buffer. */
+  if (is->head != is->buffer && is->length > 0)
+    memmove (is->buffer, is->head, is->length);
+  is->head = is->buffer;
+
+  /* Append new input after the leftover bytes. */
+  n_read = read (is->fd, is->head + is->length,
+                 U8_ISTREAM_BUFFER_SIZE - is->length);
+  if (n_read > 0)
+    is->length += n_read;
+
+  return n_read;
+}
+
+/* Reads up to SIZE bytes from IS into BUFFER while IS's encoding is still
+   undetermined (state S_AUTO).
+
+   Leading ASCII bytes, as counted by encoding_guess_count_ascii(), are copied
+   to BUFFER unchanged.  At the first other byte, the input buffer is topped
+   up and encoding_guess_tail_is_utf8() decides whether IS switches to S_UTF8
+   or S_CONVERT.  If nothing has been output by then, the rest of the request
+   is handed to u8_istream_read() in the new state.
+
+   Returns the number of bytes stored in BUFFER, 0 at end of file, or -1 if
+   reading the file descriptor failed before any data could be returned (in
+   which case errno is as set by read()). */
+static ssize_t
+read_auto (struct u8_istream *is, char *buffer, size_t size)
+{
+  size_t original_size = size;
+  ssize_t retval = 0;
+
+  while (size > 0)
+    {
+      if (is->length > 0)
+        {
+          size_t n_ascii;
+
+          n_ascii = encoding_guess_count_ascii (is->head,
+                                                MIN (is->length, size));
+
+          memcpy (buffer, is->head, n_ascii);
+          buffer += n_ascii;
+          size -= n_ascii;
+
+          is->head += n_ascii;
+          is->length -= n_ascii;
+
+          if (size == 0)
+            break;
+        }
+
+      if (is->length == 0)
+        {
+          retval = fill_buffer (is);
+          if (retval > 0)
+            continue;
+          else
+            break;
+        }
+
+      /* is->head points to a byte that isn't a printable ASCII character.
+         Fill up the buffer and check for UTF-8.
+
+         The result of fill_buffer() is deliberately ignored here: the
+         decision is made on whatever input is available, and any read error
+         will show up again on the next attempt to read. */
+      fill_buffer (is);
+      is->state = (encoding_guess_tail_is_utf8 (is->head, is->length)
+                   ? S_UTF8 : S_CONVERT);
+      if (size == original_size)
+        return u8_istream_read (is, buffer, size);
+      break;
+    }
+
+  /* Report a read error only if there is no data to return.  Otherwise
+     return the data; the caller will see the error on a later call. */
+  if (retval < 0 && size == original_size)
+    return -1;
+
+  return original_size - size;
+}
+
+/* Conversion callback for read_convert() that recodes through iconv().
+   Passes all of its arguments straight to iconv().  Returns 0 if iconv()
+   succeeded, otherwise the errno value that iconv() set. */
+static int
+convert_iconv (iconv_t converter,
+               char **inbufp, size_t *inbytesleft,
+               char **outbufp, size_t *outbytesleft)
+{
+  if (iconv (converter, inbufp, inbytesleft, outbufp, outbytesleft)
+      == SIZE_MAX)
+    return errno;
+
+  return 0;
+}
+
+/* Conversion callback for read_convert() for input that is already UTF-8.
+   It takes the same arguments as convert_iconv() but ignores CONVERTER.
+
+   Scans the input at *INBUFP one character at a time, stopping at the first
+   problem or when the space available for output is used up, then copies the
+   bytes scanned so far to *OUTBUFP and advances both buffer pointers and byte
+   counts past them.
+
+   Returns 0 if all of the input was copied, E2BIG if the output space ran out
+   first, EILSEQ if scanning stopped at an invalid byte sequence, or EINVAL if
+   it stopped at an incomplete sequence at the end of the input. */
+static int
+convert_utf8 (iconv_t converter UNUSED,
+              char **inbufp, size_t *inbytesleft,
+              char **outbufp, size_t *outbytesleft)
+{
+  const uint8_t *in = CHAR_CAST (const uint8_t *, *inbufp);
+  size_t n = MIN (*inbytesleft, *outbytesleft); /* Most bytes we may copy. */
+  size_t ofs = 0;               /* Number of input bytes accepted so far. */
+  int error;
+
+  for (;;)
+    {
+      ucs4_t uc;
+      int mblen;
+
+      if (ofs >= n)
+        {
+          /* Reached the limit.  If input remains, then it was the output
+             space that ran out. */
+          error = ofs < *inbytesleft ? E2BIG : 0;
+          break;
+        }
+
+      mblen = u8_mbtouc (&uc, in + ofs, n - ofs);
+      if (uc == 0xfffd)
+        {
+          /* Decode again, this time looking at all of the remaining input
+             rather than just the N-byte window, to find out why U+FFFD was
+             reported. */
+          int retval = u8_mbtoucr (&uc, in + ofs, *inbytesleft - ofs);
+          if (retval == mblen)
+            {
+              /* There's an actual U+FFFD in the input stream.  Carry on. */
+            }
+          else
+            {
+              /* -1 and -2 map to invalid and incomplete input respectively.
+                 Any other result is treated as a character that does not
+                 fit in the N-byte window. */
+              error = (retval == -1 ? EILSEQ
+                       : retval == -2 ? EINVAL
+                       : E2BIG);
+              break;
+            }
+        }
+
+      ofs += mblen;
+    }
+
+  /* Pass the accepted bytes through unchanged. */
+  if (ofs > 0)
+    {
+      memcpy (*outbufp, *inbufp, ofs);
+      *inbufp += ofs;
+      *inbytesleft -= ofs;
+      *outbufp += ofs;
+      *outbytesleft -= ofs;
+    }
+
+  return error;
+}
+
+/* Reads up to SIZE bytes of UTF-8 from IS into BUFFER, using CONVERT (either
+   convert_iconv() or convert_utf8()) to transform the buffered input.
+
+   Output that does not fit in BUFFER is staged in IS->outbuf and delivered
+   first on a later iteration or call.  Each invalid input byte is replaced by
+   U+FFFD via substitute_invalid_input_byte().
+
+   Returns the number of bytes stored in BUFFER.  Returns -1 if CONVERT reports
+   an unexpected error.  End of file or a read error is reported, by returning
+   fill_buffer()'s result, only if no output was produced by this call. */
+static ssize_t
+read_convert (struct u8_istream *is,
+              int (*convert) (iconv_t converter,
+                              char **inbufp, size_t *inbytesleft,
+                              char **outbufp, size_t *outbytesleft),
+              char *buffer, size_t size)
+{
+  size_t original_size = size;
+
+  while (size > 0)
+    {
+      ssize_t n_read;
+
+      /* First deliver any output left pending in is->outbuf. */
+      if (is->outlen > 0)
+        {
+          size_t n = MIN (size, is->outlen);
+
+          memcpy (buffer, is->outbuf, n);
+          is->outlen -= n;
+          if (is->outlen > 0)
+            memmove (is->outbuf, is->outbuf + n, is->outlen);
+
+          buffer += n;
+          size -= n;
+
+          if (size == 0)
+            break;
+        }
+
+      /* Then convert buffered input directly into the caller's buffer. */
+      if (is->length)
+        {
+          int error = convert (is->converter,
+                               &is->head, &is->length,
+                               &buffer, &size);
+          if (size == 0)
+            break;
+
+          switch (error)
+            {
+            case 0:
+              /* Converted all of the input into output, possibly with space
+                 for output left over.
+
+                 Read more input. */
+              break;
+
+            case EILSEQ:
+              substitute_invalid_input_byte (is);
+              continue;
+
+            case EINVAL:
+              /* Incomplete byte sequence at end of input.  Read more
+                 input. */
+              break;
+
+            default:
+              /* A real error of some kind (ENOMEM?). */
+              return -1;
+
+            case E2BIG:
+              /* Ran out of room for output.
+                 Convert into outbuf and copy from there instead. */
+              {
+                char *outptr = is->outbuf;
+                size_t outleft = sizeof is->outbuf;
+
+                error = convert (is->converter,
+                                 &is->head, &is->length,
+                                 &outptr, &outleft);
+                is->outlen = outptr - is->outbuf;
+                if (is->outlen > 0)
+                  continue;     /* Loop back to deliver the staged output. */
+
+                switch (error)
+                  {
+                  case EILSEQ:
+                    substitute_invalid_input_byte (is);
+                    continue;
+
+                  case E2BIG:
+                  case EINVAL:
+                    continue;
+
+                  default:
+                    /* A real error of some kind (ENOMEM?). */
+                    return -1;
+                  }
+              }
+            }
+        }
+
+      /* All buffered input has been consumed, except possibly an incomplete
+         multibyte sequence at the end, so read more. */
+      assert (is->length <= MB_LEN_MAX);
+      n_read = fill_buffer (is);
+      if (n_read <= 0)
+        {
+          if (original_size != size)
+            {
+              /* We produced some output so don't report EOF or error yet. */
+              break;
+            }
+          else if (n_read == 0 && is->length != 0)
+            {
+              /* Incomplete byte sequence at end of file. */
+              substitute_invalid_input_byte (is);
+            }
+          else
+            {
+              /* Propagate end-of-file or error to caller. */
+              return n_read;
+            }
+        }
+    }
+
+  return original_size - size;
+}
+
+/* Reads UTF-8 text from IS into BUFFER, storing at most SIZE bytes.  Returns
+   the number of bytes stored on success, 0 at end of file, or -1 if an error
+   occurred before any data could be read.  Upon error, sets errno to an
+   appropriate value.
+
+   The work is done by the reader that matches IS's current state. */
+ssize_t
+u8_istream_read (struct u8_istream *is, char *buffer, size_t size)
+{
+  switch (is->state)
+    {
+    case S_AUTO:
+      /* Encoding not decided yet: pass ASCII through and keep guessing. */
+      return read_auto (is, buffer, size);
+
+    case S_UTF8:
+      /* Input is UTF-8 already: validate and copy. */
+      return read_convert (is, convert_utf8, buffer, size);
+
+    case S_CONVERT:
+      /* Input is in some other encoding: recode with iconv. */
+      return read_convert (is, convert_iconv, buffer, size);
+    }
+
+  NOT_REACHED ();
+}
+
+/* Returns the file descriptor underlying IS, that is, the descriptor from
+   which IS reads its input.  The stream is not modified. */
+int
+u8_istream_fileno (const struct u8_istream *is)
+{
+  return is->fd;
+}
+\f
+/* Test functions.
+
+   These functions are probably useful only for white-box testing. */
+
+/* Returns true if the encoding of the file being read by IS is not yet
+   known, that is, if IS is still in automatic detection mode (S_AUTO). */
+bool
+u8_istream_is_auto (const struct u8_istream *is)
+{
+  return is->state == S_AUTO;
+}
+
+/* Returns true if the encoding of the file being read by IS has been
+   determined to be UTF-8 (S_UTF8), so that input is passed through without
+   iconv conversion. */
+bool
+u8_istream_is_utf8 (const struct u8_istream *is)
+{
+  return is->state == S_UTF8;
+}
diff --git a/src/libpspp/u8-istream.h b/src/libpspp/u8-istream.h

new file mode 100644 (file)

index 0000000..3e2acee
--- /dev/null
+++ b/src/libpspp/u8-istream.h
@@ -0,0 +1,45 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef LIBPSPP_U8_ISTREAM_H
+#define LIBPSPP_U8_ISTREAM_H 1
+
+#include <sys/types.h>
+#include <stdbool.h>
+
+/* u8_istream.
+
+   Reads a text file and reencodes its contents into UTF-8, with optional
+   automatic encoding detection.
+*/
+
+/* Size, in bytes, of the input buffer allocated for each u8_istream. */
+#define U8_ISTREAM_BUFFER_SIZE 4096
+
+/* Creating a stream.  Both functions return NULL and set errno on failure. */
+struct u8_istream *u8_istream_for_fd (const char *fromcode, int fd);
+struct u8_istream *u8_istream_for_file (const char *fromcode,
+                                        const char *filename, int flags);
+
+/* Destroying a stream.  u8_istream_close() also closes the underlying file
+   descriptor and returns the result of close(); u8_istream_free() leaves the
+   descriptor open. */
+int u8_istream_close (struct u8_istream *);
+void u8_istream_free (struct u8_istream *);
+
+/* Reads UTF-8 text into a buffer of the given size.  Returns the number of
+   bytes read, 0 at end of file, or -1 on error. */
+ssize_t u8_istream_read (struct u8_istream *, char *, size_t);
+
+/* Returns the file descriptor underlying the stream. */
+int u8_istream_fileno (const struct u8_istream *);
+
+/* Test functions, probably useful only for white-box testing. */
+bool u8_istream_is_auto (const struct u8_istream *);
+bool u8_istream_is_utf8 (const struct u8_istream *);
+
+#endif /* libpspp/u8-istream.h */
diff --git a/tests/automake.mk b/tests/automake.mk

index 7ef7d423fabf2febea9bb051d715498987db0479..0b4a825c050d487f882c26291b2738165c8a5705 100644 (file)
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -24,6 +24,7 @@ check_PROGRAMS += \
         tests/libpspp/string-set-test \
         tests/libpspp/stringi-set-test \
         tests/libpspp/tower-test \
+       tests/libpspp/u8-istream-test \
         tests/output/render-test
  
  check-programs: $(check_PROGRAMS)
@@ -161,6 +162,9 @@ tests_libpspp_tower_test_SOURCES = \
  tests_libpspp_tower_test_LDADD = gl/libgl.la $(LIBINTL) 
  tests_libpspp_tower_test_CPPFLAGS = $(AM_CPPFLAGS) -DASSERT_LEVEL=10
  
+tests_libpspp_u8_istream_test_SOURCES = tests/libpspp/u8-istream-test.c
+tests_libpspp_u8_istream_test_LDADD = src/libpspp/libpspp.la gl/libgl.la
+
  tests_libpspp_sparse_array_test_SOURCES = \
         src/libpspp/sparse-array.c \
         src/libpspp/pool.c \
@@ -336,6 +340,7 @@ TESTSUITE_AT = \
         tests/libpspp/string-set.at \
         tests/libpspp/stringi-set.at \
         tests/libpspp/tower.at \
+       tests/libpspp/u8-istream.at \
         tests/math/moments.at \
         tests/math/randist.at \
         tests/output/charts.at \
diff --git a/tests/libpspp/u8-istream-test.c b/tests/libpspp/u8-istream-test.c

new file mode 100644 (file)

index 0000000..ab1b717
--- /dev/null
+++ b/tests/libpspp/u8-istream-test.c
@@ -0,0 +1,126 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/u8-istream.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libpspp/i18n.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/xalloc.h"
+
+/* Prints a summary of the available commands on stdout, then exits with
+   status 0.  Does not return. */
+static void
+usage (void)
+{
+  printf ("usage: %s COMMAND [ARG]...\n"
+          "The available commands are:\n"
+          "  help\n"
+          "    print this usage message\n"
+          "  buffer-size\n"
+          "    print the buffer size, in bytes, on stdout\n"
+          "  read FILE ENCODING [OUTBUF]\n"
+          "    read FILE encoded in ENCODING (with output buffer size\n"
+          "    OUTBUF) and print it on stdout in UTF-8\n",
+          program_name);
+  exit (0);
+}
+
+/* Implements the "read" command.  Reads the file named by ARGV[2] (or stdin,
+   if it is "-"), treating it as encoded in ARGV[3] (an empty string is passed
+   to the library as a null encoding), and writes the UTF-8 result to stdout.
+
+   ARGV[4], if present, is the size in bytes of the buffer passed to
+   u8_istream_read().  It must be a positive integer; the default is 4096.
+
+   Prints "Auto mode" or "UTF-8 mode" before and again after the data when the
+   stream is in the corresponding state.  Exits with status 1 upon any
+   error. */
+static void
+cmd_read (int argc, char *argv[])
+{
+  struct u8_istream *is;
+  const char *encoding;
+  const char *filename;
+  int outbufsize;
+  char *buffer;
+
+  if (argc < 4 || argc > 5)
+    error (1, 0, "bad syntax for `%s' command; use `%s help' for help",
+           argv[1], program_name);
+
+  /* A buffer size of 0 would make the first read return 0, which is
+     indistinguishable from end of file, and a negative size would turn into a
+     huge allocation request.  Reject both, along with non-numeric input
+     (for which atoi() yields 0). */
+  outbufsize = argc > 4 ? atoi (argv[4]) : 4096;
+  if (outbufsize <= 0)
+    error (1, 0, "invalid output buffer size `%s'", argv[4]);
+  buffer = xmalloc (outbufsize);
+
+  filename = argv[2];
+  encoding = *argv[3] ? argv[3] : NULL;
+
+  is = (!strcmp(filename, "-")
+        ? u8_istream_for_fd (encoding, STDIN_FILENO)
+        : u8_istream_for_file (encoding, filename, O_RDONLY));
+  if (is == NULL)
+    error (1, errno, "u8_istream_open failed");
+
+  if (u8_istream_is_auto (is))
+    printf ("Auto mode\n");
+  else if (u8_istream_is_utf8 (is))
+    printf ("UTF-8 mode\n");
+
+  for (;;)
+    {
+      ssize_t n;
+
+      n = u8_istream_read (is, buffer, outbufsize);
+      if (n > 0)
+        fwrite (buffer, 1, n, stdout);
+      else if (n < 0)
+        error (1, errno, "u8_istream_read failed");
+      else
+        break;
+    }
+
+  if (u8_istream_is_auto (is))
+    printf ("Auto mode\n");
+  else if (u8_istream_is_utf8 (is))
+    printf ("UTF-8 mode\n");
+
+  /* Stdin was not opened by us, so free the stream without closing it. */
+  if (!strcmp(filename, "-"))
+    u8_istream_free (is);
+  else
+    {
+      if (u8_istream_close (is) != 0)
+        error (1, errno, "u8_istream_close failed");
+    }
+
+  free (buffer);
+}
+
+/* Entry point for the u8_istream test driver.  ARGV[1] names the command to
+   run: "help" (or "--help"), "buffer-size", or "read".  Exits with status 1
+   if the command is missing or unknown. */
+int
+main (int argc, char *argv[])
+{
+  const char *command;
+
+  set_program_name (argv[0]);
+  i18n_init ();
+
+  /* error() with a nonzero status does not return. */
+  if (argc < 2)
+    error (1, 0, "missing command name; use `%s help' for help", program_name);
+
+  command = argv[1];
+  if (!strcmp (command, "help") || !strcmp (command, "--help"))
+    usage ();
+  else if (!strcmp (command, "buffer-size"))
+    printf ("%d\n", U8_ISTREAM_BUFFER_SIZE);
+  else if (!strcmp (command, "read"))
+    cmd_read (argc, argv);
+  else
+    error (1, 0, "unknown command `%s'; use `%s help' for help",
+           command, program_name);
+
+  return 0;
+}
diff --git a/tests/libpspp/u8-istream.at b/tests/libpspp/u8-istream.at

new file mode 100644 (file)

index 0000000..2d8baa4
--- /dev/null
+++ b/tests/libpspp/u8-istream.at
@@ -0,0 +1,142 @@
+AT_BANNER([u8_istream])
+
+AT_SETUP([read ASCII])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings ASCII])
+AT_CHECK([echo string | u8-istream-test read - ASCII], [0], [string
+])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' | u8-istream-test read - UTF-8], [0], [dnl
+UTF-8 mode
+日本語
+UTF-8 mode
+])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | u8-istream-test read - EUC-JP],
+  [0],
+  [ぁ あ ぃ い ぅ う ぇ え ぉ お
+])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8 with character split across input buffers])
+AT_KEYWORDS([u8_istream])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n') > input
+(echo "UTF-8 mode"
+ cat input
+ echo "UTF-8 mode") > expout
+AT_CHECK([u8-istream-test read input UTF-8 16], [0], [expout])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8 with character split across output buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n' | u8-istream-test read - UTF-8 16], [0], [dnl
+UTF-8 mode
+ぁあぃいぅうぇえぉお
+UTF-8 mode
+])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8 with character split across input and output buffers])
+AT_KEYWORDS([u8_istream])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n') > input
+(echo "UTF-8 mode"
+ cat input
+ echo "UTF-8 mode") > expout
+AT_CHECK([u8-istream-test read input UTF-8 16], [0], [expout])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP with character split across input buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 '
+ printf '\244\250 \244\251 \244\252\n') > input
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\343\201\201\040\343\201\202\040\343\201\203\040\343\201\204\040'
+ printf '\343\201\205\040\343\201\206\040\343\201\207\040\343\201\210\040'
+ printf '\343\201\211\040\343\201\212\n') > expout
+AT_CHECK([u8-istream-test read input EUC-JP], [0], [expout])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP with character split across output buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf '\244\241\244\242\244\243\244\244\244\245\244\246\244\247\244\250\244\251\244\252\n' | u8-istream-test read - EUC-JP 16],
+  [0],
+  [ぁあぃいぅうぇえぉお
+])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP with character split across input and output buffers])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf 'xyz\244\241\244\242\244\243\244\244\244\245\244\246\244\247\244\250'
+ printf '\244\251\244\252\n') > input
+($PERL -e "print 'x' x ($buffer_size - 16)"
+ printf '\170\171\172\343\201\201\343\201\202\343\201\203\343\201\204\343'
+ printf '\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201'
+ printf '\212\n') > expout
+AT_CHECK([u8-istream-test read input EUC-JP 16], [0], [expout])
+AT_CLEANUP
+
+AT_SETUP([read ASCII as Auto])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([echo string | u8-istream-test read - Auto], [0], [dnl
+Auto mode
+string
+Auto mode
+])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8 as Auto])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([printf 'entr\303\251e\n' | u8-istream-test read - Auto], [0], [dnl
+Auto mode
+entrée
+UTF-8 mode
+])
+AT_CLEANUP
+
+AT_SETUP([read ISO-8859-1 as Auto,ISO-8859-1])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings ISO-8859-1])
+buffer_size=`u8-istream-test buffer-size`
+($PERL -e "print 'x' x int($buffer_size * 2.5)"; printf 'entr\351e\n') > input
+(echo "Auto mode"
+ $PERL -e "print 'x' x int($buffer_size * 2.5)"
+ printf 'entr\303\251e\n') > expout
+AT_CHECK([u8-istream-test read input Auto,ISO-8859-1], [0], [expout])
+AT_CLEANUP
+
+dnl UTF-16BE is not ASCII compatible so this doesn't start out in Auto mode.
+AT_SETUP([read UTF-16BE as Auto,UTF-16BE])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings UTF-16BE])
+AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | u8-istream-test read - Auto,UTF-16BE], 
+  [0], [dnl
+entrée
+])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP as Auto,EUC-JP])
+AT_KEYWORDS([u8_istream])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf 'entr\217\253\261e\n' | u8-istream-test read - Auto,EUC-JP], 
+  [0], [entrée
+])
+AT_CLEANUP
+
author	Ben Pfaff <blp@cs.stanford.edu>
	Sat, 19 Mar 2011 23:34:53 +0000 (16:34 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 20 Mar 2011 16:43:44 +0000 (09:43 -0700)
src/libpspp/automake.mk		patch \| blob \| history
src/libpspp/u8-istream.c	[new file with mode: 0644]	patch \| blob
src/libpspp/u8-istream.h	[new file with mode: 0644]	patch \| blob
tests/automake.mk		patch \| blob \| history
tests/libpspp/u8-istream-test.c	[new file with mode: 0644]	patch \| blob
tests/libpspp/u8-istream.at	[new file with mode: 0644]	patch \| blob