pintos-os.org Git - pspp/blob - src/libpspp/u8-istream.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "u8-istream.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <fcntl.h>
  24 #include <iconv.h>
  25 #include <limits.h>
  26 #include <stdint.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31 #include <unistr.h>
  32
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/cast.h"
  35 #include "libpspp/compiler.h"
  36 #include "libpspp/encoding-guesser.h"
  37 #include "libpspp/i18n.h"
  38
  39 #include "gl/c-strcase.h"
  40 #include "gl/localcharset.h"
  41 #include "gl/minmax.h"
  42
/* What an u8_istream currently knows about the encoding of its input.
   u8_istream_read() dispatches on this value to pick a reader. */
enum u8_istream_state
  {
    S_AUTO,                     /* Stream encoding not yet known. */
    S_UTF8,                     /* Stream encoding is known to be UTF-8. */
    S_CONVERT                   /* Stream encoding is known but not UTF-8. */
  };
  49
/* An input stream that delivers its contents to the caller as UTF-8. */
struct u8_istream
  {
    int fd;                     /* File descriptor that fill_buffer() reads. */
    iconv_t converter;          /* Converter to UTF-8, or (iconv_t) -1 if none
                                   has been opened. */
    enum u8_istream_state state; /* Selects the reader in u8_istream_read(). */

    /* Raw input read from FD but not yet consumed. */
    char *buffer;               /* malloc()'d, U8_ISTREAM_BUFFER_SIZE bytes. */
    char *head;                 /* First unconsumed byte within BUFFER. */
    size_t length;              /* Number of unconsumed bytes at HEAD. */

    /* UTF-8 output that has been produced but not yet handed to the caller:
       either a converted character that did not fit in the caller's buffer
       or a U+FFFD substituted for an invalid input byte. */
    char outbuf[4];
    size_t outlen;              /* Number of bytes in use in OUTBUF. */
  };
  63
/* Defined further down; declared here because u8_istream_for_fd() calls it
   before its definition. */
static ssize_t fill_buffer (struct u8_istream *);
  65
  66 /* Opens FILENAME, which is encoded in FROMCODE, for reading as an UTF-8
  67    stream, passing FLAGS to the open() function.  Returns a new u8_istream if
  68    successful, otherwise returns NULL and sets errno to an appropriate value.
  69
  70    The accepted forms for FROMCODE are listed at the top of
  71    encoding-guesser.h. */
  72 struct u8_istream *
  73 u8_istream_for_file (const char *fromcode, const char *filename, int flags)
  74 {
  75   struct u8_istream *is;
  76   int fd;
  77
  78   assert (!(flags & O_CREAT));
  79
  80   fd = open (filename, flags);
  81   if (fd < 0)
  82     return NULL;
  83
  84   is = u8_istream_for_fd (fromcode, fd);
  85   if (is == NULL)
  86     {
  87       int save_errno = errno;
  88       close (fd);
  89       errno = save_errno;
  90     }
  91
  92   return is;
  93 }
  94
  95 /* Creates and returns a new u8_istream that reads its input from FD.  Returns
  96    a new u8_istream if successful, otherwise returns NULL and sets errno to an
  97    appropriate value.
  98
  99    The accepted forms for FROMCODE are listed at the top of
 100    encoding-guesser.h. */
 101 struct u8_istream *
 102 u8_istream_for_fd (const char *fromcode, int fd)
 103 {
 104   struct u8_istream *is;
 105   const char *encoding;
 106
 107   is = malloc (sizeof *is);
 108   if (is == NULL)
 109     return NULL;
 110
 111   is->fd = fd;
 112   is->converter = (iconv_t) -1;
 113   is->buffer = malloc (U8_ISTREAM_BUFFER_SIZE);
 114   if (is->buffer == NULL)
 115     goto error;
 116   is->head = is->buffer;
 117   is->length = 0;
 118   is->outlen = 0;
 119
 120   if (fill_buffer (is) < 0)
 121     goto error;
 122
 123   encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length);
 124   if (is_encoding_utf8 (encoding))
 125     {
 126       unsigned int bom_len;
 127
 128       is->state = S_UTF8;
 129       bom_len = encoding_guess_bom_length (encoding, is->buffer, is->length);
 130       is->head += bom_len;
 131       is->length -= bom_len;
 132     }
 133   else
 134     {
 135       if (encoding_guess_encoding_is_auto (fromcode)
 136           && !strcmp (encoding, "ASCII"))
 137         {
 138           is->state = S_AUTO;
 139           encoding = encoding_guess_parse_encoding (fromcode);
 140         }
 141       else
 142         is->state = S_CONVERT;
 143
 144       is->converter = iconv_open ("UTF-8", encoding);
 145       if (is->converter == (iconv_t) -1)
 146         goto error;
 147     }
 148
 149   return is;
 150
 151 error:
 152   u8_istream_free (is);
 153   return NULL;
 154 }
 155
 156 /* Closes IS and its underlying file descriptor and frees all associated
 157    resources.  Returns the return value from close(). */
 158 int
 159 u8_istream_close (struct u8_istream *is)
 160 {
 161   if (is != NULL)
 162     {
 163       int fd = is->fd;
 164       u8_istream_free (is);
 165       return close (fd);
 166     }
 167   return 0;
 168 }
 169
 170 /* Frees IS and associated resources, but does not close the underlying file
 171    descriptor.  (Thus, the client must close the file descriptor when it is no
 172    longer needed.) */
 173 void
 174 u8_istream_free (struct u8_istream *is)
 175 {
 176   if (is != NULL)
 177     {
 178       if (is->converter != (iconv_t) -1)
 179         iconv_close (is->converter);
 180       free (is->buffer);
 181       free (is);
 182     }
 183 }
 184
 185 static void
 186 substitute_invalid_input_byte (struct u8_istream *is)
 187 {
 188   assert (is->outlen == 0);
 189   is->head++;
 190   is->length--;
 191   is->outlen = u8_uctomb (CHAR_CAST (uint8_t *, is->outbuf),
 192                           0xfffd, sizeof is->outbuf);
 193 }
 194
 195 static ssize_t
 196 fill_buffer (struct u8_istream *is)
 197 {
 198   ssize_t n;
 199
 200   /* Move any unused bytes to the beginning of the input buffer. */
 201   if (is->length > 0 && is->buffer != is->head)
 202     memmove (is->buffer, is->head, is->length);
 203   is->head = is->buffer;
 204
 205   /* Read more input. */
 206   do
 207     {
 208       n = read (is->fd, is->buffer + is->length,
 209                 U8_ISTREAM_BUFFER_SIZE - is->length);
 210     }
 211   while (n < 0 && errno == EINTR);
 212   if (n > 0)
 213     is->length += n;
 214   return n;
 215 }
 216
 217 static ssize_t
 218 read_auto (struct u8_istream *is, char *buffer, size_t size)
 219 {
 220   size_t original_size = size;
 221   int retval = 0;
 222
 223   while (size > 0)
 224     {
 225       if (is->length > 0)
 226         {
 227           size_t n_ascii;
 228
 229           n_ascii = encoding_guess_count_ascii (is->head,
 230                                                 MIN (is->length, size));
 231
 232           memcpy (buffer, is->head, n_ascii);
 233           buffer += n_ascii;
 234           size -= n_ascii;
 235
 236           is->head += n_ascii;
 237           is->length -= n_ascii;
 238
 239           if (size == 0)
 240             break;
 241         }
 242
 243       if (is->length == 0)
 244         {
 245           retval = fill_buffer (is);
 246           if (retval > 0)
 247             continue;
 248           else
 249             break;
 250         }
 251
 252       /* is->head points to a byte that isn't a printable ASCII character.
 253          Fill up the buffer and check for UTF-8. */
 254       fill_buffer (is);
 255       is->state = (encoding_guess_tail_is_utf8 (is->head, is->length)
 256                    ? S_UTF8 : S_CONVERT);
 257       if (size == original_size)
 258         return u8_istream_read (is, buffer, size);
 259       break;
 260     }
 261
 262   return original_size - size;
 263 }
 264
 265 static int
 266 convert_iconv (iconv_t converter,
 267                char **inbufp, size_t *inbytesleft,
 268                char **outbufp, size_t *outbytesleft)
 269 {
 270   size_t n = iconv (converter, (ICONV_CONST char **) inbufp, inbytesleft,
 271                     outbufp, outbytesleft);
 272   return n == SIZE_MAX ? errno : 0;
 273 }
 274
 275 static int
 276 convert_utf8 (iconv_t converter UNUSED,
 277               char **inbufp, size_t *inbytesleft,
 278               char **outbufp, size_t *outbytesleft)
 279 {
 280   const uint8_t *in = CHAR_CAST (const uint8_t *, *inbufp);
 281   size_t n = MIN (*inbytesleft, *outbytesleft);
 282   size_t ofs = 0;
 283   int error;
 284
 285   for (;;)
 286     {
 287       ucs4_t uc;
 288       int mblen;
 289
 290       if (ofs >= n)
 291         {
 292           error = ofs < *inbytesleft ? E2BIG : 0;
 293           break;
 294         }
 295
 296       mblen = u8_mbtouc (&uc, in + ofs, n - ofs);
 297       if (uc == 0xfffd)
 298         {
 299           int retval = u8_mbtoucr (&uc, in + ofs, *inbytesleft - ofs);
 300           if (retval == mblen)
 301             {
 302               /* There's an actual U+FFFD in the input stream.  Carry on. */
 303             }
 304           else
 305             {
 306               error = (retval == -1 ? EILSEQ
 307                        : retval == -2 ? EINVAL
 308                        : E2BIG);
 309               break;
 310             }
 311         }
 312
 313       ofs += mblen;
 314     }
 315
 316   if (ofs > 0)
 317     {
 318       memcpy (*outbufp, *inbufp, ofs);
 319       *inbufp += ofs;
 320       *inbytesleft -= ofs;
 321       *outbufp += ofs;
 322       *outbytesleft -= ofs;
 323     }
 324
 325   return error;
 326 }
 327
/* Reads up to SIZE bytes of UTF-8 text from IS into BUFFER, using CONVERT to
   turn IS's buffered input into output.  CONVERT is called the way
   convert_iconv() and convert_utf8() are declared: it advances the input and
   output pointers, reduces the byte counts, and returns 0 or an errno value.

   Returns the number of bytes stored into BUFFER if that is nonzero.
   Otherwise returns 0 at end of file or a negative value on error (either
   from fill_buffer() or because CONVERT reported an error other than
   EILSEQ, EINVAL, or E2BIG). */
static ssize_t
read_convert (struct u8_istream *is,
              int (*convert) (iconv_t converter,
                              char **inbufp, size_t *inbytesleft,
                              char **outbufp, size_t *outbytesleft),
              char *buffer, size_t size)
{
  size_t original_size = size;

  while (size > 0)
    {
      ssize_t n_read;

      /* First hand over any output left pending in is->outbuf, shifting
         whatever does not fit to the front of outbuf for next time. */
      if (is->outlen > 0)
        {
          size_t n = MIN (size, is->outlen);

          memcpy (buffer, is->outbuf, n);
          is->outlen -= n;
          if (is->outlen > 0)
            memmove (is->outbuf, is->outbuf + n, is->outlen);

          buffer += n;
          size -= n;

          if (size == 0)
            break;
        }

      /* Then convert buffered input directly into the caller's buffer. */
      if (is->length)
        {
          int error = convert (is->converter,
                               &is->head, &is->length,
                               &buffer, &size);
          if (size == 0)
            break;

          switch (error)
            {
            case 0:
              /* Converted all of the input into output, possibly with space
                 for output left over.

                 Read more input. */
              break;

            case EILSEQ:
              /* Invalid input: drop one byte, queue U+FFFD in its place, and
                 go around again to deliver it. */
              substitute_invalid_input_byte (is);
              continue;

            case EINVAL:
              /* Incomplete byte sequence at end of input.  Read more
                 input. */
              break;

            default:
              /* A real error of some kind (ENOMEM?). */
              return -1;

            case E2BIG:
              /* Ran out of room for output.
                 Convert into outbuf and copy from there instead. */
              {
                char *outptr = is->outbuf;
                size_t outleft = sizeof is->outbuf;

                error = convert (is->converter,
                                 &is->head, &is->length,
                                 &outptr, &outleft);
                is->outlen = outptr - is->outbuf;
                if (is->outlen > 0)
                  continue;

                /* Nothing was produced, so ERROR says why.  Every path below
                   either restarts the loop or returns; none falls through
                   to the fill_buffer() call after the outer switch. */
                switch (error)
                  {
                  case EILSEQ:
                    substitute_invalid_input_byte (is);
                    continue;

                  case E2BIG:
                  case EINVAL:
                    continue;

                  default:
                    /* A real error of some kind (ENOMEM?). */
                    return -1;
                  }
              }
            }
        }

      /* Reached only when the input buffer is empty or CONVERT returned 0 or
         EINVAL, so at most an incomplete character should remain in it. */
      assert (is->length <= MB_LEN_MAX);
      n_read = fill_buffer (is);
      if (n_read <= 0)
        {
          if (original_size != size)
            {
              /* We produced some output so don't report EOF or error yet. */
              break;
            }
          else if (n_read == 0 && is->length != 0)
            {
              /* Incomplete byte sequence at end of file. */
              substitute_invalid_input_byte (is);
            }
          else
            {
              /* Propagate end-of-file or error to caller. */
              return n_read;
            }
        }
    }

  return original_size - size;
}
 443
 444 /* Reads up to SIZE bytes of UTF-8 text from IS into BUFFER.  Returns the
 445    number of bytes read if successful, 0 at end of file, or -1 if an error
 446    occurred before any data could be read.  Upon error, sets errno to an
 447    appropriate value. */
 448 ssize_t
 449 u8_istream_read (struct u8_istream *is, char *buffer, size_t size)
 450 {
 451   switch (is->state)
 452     {
 453     case S_CONVERT:
 454       return read_convert (is, convert_iconv, buffer, size);
 455
 456     case S_AUTO:
 457       return read_auto (is, buffer, size);
 458
 459     case S_UTF8:
 460       return read_convert (is, convert_utf8, buffer, size);
 461     }
 462
 463   NOT_REACHED ();
 464 }
 465
 466 /* Returns the file descriptor underlying IS. */
 467 int
 468 u8_istream_fileno (const struct u8_istream *is)
 469 {
 470   return is->fd;
 471 }
 472 \f
 473 /* Test functions.
 474
 475    These functions are probably useful only for white-box testing. */
 476
 477 /* Returns true if the encoding of the file being read by IS is not yet
 478    known. */
 479 bool
 480 u8_istream_is_auto (const struct u8_istream *is)
 481 {
 482   return is->state == S_AUTO;
 483 }
 484
 485 /* Returns true if the encoding of the file being read by IS has been
 486    determined to be UTF-8. */
 487 bool
 488 u8_istream_is_utf8 (const struct u8_istream *is)
 489 {
 490   return is->state == S_UTF8;
 491 }