1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "u8-istream.h"
33 #include "libpspp/assertion.h"
34 #include "libpspp/cast.h"
35 #include "libpspp/compiler.h"
36 #include "libpspp/encoding-guesser.h"
37 #include "libpspp/i18n.h"
39 #include "gl/c-strcase.h"
40 #include "gl/localcharset.h"
41 #include "gl/minmax.h"
/* States of a u8_istream with respect to what is known about the encoding
   of its input; kept in the stream's `state' member. */
45 S_AUTO, /* Stream encoding not yet known. */
46 S_UTF8, /* Stream encoding is known to be UTF-8. */
47 S_CONVERT /* Stream encoding is known but not UTF-8. */
/* Current encoding state of the stream (one of the S_* values above). */
54 enum u8_istream_state state;
/* Declared ahead of its definition because u8_istream_for_fd() calls it
   before fill_buffer() is defined further down. */
64 static ssize_t fill_buffer (struct u8_istream *);
66 /* Opens FILENAME, whose contents are encoded in FROMCODE, for reading as a
67 UTF-8 stream, passing FLAGS to the open() function. Returns a new u8_istream
68 if successful, otherwise returns NULL and sets errno to an appropriate value.
70 The accepted forms for FROMCODE are listed at the top of
71 encoding-guesser.h. */
73 u8_istream_for_file (const char *fromcode, const char *filename, int flags)
75 struct u8_istream *is;
/* open() is called below without a mode argument, so FLAGS must not request
   file creation. */
78 assert (!(flags & O_CREAT));
80 fd = open (filename, flags);
/* Encoding detection and stream setup are shared with u8_istream_for_fd(). */
84 is = u8_istream_for_fd (fromcode, fd);
/* Capture errno as it stands here. NOTE(review): presumably so that cleanup
   cannot clobber the value reported to the caller; the restore is not
   visible in this excerpt -- confirm. */
87 int save_errno = errno;
95 /* Creates and returns a new u8_istream that reads its input from FD. Returns
96 a new u8_istream if successful, otherwise returns NULL and sets errno to an appropriate value.
99 The accepted forms for FROMCODE are listed at the top of
100 encoding-guesser.h. */
102 u8_istream_for_fd (const char *fromcode, int fd)
104 struct u8_istream *is;
105 const char *encoding;
107 is = malloc (sizeof *is);
/* (iconv_t) -1 marks "no converter open"; u8_istream_free() tests for this
   value before calling iconv_close(). */
112 is->converter = (iconv_t) -1;
113 is->buffer = malloc (U8_ISTREAM_BUFFER_SIZE);
114 if (is->buffer == NULL)
116 is->head = is->buffer;
/* Read an initial chunk of input; the encoding guess below is based on
   these buffered bytes. */
120 if (fill_buffer (is) < 0)
123 encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length);
124 if (is_encoding_utf8 (encoding))
126 unsigned int bom_len;
/* Exclude the byte-order mark, as measured by encoding_guess_bom_length(),
   from the count of buffered bytes. */
129 bom_len = encoding_guess_bom_length (encoding, is->buffer, is->length);
131 is->length -= bom_len;
/* Special case: FROMCODE requests automatic detection and the head of the
   input was guessed to be plain ASCII. */
135 if (encoding_guess_encoding_is_auto (fromcode)
136 && !strcmp (encoding, "ASCII"))
139 encoding = encoding_guess_parse_encoding (fromcode);
142 is->state = S_CONVERT;
/* Open a converter from ENCODING to UTF-8; iconv_open() returns
   (iconv_t) -1 on failure. */
144 is->converter = iconv_open ("UTF-8", encoding);
145 if (is->converter == (iconv_t) -1)
/* Release the partially constructed stream. u8_istream_free() does not
   close FD. */
152 u8_istream_free (is);
156 /* Closes IS and its underlying file descriptor and frees all associated
157 resources. Returns the return value from close(). */
159 u8_istream_close (struct u8_istream *is)
/* u8_istream_free() releases IS but leaves the file descriptor open, so
   closing the descriptor is this function's own responsibility. */
164 u8_istream_free (is);
170 /* Frees IS and associated resources, but does not close the underlying file
171 descriptor. (Thus, the client must close the file descriptor when it is no
longer needed.) */
174 u8_istream_free (struct u8_istream *is)
/* (iconv_t) -1 is the value stored when no converter has been opened, in
   which case there is nothing to close. */
178 if (is->converter != (iconv_t) -1)
179 iconv_close (is->converter);
/* Stores the UTF-8 encoding of U+FFFD (the Unicode replacement character)
   in IS->outbuf and records its length in IS->outlen. IS->outbuf must be
   empty on entry. */
186 substitute_invalid_input_byte (struct u8_istream *is)
/* The replacement is written at the start of outbuf, so any output still
   pending there would be overwritten. */
188 assert (is->outlen == 0);
191 is->outlen = u8_uctomb (CHAR_CAST (uint8_t *, is->outbuf),
192 0xfffd, sizeof is->outbuf);
/* Compacts IS's input buffer and reads more data from IS->fd into the space
   that follows the IS->length bytes already buffered. The forward
   declaration gives the return type as ssize_t; callers treat a negative
   result as failure. */
196 fill_buffer (struct u8_istream *is)
200 /* Move any unused bytes to the beginning of the input buffer. */
201 if (is->length > 0 && is->buffer != is->head)
/* memmove() rather than memcpy(): source and destination lie in the same
   buffer and may overlap. */
202 memmove (is->buffer, is->head, is->length);
203 is->head = is->buffer;
205 /* Read more input. */
208 n = read (is->fd, is->buffer + is->length,
209 U8_ISTREAM_BUFFER_SIZE - is->length);
/* Retry a read that failed only because it was interrupted by a signal. */
211 while (n < 0 && errno == EINTR);
/* Read routine that u8_istream_read() dispatches to while the stream's
   encoding is still being determined. Copies leading ASCII bytes from IS's
   input buffer into BUFFER unchanged and, on reaching a byte that is not
   printable ASCII, settles IS->state as S_UTF8 or S_CONVERT. */
218 read_auto (struct u8_istream *is, char *buffer, size_t size)
/* Remembered so that the amount delivered can be computed at the end.
   NOTE(review): presumably SIZE is decreased as bytes are copied; the
   statements doing so are not visible in this excerpt. */
220 size_t original_size = size;
/* Count the ASCII bytes at the head of the input, looking no further than
   what is buffered and what the caller has room for. */
229 n_ascii = encoding_guess_count_ascii (is->head,
230 MIN (is->length, size));
232 memcpy (buffer, is->head, n_ascii);
237 is->length -= n_ascii;
245 retval = fill_buffer (is);
252 /* is->head points to a byte that isn't a printable ASCII character.
253 Fill up the buffer and check for UTF-8. */
255 is->state = (encoding_guess_tail_is_utf8 (is->head, is->length)
256 ? S_UTF8 : S_CONVERT);
/* If SIZE still equals its initial value, hand the request back to
   u8_istream_read(), which now sees the newly chosen state. */
257 if (size == original_size)
258 return u8_istream_read (is, buffer, size);
262 return original_size - size;
/* Adapter that gives iconv() the calling convention of read_convert()'s
   CONVERT callback. Returns 0 on success, otherwise the errno value left
   by iconv(). */
266 convert_iconv (iconv_t converter,
267 char **inbufp, size_t *inbytesleft,
268 char **outbufp, size_t *outbytesleft)
270 size_t n = iconv (converter, (ICONV_CONST char **) inbufp, inbytesleft,
271 outbufp, outbytesleft);
/* iconv() reports failure by returning (size_t) -1, which equals
   SIZE_MAX. */
272 return n == SIZE_MAX ? errno : 0;
/* Counterpart of convert_iconv() for input that is already UTF-8. Takes
   the same parameter list (CONVERTER is unused), examines the input with
   u8_mbtouc() and u8_mbtoucr(), and copies bytes from *INBUFP to *OUTBUFP.
   The error codes it produces are E2BIG, EILSEQ, and EINVAL. */
276 convert_utf8 (iconv_t converter UNUSED,
277 char **inbufp, size_t *inbytesleft,
278 char **outbufp, size_t *outbytesleft)
280 const uint8_t *in = CHAR_CAST (const uint8_t *, *inbufp);
/* No more than N bytes can be copied: bounded both by the input available
   and by the room left for output. */
281 size_t n = MIN (*inbytesleft, *outbytesleft);
/* Input remaining beyond OFS yields E2BIG here. */
292 error = ofs < *inbytesleft ? E2BIG : 0;
296 mblen = u8_mbtouc (&uc, in + ofs, n - ofs);
/* u8_mbtoucr() is given all of the remaining input, not just the N bytes
   that fit in the output. */
299 int retval = u8_mbtoucr (&uc, in + ofs, *inbytesleft - ofs);
302 /* There's an actual U+FFFD in the input stream. Carry on. */
/* Translate u8_mbtoucr()'s -1 and -2 results into EILSEQ and EINVAL
   respectively. */
306 error = (retval == -1 ? EILSEQ
307 : retval == -2 ? EINVAL
/* Copy the first OFS input bytes to the output unchanged. */
318 memcpy (*outbufp, *inbufp, ofs);
322 *outbytesleft -= ofs;
/* Common read loop behind u8_istream_read(). Stores up to SIZE bytes in
   BUFFER, using CONVERT (convert_iconv() or convert_utf8()) to turn IS's
   buffered input into output. IS->outbuf holds output that did not fit
   into BUFFER, as well as replacement characters produced by
   substitute_invalid_input_byte(). */
329 read_convert (struct u8_istream *is,
330 int (*convert) (iconv_t converter,
331 char **inbufp, size_t *inbytesleft,
332 char **outbufp, size_t *outbytesleft),
333 char *buffer, size_t size)
335 size_t original_size = size;
/* Deliver bytes already waiting in IS->outbuf before converting more. */
343 size_t n = MIN (size, is->outlen);
345 memcpy (buffer, is->outbuf, n);
348 memmove (is->outbuf, is->outbuf + n, is->outlen);
/* CONVERT is given IS->head and IS->length by address so that it can
   update them. */
359 int error = convert (is->converter,
360 &is->head, &is->length,
368 /* Converted all of the input into output, possibly with space
369 for output left over.
375 substitute_invalid_input_byte (is);
379 /* Incomplete byte sequence at end of input. Read more
384 /* A real error of some kind (ENOMEM?). */
388 /* Ran out of room for output.
389 Convert into outbuf and copy from there instead. */
391 char *outptr = is->outbuf;
392 size_t outleft = sizeof is->outbuf;
394 error = convert (is->converter,
395 &is->head, &is->length,
/* Distance that OUTPTR has moved from the start of IS->outbuf, i.e. the
   number of bytes now held there. */
397 is->outlen = outptr - is->outbuf;
404 substitute_invalid_input_byte (is);
412 /* A real error of some kind (ENOMEM?). */
/* Before refilling, no more than MB_LEN_MAX bytes of input are expected to
   remain unconsumed. */
419 assert (is->length <= MB_LEN_MAX);
420 n_read = fill_buffer (is);
423 if (original_size != size)
425 /* We produced some output so don't report EOF or error yet. */
428 else if (n_read == 0 && is->length != 0)
430 /* Incomplete byte sequence at end of file. */
431 substitute_invalid_input_byte (is);
435 /* Propagate end-of-file or error to caller. */
441 return original_size - size;
444 /* Reads up to SIZE bytes of UTF-8 text from IS into BUFFER. Returns the
445 number of bytes read if successful, 0 at end of file, or -1 if an error
446 occurred before any data could be read. Upon error, sets errno to an
447 appropriate value. */
449 u8_istream_read (struct u8_istream *is, char *buffer, size_t size)
/* Conversion through iconv(). NOTE(review): presumably selected when
   IS->state is S_CONVERT; the case labels are not visible here. */
454 return read_convert (is, convert_iconv, buffer, size);
/* Encoding detection; read_auto() may call back into this function after
   it has settled IS->state. */
457 return read_auto (is, buffer, size);
/* Input handled as UTF-8 by convert_utf8(), without iconv(). */
460 return read_convert (is, convert_utf8, buffer, size);
466 /* Returns the file descriptor underlying IS, that is, the descriptor that
u8_istream_close() closes and that u8_istream_free() leaves open. */
468 u8_istream_fileno (const struct u8_istream *is)
475 /* These functions are probably useful only for white-box testing. */
477 /* Returns true if the encoding of the file being read by IS is not yet
known. */
480 u8_istream_is_auto (const struct u8_istream *is)
/* S_AUTO is the state in which the stream encoding is not yet known. */
482 return is->state == S_AUTO;
485 /* Returns true if the encoding of the file being read by IS has been
486 determined to be UTF-8. Returns false while the encoding is still unknown
and when it is known but is not UTF-8. */
488 u8_istream_is_utf8 (const struct u8_istream *is)
490 return is->state == S_UTF8;