1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2011 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include "striconveh.h"
33 #include "c-strcase.h"
34 #include "c-strcaseeq.h"
37 # define SIZE_MAX ((size_t) -1)
43 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
44 conversion error occurs, we may have to determine the Unicode representation
45 of the inconvertible character. */
/* Open the iconv descriptors used to convert FROM_CODESET to TO_CODESET with
   error handling.  The visible code opens up to three descriptors:
     CD   direct conversion FROM_CODESET -> TO_CODESET,
     CD1  FROM_CODESET -> UTF-8 (the open call sits in the branch taken when
          FROM_CODESET is not "UTF-8"),
     CD2  UTF-8 -> TO_CODESET (the open call sits in the branch taken when
          TO_CODESET is not "UTF-8" / "UTF-8//TRANSLIT").
   When opening CD1 or CD2 fails, the descriptors opened so far are closed
   and the errno of the failed iconv_open is kept in saved_errno.
   NOTE(review): this listing lacks several short lines (return type, braces,
   "else", "#endif", the error returns and, presumably, the stores into
   *CDP); they are left as they are here.  */
48 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
54 /* Avoid glibc-2.1 bug with EUC-KR. */
55 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
56 && !defined _LIBICONV_VERSION
57 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
58 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
65 cd = iconv_open (to_codeset, from_codeset);
67 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
71 cd1 = iconv_open ("UTF-8", from_codeset);
72 if (cd1 == (iconv_t)(-1))
74 int saved_errno = errno;
75 if (cd != (iconv_t)(-1))
/* Close the local descriptor CD that the guard above just tested.  Nothing
   visible has stored into *CDP yet, so reading cdp->cd here would use an
   indeterminate value and leak the descriptor that was really opened.  */
76 iconv_close (cd);
82 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
83 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
84 && !defined __UCLIBC__) \
85 || _LIBICONV_VERSION >= 0x0105
86 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
92 cd2 = iconv_open (to_codeset, "UTF-8");
93 if (cd2 == (iconv_t)(-1))
95 int saved_errno = errno;
96 if (cd1 != (iconv_t)(-1))
98 if (cd != (iconv_t)(-1))
/* Close the descriptors held in *CD in the order cd2, cd1, cd, skipping any
   member that equals (iconv_t)(-1).  When one iconv_close call fails, the
   descriptors that have not been closed yet are still closed, and the errno
   of the failing call is kept in saved_errno.
   NOTE(review): the braces, the errno restores and the return statements are
   missing from this listing; according to the comments below the failure
   paths return -1.  */
112 iconveh_close (const iconveh_t *cd)
114 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
116 /* Return -1, but preserve the errno from iconv_close. */
117 int saved_errno = errno;
118 if (cd->cd1 != (iconv_t)(-1))
119 iconv_close (cd->cd1);
120 if (cd->cd != (iconv_t)(-1))
121 iconv_close (cd->cd);
125 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
127 /* Return -1, but preserve the errno from iconv_close. */
128 int saved_errno = errno;
129 if (cd->cd != (iconv_t)(-1))
130 iconv_close (cd->cd);
134 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
139 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
140 a conversion error, and it returns in *INCREMENTED a boolean telling whether
141 it has incremented the input pointers past the error location. */
142 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
143 /* Irix iconv() inserts a NUL byte if it cannot convert.
144 NetBSD iconv() inserts a question mark if it cannot convert.
145 Only GNU libiconv and GNU libc are known to prefer to fail rather
146 than doing a lossy conversion. */
/* Function variant of iconv_carefully, used when the iconv implementation is
   neither GNU libiconv nor GNU libc (see the preprocessor condition above).
   Works on local copies of *INBUF / *INBYTESLEFT / *OUTBUF / *OUTBYTESLEFT
   and writes the input byte count and the output size back at the end.
   *INCREMENTED is set to whether the input pointer moved past the position
   where the last step started, but only when the last call reported a
   positive count; otherwise it is set to false.
   NOTE(review): the final parameter, some declarations, the call to iconv
   itself and the return statements are missing from this listing.  */
148 iconv_carefully (iconv_t cd,
149 const char **inbuf, size_t *inbytesleft,
150 char **outbuf, size_t *outbytesleft,
153 const char *inptr = *inbuf;
154 const char *inptr_end = inptr + *inbytesleft;
155 char *outptr = *outbuf;
156 size_t outsize = *outbytesleft;
157 const char *inptr_before;
/* Remember where this step starts, so that it can be detected later whether
   the input pointer has advanced.  */
164 inptr_before = inptr;
/* Offer the input in growing pieces (1 byte, 2 bytes, ...); the loop is left
   as soon as the result is anything other than the failure EINVAL.  */
167 for (insize = 1; inptr + insize <= inptr_end; insize++)
170 (ICONV_CONST char **) &inptr, &insize,
172 if (!(res == (size_t)(-1) && errno == EINVAL))
174 /* iconv can eat up a shift sequence but give EINVAL while attempting
175 to convert the first character. E.g. libiconv does this. */
176 if (inptr > inptr_before)
186 *outbytesleft = outsize;
/* Keep going while the last step succeeded with a zero count and input
   remains.  */
189 while (res == 0 && inptr < inptr_end);
192 *inbytesleft = inptr_end - inptr;
/* A positive, non-error result is treated as the error case here.  */
193 if (res != (size_t)(-1) && res > 0)
195 /* iconv() has already incremented INPTR. We cannot go back to a
196 previous INPTR, otherwise the state inside CD would become invalid,
197 if FROM_CODESET is a stateful encoding. So, tell the caller that
198 *INBUF has already been incremented. */
199 *incremented = (inptr > inptr_before);
205 *incremented = false;
/* Macro variant of iconv_carefully: stores false in *INCREMENTED and then
   evaluates to a plain iconv call with the same arguments, casting INBUF to
   (ICONV_CONST char **).  NOTE(review): presumably this is the alternative
   to the function variant for GNU libiconv / GNU libc; the "#else" line is
   not visible in this listing.  */
210 # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
211 (*(incremented) = false, \
212 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
215 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
216 converting one character or one shift sequence. */
/* Single-step counterpart of iconv_carefully.  Works on local copies of the
   caller's buffer pointers and sizes, tries input pieces of growing length
   starting from the same position, and writes the remaining input byte
   count and output size back at the end.  *INCREMENTED is set as in
   iconv_carefully.
   NOTE(review): the final parameter, the call to iconv itself, several
   assignments and the return statements are missing from this listing.  */
218 iconv_carefully_1 (iconv_t cd,
219 const char **inbuf, size_t *inbytesleft,
220 char **outbuf, size_t *outbytesleft,
223 const char *inptr_before = *inbuf;
224 const char *inptr = inptr_before;
225 const char *inptr_end = inptr_before + *inbytesleft;
226 char *outptr = *outbuf;
227 size_t outsize = *outbytesleft;
/* Starts out as "failure", so that an empty input is not mistaken for a
   successful conversion by the checks after the loop.  */
228 size_t res = (size_t)(-1);
/* Every attempt restarts at INPTR_BEFORE with one more byte of input; the
   loop is left as soon as the result is anything other than EINVAL.  */
231 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
233 inptr = inptr_before;
235 (ICONV_CONST char **) &inptr, &insize,
237 if (!(res == (size_t)(-1) && errno == EINVAL))
239 /* iconv can eat up a shift sequence but give EINVAL while attempting
240 to convert the first character. E.g. libiconv does this. */
241 if (inptr > inptr_before)
249 *inbytesleft = inptr_end - inptr;
250 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
251 /* Irix iconv() inserts a NUL byte if it cannot convert.
252 NetBSD iconv() inserts a question mark if it cannot convert.
253 Only GNU libiconv and GNU libc are known to prefer to fail rather
254 than doing a lossy conversion. */
255 if (res != (size_t)(-1) && res > 0)
257 /* iconv() has already incremented INPTR. We cannot go back to a
258 previous INPTR, otherwise the state inside CD would become invalid,
259 if FROM_CODESET is a stateful encoding. So, tell the caller that
260 *INBUF has already been incremented. */
261 *incremented = (inptr > inptr_before);
/* The output size is handed back to the caller under the success check
   below.  */
267 if (res != (size_t)(-1))
270 *outbytesleft = outsize;
272 *incremented = false;
276 /* utf8conv_carefully is like iconv, except that
277 - it converts from UTF-8 to UTF-8,
278 - it stops as soon as it encounters a conversion error, and it returns
279 in *INCREMENTED a boolean telling whether it has incremented the input
280 pointers past the error location,
281 - if one_character_only is true, it stops after converting one character. */
/* UTF-8 to UTF-8 copy with validation.  Each round decodes one character
   from the input with u8_mbtoucr and re-encodes it into the output with
   u8_uctomb.  The loop repeats while ONE_CHARACTER_ONLY is false and input
   remains.  The remaining input and output sizes are written back through
   INBYTESLEFT and OUTBYTESLEFT.
   NOTE(review): the final parameter, the declarations of uc/n/m/res, the
   conditions guarding the error branches and the return statement are
   missing from this listing.  */
284 utf8conv_carefully (bool one_character_only,
285 const char **inbuf, size_t *inbytesleft,
286 char **outbuf, size_t *outbytesleft,
289 const char *inptr = *inbuf;
290 size_t insize = *inbytesleft;
291 char *outptr = *outbuf;
292 size_t outsize = *outbytesleft;
302 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
/* A result of -2 from u8_mbtoucr is mapped to EINVAL, any other failure to
   EILSEQ; u8_mbtouc is then called on the same input to obtain a length.  */
305 errno = (n == -2 ? EINVAL : EILSEQ);
306 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
317 *incremented = false;
320 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
325 *incremented = false;
340 while (!one_character_only && insize > 0);
343 *inbytesleft = insize;
345 *outbytesleft = outsize;
/* Worker behind mem_cd_iconveh and str_cd_iconveh: convert SRCLEN bytes at
   SRC and deliver the result through *RESULTP / *LENGTHP.

   CD is the direct descriptor, CD1 converts to UTF-8 and CD2 converts from
   UTF-8; each of them may be (iconv_t)(-1).  HANDLER selects what happens on
   EILSEQ.  The body also uses EXTRA_ALLOC (bytes kept free behind the
   result) and OFFSETS (filled with output positions when not NULL); their
   declarations are on parameter lines missing from this listing.

   Outline of the visible code:
     1. Work in the caller's buffer when *RESULTP is non-NULL and at least
        tmpbufsize bytes long, otherwise in the on-stack buffer tmp.
     2. Unless CD is (iconv_t)(-1), try the direct conversion, doubling the
        buffer when it runs full.
     3. If that did not succeed, convert through UTF-8 using utf8buf as the
        intermediate buffer.
     4. Finally move the result out of tmpbuf, or shrink the heap block.

   NOTE(review): many short lines (braces, "else", "#endif", gotos/labels,
   returns, some declarations) are missing from this listing.  */
350 mem_cd_iconveh_internal (const char *src, size_t srclen,
351 iconv_t cd, iconv_t cd1, iconv_t cd2,
352 enum iconv_ilseq_handler handler,
355 char **resultp, size_t *lengthp)
357 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
358 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
359 Instead, we have to start afresh from the beginning of SRC. */
360 /* Use a temporary buffer, so that for small strings, a single malloc()
361 call will be sufficient. */
362 # define tmpbufsize 4096
363 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
364 libiconv's UCS-4-INTERNAL encoding. */
365 union { unsigned int align; char buf[tmpbufsize]; } tmp;
366 # define tmpbuf tmp.buf
368 char *initial_result;
372 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
/* Use the caller's buffer only if it is at least as large as tmpbuf.  */
374 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
376 initial_result = *resultp;
377 allocated = *lengthp;
381 initial_result = tmpbuf;
382 allocated = sizeof (tmpbuf);
384 result = initial_result;
386 /* Test whether a direct conversion is possible at all. */
387 if (cd == (iconv_t)(-1))
/* Every offsets[] entry starts out as (size_t)(-1).  */
394 for (i = 0; i < srclen; i++)
395 offsets[i] = (size_t)(-1);
397 last_length = (size_t)(-1);
/* NOTE(review): the comment that starts on the next line is cut off in this
   listing; its closing line is missing.  */
401 /* First, try a direct conversion, and see whether a conversion error
404 const char *inptr = src;
405 size_t insize = srclen;
407 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
408 # if defined _LIBICONV_VERSION \
409 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
411 /* Set to the initial state. */
412 iconv (cd, NULL, NULL, NULL, NULL);
417 char *outptr = result + length;
418 size_t outsize = allocated - extra_alloc - length;
425 if (length != last_length) /* ensure that offset[] be increasing */
427 offsets[inptr - src] = length;
428 last_length = length;
430 res = iconv_carefully_1 (cd,
436 /* Use iconv_carefully instead of iconv here, because:
437 - If TO_CODESET is UTF-8, we can do the error handling in this
438 loop, no need for a second loop,
439 - With iconv() implementations other than GNU libiconv and GNU
440 libc, if we use iconv() in a big swoop, checking for an E2BIG
441 return, we lose the number of irreversible conversions. */
442 res = iconv_carefully (cd,
447 length = outptr - result;
/* Ask for a bigger buffer once more than half of it is in use.  */
448 grow = (length + extra_alloc > allocated / 2);
449 if (res == (size_t)(-1))
453 else if (errno == EINVAL)
455 else if (errno == EILSEQ && handler != iconveh_error)
457 if (cd2 == (iconv_t)(-1))
459 /* TO_CODESET is UTF-8. */
460 /* Error handling can produce up to 1 byte of output. */
461 if (length + 1 + extra_alloc > allocated)
465 allocated = 2 * allocated;
466 if (length + 1 + extra_alloc > allocated)
/* The first growth step moves from the initial buffer to the heap with
   malloc; later steps use realloc.  */
468 if (result == initial_result)
469 memory = (char *) malloc (allocated);
471 memory = (char *) realloc (result, allocated);
474 if (result != initial_result)
479 if (result == initial_result)
480 memcpy (memory, initial_result, length);
484 /* The input is invalid in FROM_CODESET. Eat up one byte
485 and emit a question mark. */
493 result[length] = '?';
501 if (result != initial_result)
503 int saved_errno = errno;
516 allocated = 2 * allocated;
517 if (result == initial_result)
518 memory = (char *) malloc (allocated);
520 memory = (char *) realloc (result, allocated);
523 if (result != initial_result)
528 if (result == initial_result)
529 memcpy (memory, initial_result, length);
535 /* Now get the conversion state back to the initial state.
536 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
537 #if defined _LIBICONV_VERSION \
538 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
542 char *outptr = result + length;
543 size_t outsize = allocated - extra_alloc - length;
546 res = iconv (cd, NULL, NULL, &outptr, &outsize);
547 length = outptr - result;
548 if (res == (size_t)(-1))
554 allocated = 2 * allocated;
555 if (result == initial_result)
556 memory = (char *) malloc (allocated);
558 memory = (char *) realloc (result, allocated);
561 if (result != initial_result)
566 if (result == initial_result)
567 memcpy (memory, initial_result, length);
572 if (result != initial_result)
574 int saved_errno = errno;
586 /* The direct conversion succeeded. */
590 /* The direct conversion failed.
591 Use a conversion through UTF-8. */
596 for (i = 0; i < srclen; i++)
597 offsets[i] = (size_t)(-1);
599 last_length = (size_t)(-1);
/* SLOWLY is set when offsets are requested or when HANDLER is iconveh_error;
   it is passed to utf8conv_carefully as its one_character_only argument.  */
603 const bool slowly = (offsets != NULL || handler == iconveh_error);
604 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
605 char utf8buf[utf8bufsize + 1];
607 const char *in1ptr = src;
608 size_t in1size = srclen;
609 bool do_final_flush1 = true;
610 bool do_final_flush2 = true;
612 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
613 # if defined _LIBICONV_VERSION \
614 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
616 /* Set to the initial state. */
617 if (cd1 != (iconv_t)(-1))
618 iconv (cd1, NULL, NULL, NULL, NULL);
619 if (cd2 != (iconv_t)(-1))
620 iconv (cd2, NULL, NULL, NULL, NULL);
/* Loop until the input is consumed, utf8buf is drained and both final
   flushes have been done.  */
623 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
625 char *out1ptr = utf8buf + utf8len;
626 size_t out1size = utf8bufsize - utf8len;
631 /* Conversion step 1: from FROM_CODESET to UTF-8. */
635 && length != last_length) /* ensure that offset[] be increasing */
637 offsets[in1ptr - src] = length;
638 last_length = length;
640 if (cd1 != (iconv_t)(-1))
643 res1 = iconv_carefully_1 (cd1,
648 res1 = iconv_carefully (cd1,
655 /* FROM_CODESET is UTF-8. */
656 res1 = utf8conv_carefully (slowly,
662 else if (do_final_flush1)
664 /* Now get the conversion state of CD1 back to the initial state.
665 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
666 # if defined _LIBICONV_VERSION \
667 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
669 if (cd1 != (iconv_t)(-1))
670 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
674 do_final_flush1 = false;
682 if (res1 == (size_t)(-1)
683 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
685 if (result != initial_result)
687 int saved_errno = errno;
693 if (res1 == (size_t)(-1)
694 && errno == EILSEQ && handler != iconveh_error)
696 /* The input is invalid in FROM_CODESET. Eat up one byte and
697 emit a question mark. Room for the question mark was allocated
698 at the end of utf8buf. */
710 utf8len = out1ptr - utf8buf;
714 || utf8len > utf8bufsize / 2
715 || (res1 == (size_t)(-1) && errno1 == E2BIG))
717 /* Conversion step 2: from UTF-8 to TO_CODESET. */
718 const char *in2ptr = utf8buf;
719 size_t in2size = utf8len;
722 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
724 char *out2ptr = result + length;
725 size_t out2size = allocated - extra_alloc - length;
732 if (cd2 != (iconv_t)(-1))
733 res2 = iconv_carefully (cd2,
738 /* TO_CODESET is UTF-8. */
739 res2 = utf8conv_carefully (false,
744 else /* in1size == 0 && !do_final_flush1
745 && in2size == 0 && do_final_flush2 */
747 /* Now get the conversion state of CD2 back to the initial
748 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
749 # if defined _LIBICONV_VERSION \
750 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
752 if (cd2 != (iconv_t)(-1))
753 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
757 do_final_flush2 = false;
761 length = out2ptr - result;
762 grow = (length + extra_alloc > allocated / 2);
763 if (res2 == (size_t)(-1))
767 else if (errno == EINVAL)
769 else if (errno == EILSEQ && handler != iconveh_error)
771 /* Error handling can produce up to 10 bytes of ASCII
772 output. But TO_CODESET may be UCS-2, UTF-16 or
773 UCS-4, so use CD2 here as well. */
783 if (u8_prev (&uc, (const uint8_t *) in2ptr,
784 (const uint8_t *) utf8buf)
793 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
799 if (handler == iconveh_escape_sequence)
801 static char hex[16] = "0123456789ABCDEF";
/* Build a backslash escape in scratchbuf: 'u' or 'U' followed by hex digits
   of UC, most significant nibble first.  */
803 scratchbuf[scratchlen++] = '\\';
805 scratchbuf[scratchlen++] = 'u';
808 scratchbuf[scratchlen++] = 'U';
809 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
810 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
811 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
812 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
814 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
815 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
816 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
817 scratchbuf[scratchlen++] = hex[uc & 15];
827 if (cd2 != (iconv_t)(-1))
829 (ICONV_CONST char **) &inptr, &insize,
830 &out2ptr, &out2size);
833 /* TO_CODESET is UTF-8. */
834 if (out2size >= insize)
836 memcpy (out2ptr, inptr, insize);
849 length = out2ptr - result;
/* The replacement did not fit: double the buffer and try once more.  */
850 if (res == (size_t)(-1) && errno == E2BIG)
854 allocated = 2 * allocated;
855 if (length + 1 + extra_alloc > allocated)
857 if (result == initial_result)
858 memory = (char *) malloc (allocated);
860 memory = (char *) realloc (result, allocated);
863 if (result != initial_result)
868 if (result == initial_result)
869 memcpy (memory, initial_result, length);
873 out2ptr = result + length;
874 out2size = allocated - extra_alloc - length;
875 if (cd2 != (iconv_t)(-1))
877 (ICONV_CONST char **) &inptr,
879 &out2ptr, &out2size);
882 /* TO_CODESET is UTF-8. */
883 if (!(out2size >= insize))
885 memcpy (out2ptr, inptr, insize);
892 length = out2ptr - result;
894 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
895 /* Irix iconv() inserts a NUL byte if it cannot convert.
896 NetBSD iconv() inserts a question mark if it cannot
898 Only GNU libiconv and GNU libc are known to prefer
899 to fail rather than doing a lossy conversion. */
900 if (res != (size_t)(-1) && res > 0)
906 if (res == (size_t)(-1))
908 /* Failure converting the ASCII replacement. */
909 if (result != initial_result)
911 int saved_errno = errno;
920 if (result != initial_result)
922 int saved_errno = errno;
930 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
936 allocated = 2 * allocated;
937 if (result == initial_result)
938 memory = (char *) malloc (allocated);
940 memory = (char *) realloc (result, allocated);
943 if (result != initial_result)
948 if (result == initial_result)
949 memcpy (memory, initial_result, length);
954 /* Move the remaining bytes to the beginning of utf8buf. */
956 memmove (utf8buf, in2ptr, in2size);
960 if (res1 == (size_t)(-1))
962 if (errno1 == EINVAL)
964 else if (errno1 == EILSEQ)
966 if (result != initial_result)
977 /* Now the final memory allocation. */
/* The result still lives in the on-stack buffer: it is copied out after a
   destination (caller buffer or fresh malloc) has been chosen.  */
978 if (result == tmpbuf)
980 size_t memsize = length + extra_alloc;
982 if (*resultp != NULL && *lengthp >= memsize)
988 memory = (char *) malloc (memsize > 0 ? memsize : 1);
997 memcpy (result, tmpbuf, length);
999 else if (result != *resultp && length + extra_alloc < allocated)
1001 /* Shrink the allocated memory if possible. */
1002 size_t memsize = length + extra_alloc;
1005 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
/* Convert SRCLEN bytes at SRC using the descriptors stored in *CD.
   Thin wrapper: forwards to mem_cd_iconveh_internal with cd->cd, cd->cd1 and
   cd->cd2, passing 0 as the argument after HANDLER (the extra allocation)
   and handing OFFSETS, RESULTP and LENGTHP through unchanged.
   NOTE(review): the return type and the OFFSETS parameter line are missing
   from this listing.  */
1017 mem_cd_iconveh (const char *src, size_t srclen,
1018 const iconveh_t *cd,
1019 enum iconv_ilseq_handler handler,
1021 char **resultp, size_t *lengthp)
1023 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1024 handler, 0, offsets, resultp, lengthp);
/* Convert the NUL-terminated string SRC using the descriptors stored in *CD.
   Calls mem_cd_iconveh_internal on strlen (SRC) bytes with 1 as the extra
   allocation and NULL offsets, starting from a NULL result buffer, then
   stores the terminating NUL at result[length].
   NOTE(review): the return type, the declaration of length, the failure
   check and the return statements are missing from this listing.  */
1028 str_cd_iconveh (const char *src,
1029 const iconveh_t *cd,
1030 enum iconv_ilseq_handler handler)
1032 /* For most encodings, a trailing NUL byte in the input will be converted
1033 to a trailing NUL byte in the output. But not for UTF-7. So that this
1034 function is usable for UTF-7, we have to exclude the NUL byte from the
1035 conversion and add it by hand afterwards. */
1036 char *result = NULL;
/* The extra allocation of 1 reserves the byte for the NUL written below.  */
1038 int retval = mem_cd_iconveh_internal (src, strlen (src),
1039 cd->cd, cd->cd1, cd->cd2, handler, 1,
1040 NULL, &result, &length);
/* errno is saved and restored around a step whose line is missing from this
   listing.  */
1046 int saved_errno = errno;
1048 errno = saved_errno;
1053 /* Add the terminating NUL byte. */
1054 result[length] = '\0';
/* Convert SRCLEN bytes at SRC from FROM_CODESET to TO_CODESET.
   Visible cases:
     - a "nothing to convert" case (its condition is missing here),
     - no offsets requested and both codeset names equal ignoring case: the
       input is copied with memcpy into the caller's buffer if it is large
       enough, otherwise into a freshly malloc'ed one,
     - otherwise: iconveh_open, mem_cd_iconveh, iconveh_close, preserving
       the errno of whichever step failed.
   NOTE(review): the return type, the OFFSETS parameter line, braces and
   return statements are missing from this listing.  */
1062 mem_iconveh (const char *src, size_t srclen,
1063 const char *from_codeset, const char *to_codeset,
1064 enum iconv_ilseq_handler handler,
1066 char **resultp, size_t *lengthp)
1070 /* Nothing to convert. */
1074 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1078 if (*resultp != NULL && *lengthp >= srclen)
1082 result = (char *) malloc (srclen);
1089 memcpy (result, src, srclen);
1102 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1107 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1112 /* Close cd, but preserve the errno from mem_cd_iconveh. */
1113 int saved_errno = errno;
1114 iconveh_close (&cd);
1115 errno = saved_errno;
1119 if (iconveh_close (&cd) < 0)
1121 /* Return -1, but free the allocated memory, and while doing
1122 that, preserve the errno from iconveh_close. */
1123 int saved_errno = errno;
/* Only a block allocated during this call is eligible here, never the
   caller's own buffer.  */
1124 if (result != *resultp && result != NULL)
1126 errno = saved_errno;
1134 /* This is a different error code than if iconv_open existed but didn't
1135 support from_codeset and to_codeset, so that the caller can emit
1136 an error message such as
1137 "iconv() is not supported. Installing GNU libiconv and
1138 then reinstalling this package would fix this." */
/* Convert the NUL-terminated string SRC from FROM_CODESET to TO_CODESET.
   An empty SRC, or codeset names that are equal ignoring case, are handled
   by duplicating SRC with strdup.  Otherwise: iconveh_open, str_cd_iconveh,
   iconveh_close, preserving the errno of whichever step failed.
   NOTE(review): the return type, braces, the free call and the return
   statements are missing from this listing.  */
1146 str_iconveh (const char *src,
1147 const char *from_codeset, const char *to_codeset,
1148 enum iconv_ilseq_handler handler)
1150 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1152 char *result = strdup (src);
1164 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1167 result = str_cd_iconveh (src, &cd, handler);
1171 /* Close cd, but preserve the errno from str_cd_iconveh. */
1172 int saved_errno = errno;
1173 iconveh_close (&cd);
1174 errno = saved_errno;
1178 if (iconveh_close (&cd) < 0)
1180 /* Return NULL, but free the allocated memory, and while doing
1181 that, preserve the errno from iconveh_close. */
1182 int saved_errno = errno;
1184 errno = saved_errno;
1190 /* This is a different error code than if iconv_open existed but didn't
1191 support from_codeset and to_codeset, so that the caller can emit
1192 an error message such as
1193 "iconv() is not supported. Installing GNU libiconv and
1194 then reinstalling this package would fix this." */