From c6ad67bb80efa455e52904f98af0c8c4ec4f36ee Mon Sep 17 00:00:00 2001
From: Bruno Haible
Date: Tue, 16 Jan 2007 03:47:23 +0000
Subject: [PATCH] New module 'striconveh'.

---
 ChangeLog          |   7 +
 MODULES.html.sh    |   1 +
 lib/striconveh.c   | 896 +++++++++++++++++++++++++++++++++++++++++++++
 lib/striconveh.h   |  99 +++++
 modules/striconveh |  38 ++
 5 files changed, 1041 insertions(+)
 create mode 100644 lib/striconveh.c
 create mode 100644 lib/striconveh.h
 create mode 100644 modules/striconveh

diff --git a/ChangeLog b/ChangeLog
index 1e0ca0d389..ed3916dcb0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2007-01-15  Bruno Haible
+
+	* modules/striconveh: New file.
+	* lib/striconveh.h: New file.
+	* lib/striconveh.c: New file.
+	* MODULES.html.sh (Internationalization functions): Add striconveh.
+
 2007-01-15  Bruno Haible
 
 	* lib/striconv.c (str_cd_iconv): Use the first algorithm if iconv is
diff --git a/MODULES.html.sh b/MODULES.html.sh
index 68a3152ba8..d1a66844a3 100755
--- a/MODULES.html.sh
+++ b/MODULES.html.sh
@@ -2142,6 +2142,7 @@ func_all_modules ()
   func_module iconv
   func_module striconv
   func_module xstriconv
+  func_module striconveh
   func_module iconvme
   func_module localcharset
   func_module hard-locale
diff --git a/lib/striconveh.c b/lib/striconveh.c
new file mode 100644
index 0000000000..9e916e656c
--- /dev/null
+++ b/lib/striconveh.c
@@ -0,0 +1,896 @@
+/* Character set conversion with error handling.
+   Copyright (C) 2001-2007 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "striconveh.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_ICONV
+# include <iconv.h>
+# include "utf8-ucs4-safe.h"
+# include "ucs4-utf8.h"
+# include "unistr.h"
+#endif
+
+#include "strdup.h"
+#include "c-strcase.h"
+
+#ifndef SIZE_MAX
+# define SIZE_MAX ((size_t) -1)
+#endif
+
+
+#if HAVE_ICONV
+
+/* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
+   error occurs, we may have to determine the Unicode representation of the
+   inconvertible character.  */
+
+/* iconv_carefully is like iconv, except that it stops as soon as it encounters
+   a conversion error, and it returns in *INCREMENTED a boolean telling whether
+   it has incremented the input pointers past the error location.  */
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+/* Irix iconv() inserts a NUL byte if it cannot convert.
+   NetBSD iconv() inserts a question mark if it cannot convert.
+   Only GNU libiconv and GNU libc are known to prefer to fail rather
+   than doing a lossy conversion.  */
+static size_t
+iconv_carefully (iconv_t cd,
+                 const char **inbuf, size_t *inbytesleft,
+                 char **outbuf, size_t *outbytesleft,
+                 bool *incremented)
+{
+  const char *inptr = *inbuf;
+  const char *inptr_end = inptr + *inbytesleft;
+  char *outptr = *outbuf;
+  size_t outsize = *outbytesleft;
+  const char *inptr_before;
+  size_t res;
+
+  do
+    {
+      size_t insize;
+
+      inptr_before = inptr;
+      res = (size_t)(-1);
+
+      for (insize = 1; inptr + insize <= inptr_end; insize++)
+        {
+          res = iconv (cd,
+                       (ICONV_CONST char **) &inptr, &insize,
+                       &outptr, &outsize);
+          if (!(res == (size_t)(-1) && errno == EINVAL))
+            break;
+          /* We expect that no input bytes have been consumed so far.  */
+          if (inptr != inptr_before)
+            abort ();
+        }
+
+      if (res == 0)
+        {
+          *outbuf = outptr;
+          *outbytesleft = outsize;
+        }
+    }
+  while (res == 0 && inptr < inptr_end);
+
+  *inbuf = inptr;
+  *inbytesleft = inptr_end - inptr;
+  if (res != (size_t)(-1) && res > 0)
+    {
+      /* iconv() has already incremented INPTR.  We cannot go back to a
+         previous INPTR, otherwise the state inside CD would become invalid,
+         if FROM_CODESET is a stateful encoding.  So, tell the caller that
+         *INBUF has already been incremented.  */
+      *incremented = (inptr > inptr_before);
+      errno = EILSEQ;
+      return (size_t)(-1);
+    }
+  else
+    {
+      *incremented = false;
+      return res;
+    }
+}
+# else
+# define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
+    (*(incremented) = false, \
+     iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
+# endif
+
+/* Convert SRC[0..SRCLEN-1] using CD.  When a character cannot be converted
+   and HANDLER is not iconveh_error, apply HANDLER, going through UTF-8 with
+   CD1 and CD2 unless TO_CODESET is UTF-8 (CD2 == (iconv_t)(-1)).
+   EXTRA_ALLOC bytes are reserved, unwritten, after the converted bytes.
+   Store a freshly malloced block in *RESULTP and the converted length in
+   *LENGTHP (either may be NULL).  Return 0, or -1 with errno set.  */
+static int
+mem_cd_iconveh_internal (const char *src, size_t srclen,
+                         iconv_t cd, iconv_t cd1, iconv_t cd2,
+                         enum iconv_ilseq_handler handler,
+                         size_t extra_alloc,
+                         char **resultp, size_t *lengthp)
+{
+  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
+     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
+     Instead, we have to start afresh from the beginning of SRC.  */
+  /* Use a temporary buffer, so that for small strings, a single malloc()
+     call will be sufficient.  */
+# define tmpbufsize 4096
+  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
+     libiconv's UCS-4-INTERNAL encoding.  */
+  union { unsigned int align; char buf[tmpbufsize]; } tmp;
+# define tmpbuf tmp.buf
+
+  char *result = tmpbuf;
+  size_t allocated = sizeof (tmpbuf);
+  size_t length = 0;
+
+  /* First, try a direct conversion, and see whether a conversion error
+     occurs at all.  */
+  {
+    const char *inptr = src;
+    size_t insize = srclen;
+
+    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    /* Set to the initial state.  */
+    iconv (cd, NULL, NULL, NULL, NULL);
+# endif
+
+    while (insize > 0)
+      {
+        char *outptr = result + length;
+        size_t outsize = allocated - extra_alloc - length;
+        bool incremented;
+        size_t res;
+        bool grow;
+
+        /* Use iconv_carefully instead of iconv here, because:
+           - If TO_CODESET is UTF-8, we can do the error handling in this loop,
+             no need for a second loop,
+           - With iconv() implementations other than GNU libiconv and GNU libc,
+             if we use iconv() in a big swoop, checking for an E2BIG return,
+             we lose the number of irreversible conversions.  */
+        res = iconv_carefully (cd,
+                               &inptr, &insize,
+                               &outptr, &outsize,
+                               &incremented);
+
+        length = outptr - result;
+        grow = (length + extra_alloc > allocated / 2);
+        if (res == (size_t)(-1))
+          {
+            if (errno == E2BIG)
+              grow = true;
+            else if (errno == EINVAL)
+              break;
+            else if (errno == EILSEQ && handler != iconveh_error)
+              {
+                if (cd2 == (iconv_t)(-1))
+                  {
+                    /* TO_CODESET is UTF-8.  */
+                    /* Error handling can produce up to 1 byte of output.  */
+                    if (length + 1 + extra_alloc > allocated)
+                      {
+                        char *memory;
+
+                        allocated = 2 * allocated;
+                        if (length + 1 + extra_alloc > allocated)
+                          abort ();
+                        if (result == tmpbuf)
+                          memory = (char *) malloc (allocated);
+                        else
+                          memory = (char *) realloc (result, allocated);
+                        if (memory == NULL)
+                          {
+                            if (result != tmpbuf)
+                              free (result);
+                            errno = ENOMEM;
+                            return -1;
+                          }
+                        if (result == tmpbuf)
+                          memcpy (memory, tmpbuf, length);
+                        result = memory;
+                        grow = false;
+                      }
+                    /* The input is invalid in FROM_CODESET.  Eat up one byte
+                       and emit a question mark.  */
+                    if (!incremented)
+                      {
+                        if (insize == 0)
+                          abort ();
+                        inptr++;
+                        insize--;
+                      }
+                    result[length] = '?';
+                    length++;
+                  }
+                else
+                  goto indirectly;
+              }
+            else
+              {
+                if (result != tmpbuf)
+                  {
+                    int saved_errno = errno;
+                    free (result);
+                    errno = saved_errno;
+                  }
+                return -1;
+              }
+          }
+        if (insize == 0)
+          break;
+        if (grow)
+          {
+            char *memory;
+
+            allocated = 2 * allocated;
+            if (result == tmpbuf)
+              memory = (char *) malloc (allocated);
+            else
+              memory = (char *) realloc (result, allocated);
+            if (memory == NULL)
+              {
+                if (result != tmpbuf)
+                  free (result);
+                errno = ENOMEM;
+                return -1;
+              }
+            if (result == tmpbuf)
+              memcpy (memory, tmpbuf, length);
+            result = memory;
+          }
+      }
+  }
+
+  /* Now get the conversion state back to the initial state.
+     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+#if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+  for (;;)
+    {
+      char *outptr = result + length;
+      size_t outsize = allocated - extra_alloc - length;
+      size_t res;
+
+      res = iconv (cd, NULL, NULL, &outptr, &outsize);
+      length = outptr - result;
+      if (res == (size_t)(-1))
+        {
+          if (errno == E2BIG)
+            {
+              char *memory;
+
+              allocated = 2 * allocated;
+              if (result == tmpbuf)
+                memory = (char *) malloc (allocated);
+              else
+                memory = (char *) realloc (result, allocated);
+              if (memory == NULL)
+                {
+                  if (result != tmpbuf)
+                    free (result);
+                  errno = ENOMEM;
+                  return -1;
+                }
+              if (result == tmpbuf)
+                memcpy (memory, tmpbuf, length);
+              result = memory;
+            }
+          else
+            {
+              if (result != tmpbuf)
+                {
+                  int saved_errno = errno;
+                  free (result);
+                  errno = saved_errno;
+                }
+              return -1;
+            }
+        }
+      else
+        break;
+    }
+#endif
+
+  /* The direct conversion succeeded.  */
+  goto done;
+
+ indirectly:
+  /* The direct conversion failed, handler != iconveh_error,
+     and cd2 != (iconv_t)(-1).
+     Use a conversion through UTF-8.  */
+  length = 0;
+  {
+# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
+    char utf8buf[utf8bufsize + 1];
+    size_t utf8len = 0;
+    const char *in1ptr = src;
+    size_t in1size = srclen;
+    bool do_final_flush1 = true;
+    bool do_final_flush2 = true;
+
+    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    /* Set to the initial state.  */
+    if (cd1 != (iconv_t)(-1))
+      iconv (cd1, NULL, NULL, NULL, NULL);
+    iconv (cd2, NULL, NULL, NULL, NULL);
+# endif
+
+    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
+      {
+        char *out1ptr = utf8buf + utf8len;
+        size_t out1size = utf8bufsize - utf8len;
+        bool incremented1;
+        size_t res1;
+        int errno1;
+
+        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
+        if (in1size > 0)
+          {
+            if (cd1 != (iconv_t)(-1))
+              res1 = iconv_carefully (cd1,
+                                      &in1ptr, &in1size,
+                                      &out1ptr, &out1size,
+                                      &incremented1);
+            else
+              {
+                /* FROM_CODESET is UTF-8.  */
+                res1 = 0;
+                do
+                  {
+                    ucs4_t uc;
+                    int n;
+                    int m;
+
+                    n = u8_mbtouc_safe (&uc, (const uint8_t *) in1ptr, in1size);
+                    if (uc == 0xfffd
+                        && !(n >= 3
+                             && (uint8_t)in1ptr[0] == 0xEF
+                             && (uint8_t)in1ptr[1] == 0xBF
+                             && (uint8_t)in1ptr[2] == 0xBD))
+                      {
+                        in1ptr += n;
+                        in1size -= n;
+                        errno = EILSEQ;
+                        res1 = (size_t)(-1);
+                        incremented1 = true;
+                        break;
+                      }
+                    if (out1size == 0)
+                      {
+                        errno = E2BIG;
+                        res1 = (size_t)(-1);
+                        incremented1 = false;
+                        break;
+                      }
+                    m = u8_uctomb ((uint8_t *) out1ptr, uc, out1size);
+                    if (m == -2)
+                      {
+                        errno = E2BIG;
+                        res1 = (size_t)(-1);
+                        incremented1 = false;
+                        break;
+                      }
+                    in1ptr += n;
+                    in1size -= n;
+                    if (m == -1)
+                      {
+                        errno = EILSEQ;
+                        res1 = (size_t)(-1);
+                        incremented1 = true;
+                        break;
+                      }
+                    out1ptr += m;
+                    out1size -= m;
+                  }
+                while (in1size > 0);
+              }
+          }
+        else if (do_final_flush1)
+          {
+            /* Now get the conversion state of CD1 back to the initial state.
+               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+            if (cd1 != (iconv_t)(-1))
+              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
+            else
+# endif
+              res1 = 0;
+            do_final_flush1 = false;
+            incremented1 = true;
+          }
+        else
+          {
+            res1 = 0;
+            incremented1 = true;
+          }
+        if (res1 == (size_t)(-1)
+            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
+          {
+            if (result != tmpbuf)
+              {
+                int saved_errno = errno;
+                free (result);
+                errno = saved_errno;
+              }
+            return -1;
+          }
+        if (res1 == (size_t)(-1)
+            && errno == EILSEQ && handler != iconveh_error)
+          {
+            /* The input is invalid in FROM_CODESET.  Eat up one byte and
+               emit a question mark.  Room for the question mark was allocated
+               at the end of utf8buf.  */
+            if (!incremented1)
+              {
+                if (in1size == 0)
+                  abort ();
+                in1ptr++;
+                in1size--;
+              }
+            /* Append at OUT1PTR, after the bytes converted so far; UTF8LEN
+               is recomputed from OUT1PTR below.  The error is now handled,
+               so clear RES1 to keep it from being reported as EILSEQ.  */
+            *out1ptr++ = '?';
+            res1 = 0;
+          }
+        errno1 = errno;
+        utf8len = out1ptr - utf8buf;
+
+        if (in1size == 0
+            || utf8len > utf8bufsize / 2
+            || (res1 == (size_t)(-1) && errno1 == E2BIG))
+          {
+            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
+            const char *in2ptr = utf8buf;
+            size_t in2size = utf8len;
+
+            while (in2size > 0
+                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
+              {
+                char *out2ptr = result + length;
+                size_t out2size = allocated - extra_alloc - length;
+                bool incremented2;
+                size_t res2;
+                bool grow;
+
+                if (in2size > 0)
+                  res2 = iconv_carefully (cd2,
+                                          &in2ptr, &in2size,
+                                          &out2ptr, &out2size,
+                                          &incremented2);
+                else /* in1size == 0 && !do_final_flush1
+                        && in2size == 0 && do_final_flush2 */
+                  {
+                    /* Now get the conversion state of CD2 back to the initial
+                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+                    res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
+# else
+                    res2 = 0;
+# endif
+                    do_final_flush2 = false;
+                    incremented2 = true;
+                  }
+
+                length = out2ptr - result;
+                grow = (length + extra_alloc > allocated / 2);
+                if (res2 == (size_t)(-1))
+                  {
+                    if (errno == E2BIG)
+                      grow = true;
+                    else if (errno == EINVAL)
+                      break;
+                    else if (errno == EILSEQ && handler != iconveh_error)
+                      {
+                        /* Error handling can produce up to 10 bytes of ASCII
+                           output.  But TO_CODESET may be UCS-2, UTF-16 or
+                           UCS-4, so use CD2 here as well.  */
+                        char scratchbuf[10];
+                        size_t scratchlen;
+                        ucs4_t uc;
+                        const char *inptr;
+                        size_t insize;
+                        size_t res;
+
+                        if (incremented2)
+                          {
+                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
+                                         (const uint8_t *) utf8buf)
+                                == NULL)
+                              abort ();
+                          }
+                        else
+                          {
+                            int n;
+                            if (in2size == 0)
+                              abort ();
+                            n = u8_mbtouc (&uc, (const uint8_t *) in2ptr,
+                                           in2size);
+                            in2ptr += n;
+                            in2size -= n;
+                          }
+
+                        if (handler == iconveh_escape_sequence)
+                          {
+                            static char hex[16] = "0123456789ABCDEF";
+                            scratchlen = 0;
+                            scratchbuf[scratchlen++] = '\\';
+                            if (uc < 0x10000)
+                              scratchbuf[scratchlen++] = 'u';
+                            else
+                              {
+                                scratchbuf[scratchlen++] = 'U';
+                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
+                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
+                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
+                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
+                              }
+                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
+                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
+                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
+                            scratchbuf[scratchlen++] = hex[uc & 15];
+                          }
+                        else
+                          {
+                            scratchbuf[0] = '?';
+                            scratchlen = 1;
+                          }
+
+                        inptr = scratchbuf;
+                        insize = scratchlen;
+                        res = iconv (cd2,
+                                     (ICONV_CONST char **) &inptr, &insize,
+                                     &out2ptr, &out2size);
+                        length = out2ptr - result;
+                        if (res == (size_t)(-1) && errno == E2BIG)
+                          {
+                            char *memory;
+
+                            allocated = 2 * allocated;
+                            if (length + 1 + extra_alloc > allocated)
+                              abort ();
+                            if (result == tmpbuf)
+                              memory = (char *) malloc (allocated);
+                            else
+                              memory = (char *) realloc (result, allocated);
+                            if (memory == NULL)
+                              {
+                                if (result != tmpbuf)
+                                  free (result);
+                                errno = ENOMEM;
+                                return -1;
+                              }
+                            if (result == tmpbuf)
+                              memcpy (memory, tmpbuf, length);
+                            result = memory;
+                            grow = false;
+
+                            out2ptr = result + length;
+                            out2size = allocated - extra_alloc - length;
+                            res = iconv (cd2,
+                                         (ICONV_CONST char **) &inptr, &insize,
+                                         &out2ptr, &out2size);
+                            length = out2ptr - result;
+                          }
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+                        /* Irix iconv() inserts a NUL byte if it cannot convert.
+                           NetBSD iconv() inserts a question mark if it cannot
+                           convert.
+                           Only GNU libiconv and GNU libc are known to prefer
+                           to fail rather than doing a lossy conversion.  */
+                        if (res != (size_t)(-1) && res > 0)
+                          {
+                            errno = EILSEQ;
+                            res = (size_t)(-1);
+                          }
+# endif
+                        if (res == (size_t)(-1))
+                          {
+                            /* Failure converting the ASCII replacement.  */
+                            if (result != tmpbuf)
+                              {
+                                int saved_errno = errno;
+                                free (result);
+                                errno = saved_errno;
+                              }
+                            return -1;
+                          }
+                      }
+                    else
+                      {
+                        if (result != tmpbuf)
+                          {
+                            int saved_errno = errno;
+                            free (result);
+                            errno = saved_errno;
+                          }
+                        return -1;
+                      }
+                  }
+                if (!(in2size > 0
+                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
+                  break;
+                if (grow)
+                  {
+                    char *memory;
+
+                    allocated = 2 * allocated;
+                    if (result == tmpbuf)
+                      memory = (char *) malloc (allocated);
+                    else
+                      memory = (char *) realloc (result, allocated);
+                    if (memory == NULL)
+                      {
+                        if (result != tmpbuf)
+                          free (result);
+                        errno = ENOMEM;
+                        return -1;
+                      }
+                    if (result == tmpbuf)
+                      memcpy (memory, tmpbuf, length);
+                    result = memory;
+                  }
+              }
+
+            /* Move the remaining bytes to the beginning of utf8buf.  */
+            if (in2size > 0)
+              memmove (utf8buf, in2ptr, in2size);
+            utf8len = in2size;
+          }
+
+        if (res1 == (size_t)(-1))
+          {
+            if (errno1 == EINVAL)
+              in1size = 0;
+            else if (errno1 == EILSEQ)
+              {
+                if (result != tmpbuf)
+                  free (result);
+                errno = errno1;
+                return -1;
+              }
+          }
+      }
+# undef utf8bufsize
+  }
+
+ done:
+  /* Now the final memory allocation.  */
+  if (resultp != NULL)
+    {
+      if (result == tmpbuf)
+        {
+          char *memory;
+
+          /* malloc (0) may return NULL on success; always request at least
+             one byte so that an empty result is not reported as ENOMEM.  */
+          memory = (char *) malloc (length + extra_alloc > 0
+                                    ? length + extra_alloc : 1);
+          if (memory != NULL)
+            {
+              memcpy (memory, tmpbuf, length);
+              result = memory;
+            }
+          else
+            {
+              errno = ENOMEM;
+              return -1;
+            }
+        }
+      else if (length + extra_alloc < allocated)
+        {
+          /* Shrink the allocated memory if possible.  */
+          char *memory;
+
+          /* Likewise, avoid realloc (..., 0), which may free RESULT.  */
+          memory = (char *) realloc (result, length + extra_alloc > 0
+                                             ? length + extra_alloc : 1);
+          if (memory != NULL)
+            result = memory;
+        }
+      *resultp = result;
+    }
+  else
+    {
+      if (result != tmpbuf)
+        free (result);
+    }
+  if (lengthp != NULL)
+    *lengthp = length;
+  return 0;
+# undef tmpbuf
+# undef tmpbufsize
+}
+
+int
+mem_cd_iconveh (const char *src, size_t srclen,
+                iconv_t cd, iconv_t cd1, iconv_t cd2,
+                enum iconv_ilseq_handler handler,
+                char **resultp, size_t *lengthp)
+{
+  return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
+                                  resultp, lengthp);
+}
+
+char *
+str_cd_iconveh (const char *src,
+                iconv_t cd, iconv_t cd1, iconv_t cd2,
+                enum iconv_ilseq_handler handler)
+{
+  /* For most encodings, a trailing NUL byte in the input will be converted
+     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
+     function is usable for UTF-7, we have to exclude the NUL byte from the
+     conversion and add it by hand afterwards.  */
+  char *result = NULL;
+  size_t length;
+  int retval = mem_cd_iconveh_internal (src, strlen (src),
+                                        cd, cd1, cd2, handler, 1,
+                                        &result, &length);
+
+  if (retval < 0)
+    {
+      if (result != NULL)
+        {
+          int saved_errno = errno;
+          free (result);
+          errno = saved_errno;
+        }
+      return NULL;
+    }
+
+  /* Add the terminating NUL byte.  */
+  result[length] = '\0';
+
+  return result;
+}
+
+#endif
+
+char *
+str_iconveh (const char *src,
+             const char *from_codeset, const char *to_codeset,
+             enum iconv_ilseq_handler handler)
+{
+  if (c_strcasecmp (from_codeset, to_codeset) == 0)
+    return strdup (src);
+  else
+    {
+#if HAVE_ICONV
+      iconv_t cd;
+      iconv_t cd1;
+      iconv_t cd2;
+      char *result;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (c_strcasecmp (from_codeset, "EUC-KR") == 0
+          || c_strcasecmp (to_codeset, "EUC-KR") == 0)
+        {
+          errno = EINVAL;
+          return NULL;
+        }
+# endif
+
+      cd = iconv_open (to_codeset, from_codeset);
+      if (cd == (iconv_t)(-1))
+        return NULL;
+
+      if (c_strcasecmp (from_codeset, "UTF-8") == 0)
+        cd1 = (iconv_t)(-1);
+      else
+        {
+          cd1 = iconv_open ("UTF-8", from_codeset);
+          if (cd1 == (iconv_t)(-1))
+            {
+              int saved_errno = errno;
+              iconv_close (cd);
+              errno = saved_errno;
+              return NULL;
+            }
+        }
+
+      if (c_strcasecmp (to_codeset, "UTF-8") == 0)
+        cd2 = (iconv_t)(-1);
+      else
+        {
+          cd2 = iconv_open (to_codeset, "UTF-8");
+          if (cd2 == (iconv_t)(-1))
+            {
+              int saved_errno = errno;
+              if (cd1 != (iconv_t)(-1))
+                iconv_close (cd1);
+              iconv_close (cd);
+              errno = saved_errno;
+              return NULL;
+            }
+        }
+
+      result = str_cd_iconveh (src, cd, cd1, cd2, handler);
+
+      if (result == NULL)
+        {
+          /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconveh.  */
+          int saved_errno = errno;
+          if (cd2 != (iconv_t)(-1))
+            iconv_close (cd2);
+          if (cd1 != (iconv_t)(-1))
+            iconv_close (cd1);
+          iconv_close (cd);
+          errno = saved_errno;
+        }
+      else
+        {
+          if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
+            {
+              /* Return NULL, but free the allocated memory, and while doing
+                 that, preserve the errno from iconv_close.  */
+              int saved_errno = errno;
+              if (cd1 != (iconv_t)(-1))
+                iconv_close (cd1);
+              iconv_close (cd);
+              free (result);
+              errno = saved_errno;
+              return NULL;
+            }
+          if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
+            {
+              /* Return NULL, but free the allocated memory, and while doing
+                 that, preserve the errno from iconv_close.  */
+              int saved_errno = errno;
+              iconv_close (cd);
+              free (result);
+              errno = saved_errno;
+              return NULL;
+            }
+          if (iconv_close (cd) < 0)
+            {
+              /* Return NULL, but free the allocated memory, and while doing
+                 that, preserve the errno from iconv_close.  */
+              int saved_errno = errno;
+              free (result);
+              errno = saved_errno;
+              return NULL;
+            }
+        }
+      return result;
+#else
+      /* This is a different error code than if iconv_open existed but didn't
+         support from_codeset and to_codeset, so that the caller can emit
+         an error message such as
+           "iconv() is not supported. Installing GNU libiconv and
+            then reinstalling this package would fix this."  */
+      errno = ENOSYS;
+      return NULL;
+#endif
+    }
+}
diff --git a/lib/striconveh.h b/lib/striconveh.h
new file mode 100644
index 0000000000..b528e5152f
--- /dev/null
+++ b/lib/striconveh.h
@@ -0,0 +1,99 @@
+/* Character set conversion with error handling.
+   Copyright (C) 2001-2007 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef _STRICONVEH_H
+#define _STRICONVEH_H
+
+#include <stddef.h>
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Handling of unconvertible characters.  */
+enum iconv_ilseq_handler
+{
+  iconveh_error,                /* return and set errno = EILSEQ */
+  iconveh_question_mark,        /* use one '?' per unconvertible character */
+  iconveh_escape_sequence       /* use escape sequence \uxxxx or \Uxxxxxxxx */
+};
+
+#if HAVE_ICONV
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is at [SRC,...,SRC+SRCLEN-1].
+   The conversion descriptor from FROM_CODESET to TO_CODESET is passed as CD.
+   CD1 is the conversion descriptor from FROM_CODESET to UTF-8 (or
+   (iconv_t)(-1) if FROM_CODESET is UTF-8).
+   CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1)
+   if TO_CODESET is UTF-8).
+   The initial value of *RESULTP is not used: the result is always stored in
+   a freshly malloced memory block, which the caller must free.
+   RESULTP and LENGTHP may each be NULL if that output is not needed.
+   Return value: 0 if successful, otherwise -1 and errno set.
+   If successful, the resulting string is stored in *RESULTP and its length
+   in *LENGTHP.  */
+extern int
+       mem_cd_iconveh (const char *src, size_t srclen,
+                       iconv_t cd, iconv_t cd1, iconv_t cd2,
+                       enum iconv_ilseq_handler handler,
+                       char **resultp, size_t *lengthp);
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   The conversion descriptor is passed as CD.  Both the "from" and the "to"
+   encoding must use a single NUL byte at the end of the string (i.e. not
+   UCS-2, UCS-4, UTF-16, UTF-32).
+   CD1 is the conversion descriptor from FROM_CODESET to UTF-8 (or
+   (iconv_t)(-1) if FROM_CODESET is UTF-8).
+   CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1)
+   if TO_CODESET is UTF-8).
+   Allocate a malloced memory block for the result.
+   Return value: the freshly allocated resulting NUL-terminated string if
+   successful, otherwise NULL and errno set.  */
+extern char *
+       str_cd_iconveh (const char *src,
+                       iconv_t cd, iconv_t cd1, iconv_t cd2,
+                       enum iconv_ilseq_handler handler);
+
+#endif
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   Both the "from" and the "to" encoding must use a single NUL byte at the
+   end of the string (i.e. not UCS-2, UCS-4, UTF-16, UTF-32).
+   Allocate a malloced memory block for the result.
+   Return value: the freshly allocated resulting NUL-terminated string if
+   successful, otherwise NULL and errno set.  */
+extern char *
+       str_iconveh (const char *src,
+                    const char *from_codeset, const char *to_codeset,
+                    enum iconv_ilseq_handler handler);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _STRICONVEH_H */
diff --git a/modules/striconveh b/modules/striconveh
new file mode 100644
index 0000000000..e3649e0824
--- /dev/null
+++ b/modules/striconveh
@@ -0,0 +1,38 @@
+Description:
+Character set conversion of strings with error handling, uses iconv.
+
+Files:
+lib/striconveh.h
+lib/striconveh.c
+
+Depends-on:
+stdbool
+iconv
+utf8-ucs4-safe
+ucs4-utf8
+unistr/u8-prev
+unistr/u8-mbtouc
+strdup
+c-strcase
+
+configure.ac:
+if test $gl_cond_libtool = false; then
+  gl_ltlibdeps="$gl_ltlibdeps $LTLIBICONV"
+  gl_libdeps="$gl_libdeps $LIBICONV"
+fi
+
+Makefile.am:
+lib_SOURCES += striconveh.h striconveh.c
+if GL_COND_LIBTOOL
+lib_LDFLAGS += $(LTLIBICONV)
+endif
+
+Include:
+"striconveh.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+
-- 
2.30.2