1 /* Unicode character output to streams with locale dependent encoding.
3 Copyright (C) 2000 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify it
6 under the terms of the GNU Library General Public License as published
7 by the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 /* Written by Bruno Haible <haible@clisp.cons.org>. */
46 /* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
/* `defined mbstate_t' is true only when mbstate_t is a preprocessor macro
   (presumably a stand-in supplied by the build configuration for the
   missing type -- TODO confirm).  In that case each wcrtomb (s, wc, ps)
   call is rewritten to call the function itself -- the parenthesized name
   is not expanded as this function-like macro -- passing 0 in place of
   PS, so no caller-supplied conversion state is used.  */
47 #if HAVE_WCRTOMB && defined mbstate_t
48 # define wcrtomb(s, wc, ps) (wcrtomb) (s, wc, 0)
59 # define _(Text) gettext (Text)
64 #include "unicodeio.h"
66 #if __STDC_ISO_10646__ && HAVE_WCRTOMB
68 /* Values of type wchar_t are Unicode code points. */
70 /* Outputs the Unicode character CODE to the output stream STREAM.
71 Assumes that the locale doesn't change between two calls. */
/* This variant converts CODE with wcrtomb() and writes the resulting
   bytes to STREAM with fwrite(); the return value of fwrite() is not
   examined in the visible lines.  Conversion failures are reported
   through error() with a status argument of 1 (presumably GNU error(),
   which terminates the program for a nonzero status -- confirm).
   NOTE(review): the embedded line numbers skip, so some lines of this
   definition (return type, braces, branch keywords, declarations) are
   not visible here; the comments below describe only what is shown.  */
73 print_unicode_char (FILE *stream, unsigned int code)
/* Narrow CODE to wchar_t.  The cast can change the value when wchar_t
   cannot represent CODE, which is what the truncation test is for.  */
75 wchar_t wc = (wchar_t) code;
77 /* Test for truncation. */
80 /* Convert from wide character to multibyte representation. */
81 char buf[64]; /* Assume MB_LEN_MAX <= 64. */
/* Begin each conversion from an all-zero conversion state.  */
85 memset (&state, 0, sizeof (mbstate_t));
86 res = wcrtomb (buf, wc, &state);
/* (size_t)(-1) signals that WC could not be converted; any other value
   is used as the number of bytes of BUF to write.  */
87 if (res != (size_t)(-1))
88 fwrite (buf, 1, res, stream);
/* NOTE(review): the beginning of the call that the next line belongs to
   is not visible; presumably it is the error() report for a failed
   wcrtomb() -- confirm against the full file.  */
91 _("cannot convert U+%04X to local character set"), code);
/* Reported with 0 as the errno argument.  Presumably this is the branch
   taken when the truncation test fails (controlling lines not visible).  */
94 error (1, 0, _("cannot convert U+%04X to local character set"), code);
99 /* When we pass a Unicode character to iconv(), we must pass it in a
100 suitable encoding. The standardized Unicode encodings are
101 UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
102 UCS-2 supports only characters up to \U0000FFFF.
103 UTF-16 and variants support only characters up to \U0010FFFF.
104 UTF-7 is way too complex and not supported by glibc-2.1.
105 UCS-4 specification leaves doubts about endianness and byte order
106 mark. glibc currently interprets it as big endian without byte order
107 mark, but this is not backed by an RFC.
108 So we use UTF-8. It supports characters up to \U7FFFFFFF and is
109 unambiguously defined. */
111 /* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
112 Returns the number of bytes stored, or -1 if wc is out of range. */
/* R must therefore have room for up to six bytes.  The visible range
   checks accept values up to and including 0x7fffffff.
   NOTE(review): the embedded line numbers skip, so the statements that
   set the byte count for each range, the switch header, the final store
   into r[0] and the return are not visible in this listing.  */
114 utf8_wctomb (unsigned char *r, unsigned int wc)
/* Range ladder: each visible upper bound (0x10000, 0x200000, 0x4000000,
   0x7fffffff) presumably selects a sequence length of 3, 4, 5 and 6
   bytes respectively -- the assignments themselves are not shown.  */
122 else if (wc < 0x10000)
124 else if (wc < 0x200000)
126 else if (wc < 0x4000000)
128 else if (wc <= 0x7fffffff)
/* Each case below emits one trailing byte of the form 10xxxxxx holding
   the low six bits of WC, shifts WC right by six, and then ORs in a
   marker bit.  After the remaining shifts those marker bits end up as
   the length prefix in the low byte of WC: 0xc0 for two bytes, 0xe0 for
   three (0x800 >> 6 = 0x20), 0xf0 for four, 0xf8 for five and 0xfc for
   six.  */
135 /* Note: code falls through cases! */
136 case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
137 case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
138 case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
139 case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
140 case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
147 /* Luckily, the encoding's name is platform independent. */
148 # define UTF8_NAME "UTF-8"
150 /* Outputs the Unicode character CODE to the output stream STREAM.
151 Assumes that the locale doesn't change between two calls. */
/* This variant encodes CODE as UTF-8 with utf8_wctomb() and, where
   needed, converts those bytes to the locale's charset with iconv().
   The conversion descriptor and an `initialized' flag live in
   function-static storage, so they persist from one call to the next;
   that is why the locale must not change between calls.  Failures are
   reported through error() with a status argument of 1 (presumably GNU
   error(), which terminates the program for a nonzero status --
   confirm).
   NOTE(review): the embedded line numbers skip, so many declarations,
   braces and controlling if/else/#if/#endif lines of this definition
   are not visible; the comments below describe only what is shown.  */
153 print_unicode_char (FILE *stream, unsigned int code)
155 static int initialized;
158 static iconv_t utf8_to_local;
/* locale_charset() is only declared here; its result is used below as
   the name of the target encoding (NULL is handled explicitly).  */
166 extern const char *locale_charset PARAMS ((void));
167 const char *charset = locale_charset ();
/* The comparison with UTF8_NAME is an exact, case-sensitive strcmp().  */
169 is_utf8 = (charset != NULL && !strcmp (charset, UTF8_NAME));
/* Open a converter from UTF-8 to CHARSET when a charset name is known.
   The alternative taken for a NULL charset is not visible here.  */
173 utf8_to_local = (charset != NULL
174 ? iconv_open (charset, UTF8_NAME)
176 if (utf8_to_local == (iconv_t)(-1))
178 /* For an unknown encoding, assume ASCII. */
179 utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
/* A second (iconv_t)(-1) means not even the ASCII fallback could be
   opened.  The start of the report for that case is not visible;
   presumably the message on the next visible line belongs to it.  */
180 if (utf8_to_local == (iconv_t)(-1))
182 _("cannot output U+%04X: iconv function not usable"),
190 /* Convert the character to UTF-8. */
191 count = utf8_wctomb ((unsigned char *) inbuf, code);
/* Presumably guarded by a check for a negative COUNT (utf8_wctomb
   signals out-of-range input with -1); the test line is not visible.  */
193 error (1, 0, _("U+%04X: character out of range"), code);
/* Writes the UTF-8 bytes unchanged.  Presumably this is the path taken
   when is_utf8 is set -- the controlling condition is not visible.  */
197 fwrite (inbuf, 1, count, stream);
212 outbytesleft = sizeof (outbuf);
214 /* Convert the character from UTF-8 to the locale's charset. */
215 res = iconv (utf8_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft);
/* Treated as failure: input left unconsumed, or iconv() returning
   (size_t)(-1).  On SGI systems without libiconv, a positive result
   whose whole output is a single NUL byte for a nonzero CODE also counts
   as failure.  */
216 if (inbytesleft > 0 || res == (size_t)(-1)
217 /* Irix iconv() inserts a NUL byte if it cannot convert. */
218 # if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
219 || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
/* errno is passed on only when iconv() itself reported the failure;
   the other failure conditions are reported with 0.  */
222 error (1, res == (size_t)(-1) ? errno : 0,
223 _("cannot convert U+%04X to local character set"), code);
/* The condition below holds with libiconv, or when the C library is
   neither glibc 2.0/2.1 nor a __sun system.  Presumably it guards the
   shift-state reset that follows (the matching #endif is not visible).  */
225 /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
226 # if defined _LIBICONV_VERSION \
227 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
229 /* Get back to the initial shift state. */
/* Calling iconv() with a NULL input lets it append to OUTBUF whatever
   bytes are needed to return the output to its initial shift state.  */
230 res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
231 if (res == (size_t)(-1))
232 error (1, errno, _("cannot convert U+%04X to local character set"),
/* Emit exactly the bytes produced so far: OUTPTR has advanced past the
   converted output, so OUTPTR - OUTBUF is its length.  */
236 fwrite (outbuf, 1, outptr - outbuf, stream);
/* NOTE(review): presumably the fallback compiled when iconv is not
   available at build time; the controlling preprocessor lines and the
   end of this statement are not visible in this listing.  */
238 error (1, 0, _("cannot output U+%04X: iconv function not available"),