pintos-os.org Git - pspp/blob - lib/unicodeio.c

   1 /* Unicode character output to streams with locale dependent encoding.
   2
   3    Copyright (C) 2000 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify it
   6    under the terms of the GNU Library General Public License as published
   7    by the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with this program; if not, write to the Free Software
  17    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  18    USA.  */
  19
  20 /* Written by Bruno Haible <haible@clisp.cons.org>.  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 # include <config.h>
  24 #endif
  25
  26 #if HAVE_STDDEF_H
  27 # include <stddef.h>
  28 #endif
  29
  30 #include <stdio.h>
  31 #if HAVE_STRING_H
  32 # include <string.h>
  33 #else
  34 # include <strings.h>
  35 #endif
  36
  37 #include <errno.h>
  38 #ifndef errno
  39 extern int errno;
  40 #endif
  41
  42 #if HAVE_WCHAR_H
  43 # include <wchar.h>
  44 #endif
  45
  46 /* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
  47 #if HAVE_WCRTOMB && defined mbstate_t
  48 # define wcrtomb(s, wc, ps) (wcrtomb) (s, wc, 0)
  49 #endif
  50
  51 #if HAVE_ICONV
  52 # include <iconv.h>
  53 #endif
  54
  55 #include <error.h>
  56
  57 #if ENABLE_NLS
  58 # include <libintl.h>
  59 # define _(Text) gettext (Text)
  60 #else
  61 # define _(Text) Text
  62 #endif
  63
  64 #include "unicodeio.h"
  65
  66 #if __STDC_ISO_10646__ && HAVE_WCRTOMB
  67
  68 /* Values of type wchar_t are Unicode code points.  */
  69
  70 /* Outputs the Unicode character CODE to the output stream STREAM.
  71    Assumes that the locale doesn't change between two calls.  */
  72 void
  73 print_unicode_char (FILE *stream, unsigned int code)
  74 {
  75   wchar_t wc = (wchar_t) code;
  76
  77   /* Test for truncation.  */
  78   if (wc == code)
  79     {
  80       /* Convert from wide character to multibyte representation.  */
  81       char buf[64]; /* Assume MB_LEN_MAX <= 64.  */
  82       mbstate_t state;
  83       size_t res;
  84
  85       memset (&state, 0, sizeof (mbstate_t));
  86       res = wcrtomb (buf, wc, &state);
  87       if (res != (size_t)(-1))
  88         fwrite (buf, 1, res, stream);
  89       else
  90         error (1, errno,
  91                _("cannot convert U+%04X to local character set"), code);
  92     }
  93   else
  94     error (1, 0, _("cannot convert U+%04X to local character set"), code);
  95 }
  96
  97 #else
  98
  99 /* When we pass a Unicode character to iconv(), we must pass it in a
 100    suitable encoding. The standardized Unicode encodings are
 101    UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
 102    UCS-2 supports only characters up to \U0000FFFF.
 103    UTF-16 and variants support only characters up to \U0010FFFF.
 104    UTF-7 is way too complex and not supported by glibc-2.1.
 105    UCS-4 specification leaves doubts about endianness and byte order
 106    mark. glibc currently interprets it as big endian without byte order
 107    mark, but this is not backed by an RFC.
 108    So we use UTF-8. It supports characters up to \U7FFFFFFF and is
 109    unambiguously defined.  */
 110
 111 /* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
 112    Returns the number of bytes stored, or -1 if wc is out of range.  */
 113 static int
 114 utf8_wctomb (unsigned char *r, unsigned int wc)
 115 {
 116   int count;
 117
 118   if (wc < 0x80)
 119     count = 1;
 120   else if (wc < 0x800)
 121     count = 2;
 122   else if (wc < 0x10000)
 123     count = 3;
 124   else if (wc < 0x200000)
 125     count = 4;
 126   else if (wc < 0x4000000)
 127     count = 5;
 128   else if (wc <= 0x7fffffff)
 129     count = 6;
 130   else
 131     return -1;
 132
 133   switch (count)
 134     {
 135       /* Note: code falls through cases! */
 136       case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
 137       case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
 138       case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
 139       case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
 140       case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
 141       case 1: r[0] = wc;
 142     }
 143
 144   return count;
 145 }
 146
 147 /* Luckily, the encoding's name is platform independent.  */
 148 # define UTF8_NAME "UTF-8"
 149
 150 /* Outputs the Unicode character CODE to the output stream STREAM.
 151    Assumes that the locale doesn't change between two calls.  */
 152 void
 153 print_unicode_char (FILE *stream, unsigned int code)
 154 {
 155   static int initialized;
 156   static int is_utf8;
 157 # if HAVE_ICONV
 158   static iconv_t utf8_to_local;
 159 # endif
 160
 161   char inbuf[6];
 162   int count;
 163
 164   if (!initialized)
 165     {
 166       extern const char *locale_charset PARAMS ((void));
 167       const char *charset = locale_charset ();
 168
 169       is_utf8 = (charset != NULL && !strcmp (charset, UTF8_NAME));
 170 # if HAVE_ICONV
 171       if (!is_utf8)
 172         {
 173           utf8_to_local = (charset != NULL
 174                            ? iconv_open (charset, UTF8_NAME)
 175                            : (iconv_t)(-1));
 176           if (utf8_to_local == (iconv_t)(-1))
 177             {
 178               /* For an unknown encoding, assume ASCII.  */
 179               utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
 180               if (utf8_to_local == (iconv_t)(-1))
 181                 error (1, 0,
 182                        _("cannot output U+%04X: iconv function not usable"),
 183                        code);
 184             }
 185         }
 186 # endif
 187       initialized = 1;
 188     }
 189
 190   /* Convert the character to UTF-8.  */
 191   count = utf8_wctomb ((unsigned char *) inbuf, code);
 192   if (count < 0)
 193     error (1, 0, _("U+%04X: character out of range"), code);
 194
 195   if (is_utf8)
 196     {
 197       fwrite (inbuf, 1, count, stream);
 198     }
 199   else
 200     {
 201 # if HAVE_ICONV
 202       char outbuf[25];
 203       const char *inptr;
 204       size_t inbytesleft;
 205       char *outptr;
 206       size_t outbytesleft;
 207       size_t res;
 208
 209       inptr = inbuf;
 210       inbytesleft = count;
 211       outptr = outbuf;
 212       outbytesleft = sizeof (outbuf);
 213
 214       /* Convert the character from UTF-8 to the locale's charset.  */
 215       res = iconv (utf8_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft);
 216       if (inbytesleft > 0 || res == (size_t)(-1)
 217           /* Irix iconv() inserts a NUL byte if it cannot convert. */
 218 #  if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
 219           || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
 220 #  endif
 221          )
 222         error (1, res == (size_t)(-1) ? errno : 0,
 223                _("cannot convert U+%04X to local character set"), code);
 224
 225       /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 226 #  if defined _LIBICONV_VERSION \
 227     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 228
 229       /* Get back to the initial shift state.  */
 230       res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
 231       if (res == (size_t)(-1))
 232         error (1, errno, _("cannot convert U+%04X to local character set"),
 233                code);
 234 #  endif
 235
 236       fwrite (outbuf, 1, outptr - outbuf, stream);
 237 # else
 238       error (1, 0, _("cannot output U+%04X: iconv function not available"),
 239              code);
 240 # endif
 241     }
 242 }
 243
 244 #endif