pintos-os.org Git - pspp/blob - lib/mbrtowc.c

   1 /* Convert multibyte character to wide character.
   2    Copyright (C) 1999-2002, 2005-2011 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2008.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include <wchar.h>
  22
  23 #if GNULIB_defined_mbstate_t
  24 /* Implement mbrtowc() on top of mbtowc().  */
  25
  26 # include <errno.h>
  27 # include <stdlib.h>
  28
  29 # include "localcharset.h"
  30 # include "streq.h"
  31 # include "verify.h"
  32
  33
  34 verify (sizeof (mbstate_t) >= 4);
  35
  36 static char internal_state[4];
  37
  38 size_t
  39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  40 {
  41   char *pstate = (char *)ps;
  42
  43   if (s == NULL)
  44     {
  45       pwc = NULL;
  46       s = "";
  47       n = 1;
  48     }
  49
  50   if (n == 0)
  51     return (size_t)(-2);
  52
  53   /* Here n > 0.  */
  54
  55   if (pstate == NULL)
  56     pstate = internal_state;
  57
  58   {
  59     size_t nstate = pstate[0];
  60     char buf[4];
  61     const char *p;
  62     size_t m;
  63
  64     switch (nstate)
  65       {
  66       case 0:
  67         p = s;
  68         m = n;
  69         break;
  70       case 3:
  71         buf[2] = pstate[3];
  72         /*FALLTHROUGH*/
  73       case 2:
  74         buf[1] = pstate[2];
  75         /*FALLTHROUGH*/
  76       case 1:
  77         buf[0] = pstate[1];
  78         p = buf;
  79         m = nstate;
  80         buf[m++] = s[0];
  81         if (n >= 2 && m < 4)
  82           {
  83             buf[m++] = s[1];
  84             if (n >= 3 && m < 4)
  85               buf[m++] = s[2];
  86           }
  87         break;
  88       default:
  89         errno = EINVAL;
  90         return (size_t)(-1);
  91       }
  92
  93     /* Here m > 0.  */
  94
  95 # if __GLIBC__ || defined __UCLIBC__
  96     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
  97     mbtowc (NULL, NULL, 0);
  98 # endif
  99     {
 100       int res = mbtowc (pwc, p, m);
 101
 102       if (res >= 0)
 103         {
 104           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
 105             abort ();
 106           if (nstate >= (res > 0 ? res : 1))
 107             abort ();
 108           res -= nstate;
 109           pstate[0] = 0;
 110           return res;
 111         }
 112
 113       /* mbtowc does not distinguish between invalid and incomplete multibyte
 114          sequences.  But mbrtowc needs to make this distinction.
 115          There are two possible approaches:
 116            - Use iconv() and its return value.
 117            - Use built-in knowledge about the possible encodings.
 118          Given the low quality of implementation of iconv() on the systems that
 119          lack mbrtowc(), we use the second approach.
 120          The possible encodings are:
 121            - 8-bit encodings,
 122            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
 123            - UTF-8.
 124          Use specialized code for each.  */
 125       if (m >= 4 || m >= MB_CUR_MAX)
 126         goto invalid;
 127       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 128       {
 129         const char *encoding = locale_charset ();
 130
 131         if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 132           {
 133             /* Cf. unistr/u8-mblen.c.  */
 134             unsigned char c = (unsigned char) p[0];
 135
 136             if (c >= 0xc2)
 137               {
 138                 if (c < 0xe0)
 139                   {
 140                     if (m == 1)
 141                       goto incomplete;
 142                   }
 143                 else if (c < 0xf0)
 144                   {
 145                     if (m == 1)
 146                       goto incomplete;
 147                     if (m == 2)
 148                       {
 149                         unsigned char c2 = (unsigned char) p[1];
 150
 151                         if ((c2 ^ 0x80) < 0x40
 152                             && (c >= 0xe1 || c2 >= 0xa0)
 153                             && (c != 0xed || c2 < 0xa0))
 154                           goto incomplete;
 155                       }
 156                   }
 157                 else if (c <= 0xf4)
 158                   {
 159                     if (m == 1)
 160                       goto incomplete;
 161                     else /* m == 2 || m == 3 */
 162                       {
 163                         unsigned char c2 = (unsigned char) p[1];
 164
 165                         if ((c2 ^ 0x80) < 0x40
 166                             && (c >= 0xf1 || c2 >= 0x90)
 167                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
 168                           {
 169                             if (m == 2)
 170                               goto incomplete;
 171                             else /* m == 3 */
 172                               {
 173                                 unsigned char c3 = (unsigned char) p[2];
 174
 175                                 if ((c3 ^ 0x80) < 0x40)
 176                                   goto incomplete;
 177                               }
 178                           }
 179                       }
 180                   }
 181               }
 182             goto invalid;
 183           }
 184
 185         /* As a reference for this code, you can use the GNU libiconv
 186            implementation.  Look for uses of the RET_TOOFEW macro.  */
 187
 188         if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
 189           {
 190             if (m == 1)
 191               {
 192                 unsigned char c = (unsigned char) p[0];
 193
 194                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 195                   goto incomplete;
 196               }
 197             if (m == 2)
 198               {
 199                 unsigned char c = (unsigned char) p[0];
 200
 201                 if (c == 0x8f)
 202                   {
 203                     unsigned char c2 = (unsigned char) p[1];
 204
 205                     if (c2 >= 0xa1 && c2 < 0xff)
 206                       goto incomplete;
 207                   }
 208               }
 209             goto invalid;
 210           }
 211         if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 212             || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 213             || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
 214           {
 215             if (m == 1)
 216               {
 217                 unsigned char c = (unsigned char) p[0];
 218
 219                 if (c >= 0xa1 && c < 0xff)
 220                   goto incomplete;
 221               }
 222             goto invalid;
 223           }
 224         if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
 225           {
 226             if (m == 1)
 227               {
 228                 unsigned char c = (unsigned char) p[0];
 229
 230                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 231                   goto incomplete;
 232               }
 233             else /* m == 2 || m == 3 */
 234               {
 235                 unsigned char c = (unsigned char) p[0];
 236
 237                 if (c == 0x8e)
 238                   goto incomplete;
 239               }
 240             goto invalid;
 241           }
 242         if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
 243           {
 244             if (m == 1)
 245               {
 246                 unsigned char c = (unsigned char) p[0];
 247
 248                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
 249                   goto incomplete;
 250               }
 251             else /* m == 2 || m == 3 */
 252               {
 253                 unsigned char c = (unsigned char) p[0];
 254
 255                 if (c >= 0x90 && c <= 0xe3)
 256                   {
 257                     unsigned char c2 = (unsigned char) p[1];
 258
 259                     if (c2 >= 0x30 && c2 <= 0x39)
 260                       {
 261                         if (m == 2)
 262                           goto incomplete;
 263                         else /* m == 3 */
 264                           {
 265                             unsigned char c3 = (unsigned char) p[2];
 266
 267                             if (c3 >= 0x81 && c3 <= 0xfe)
 268                               goto incomplete;
 269                           }
 270                       }
 271                   }
 272               }
 273             goto invalid;
 274           }
 275         if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
 276           {
 277             if (m == 1)
 278               {
 279                 unsigned char c = (unsigned char) p[0];
 280
 281                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 282                     || (c >= 0xf0 && c <= 0xf9))
 283                   goto incomplete;
 284               }
 285             goto invalid;
 286           }
 287
 288         /* An unknown multibyte encoding.  */
 289         goto incomplete;
 290       }
 291
 292      incomplete:
 293       {
 294         size_t k = nstate;
 295         /* Here 0 <= k < m < 4.  */
 296         pstate[++k] = s[0];
 297         if (k < m)
 298           {
 299             pstate[++k] = s[1];
 300             if (k < m)
 301               pstate[++k] = s[2];
 302           }
 303         if (k != m)
 304           abort ();
 305       }
 306       pstate[0] = m;
 307       return (size_t)(-2);
 308
 309      invalid:
 310       errno = EILSEQ;
 311       /* The conversion state is undefined, says POSIX.  */
 312       return (size_t)(-1);
 313     }
 314   }
 315 }
 316
 317 #else
 318 /* Override the system's mbrtowc() function.  */
 319
 320 # undef mbrtowc
 321
 322 size_t
 323 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 324 {
 325 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG
 326   if (s == NULL)
 327     {
 328       pwc = NULL;
 329       s = "";
 330       n = 1;
 331     }
 332 # endif
 333
 334 # if MBRTOWC_RETVAL_BUG
 335   {
 336     static mbstate_t internal_state;
 337
 338     /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
 339        hidden internal state, but we can call it on our variable.  */
 340     if (ps == NULL)
 341       ps = &internal_state;
 342
 343     if (!mbsinit (ps))
 344       {
 345         /* Parse the rest of the multibyte character byte for byte.  */
 346         size_t count = 0;
 347         for (; n > 0; s++, n--)
 348           {
 349             wchar_t wc;
 350             size_t ret = mbrtowc (&wc, s, 1, ps);
 351
 352             if (ret == (size_t)(-1))
 353               return (size_t)(-1);
 354             count++;
 355             if (ret != (size_t)(-2))
 356               {
 357                 /* The multibyte character has been completed.  */
 358                 if (pwc != NULL)
 359                   *pwc = wc;
 360                 return (wc == 0 ? 0 : count);
 361               }
 362           }
 363         return (size_t)(-2);
 364       }
 365   }
 366 # endif
 367
 368 # if MBRTOWC_NUL_RETVAL_BUG
 369   {
 370     wchar_t wc;
 371     size_t ret = mbrtowc (&wc, s, n, ps);
 372
 373     if (ret != (size_t)(-1) && ret != (size_t)(-2))
 374       {
 375         if (pwc != NULL)
 376           *pwc = wc;
 377         if (wc == 0)
 378           ret = 0;
 379       }
 380     return ret;
 381   }
 382 # else
 383   {
 384 #   if MBRTOWC_NULL_ARG1_BUG
 385     wchar_t dummy;
 386
 387     if (pwc == NULL)
 388       pwc = &dummy;
 389 #   endif
 390
 391     return mbrtowc (pwc, s, n, ps);
 392   }
 393 # endif
 394 }
 395
 396 #endif