pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <locale.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <unicase.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/compiler.h"
  34 #include "libpspp/hmapx.h"
  35 #include "libpspp/hash-functions.h"
  36 #include "libpspp/pool.h"
  37 #include "libpspp/str.h"
  38 #include "libpspp/version.h"
  39
  40 #include "gl/c-strcase.h"
  41 #include "gl/localcharset.h"
  42 #include "gl/minmax.h"
  43 #include "gl/xalloc.h"
  44 #include "gl/relocatable.h"
  45 #include "gl/xstrndup.h"
  46
  47 #include "gettext.h"
  48 #define _(msgid) gettext (msgid)
  49
/* One cached iconv conversion, identified by its (TOCODE, FROMCODE) pair.
   Instances are created by create_iconv__() and destroyed by i18n_done(). */
struct converter
 {
    char *tocode;     /* Name of the target encoding (owned copy). */
    char *fromcode;   /* Name of the source encoding (owned copy). */
    iconv_t conv;     /* Result of iconv_open(); (iconv_t) -1 if it failed. */
    int error;        /* errno from a failed iconv_open(), otherwise 0;
                         create_iconv() clears it after warning. */
  };
  57
/* Encoding substituted when a caller passes a null encoding name to the
   recoding functions.  Heap-allocated; null until i18n_init() runs. */
static char *default_encoding;
/* Cache of struct converter objects, filled in by create_iconv__(). */
static struct hmapx map;
  60
  61 /* A wrapper around iconv_open */
  62 static struct converter *
  63 create_iconv__ (const char* tocode, const char* fromcode)
  64 {
  65   size_t hash;
  66   struct hmapx_node *node;
  67   struct converter *converter;
  68   assert (fromcode);
  69
  70   hash = hash_string (tocode, hash_string (fromcode, 0));
  71   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  72     if (!strcmp (tocode, converter->tocode)
  73         && !strcmp (fromcode, converter->fromcode))
  74       return converter;
  75
  76   converter = xmalloc (sizeof *converter);
  77   converter->tocode = xstrdup (tocode);
  78   converter->fromcode = xstrdup (fromcode);
  79   converter->conv = iconv_open (tocode, fromcode);
  80   converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
  81   hmapx_insert (&map, converter, hash);
  82
  83   return converter;
  84 }
  85
  86 static iconv_t
  87 create_iconv (const char* tocode, const char* fromcode)
  88 {
  89   struct converter *converter;
  90
  91   converter = create_iconv__ (tocode, fromcode);
  92
  93   /* I don't think it's safe to translate this string or to use messaging
  94      as the converters have not yet been set up */
  95   if (converter->error && strcmp (tocode, fromcode))
  96     {
  97       fprintf (stderr,
  98                "Warning: "
  99                "cannot create a converter for `%s' to `%s': %s\n",
 100                fromcode, tocode, strerror (converter->error));
 101       converter->error = 0;
 102     }
 103
 104   return converter->conv;
 105 }
 106
 107 /* Converts the single byte C from encoding FROM to TO, returning the first
 108    byte of the result.
 109
 110    This function probably shouldn't be used at all, but some code still does
 111    use it. */
 112 char
 113 recode_byte (const char *to, const char *from, char c)
 114 {
 115   char x;
 116   char *s = recode_string (to, from, &c, 1);
 117   x = s[0];
 118   free (s);
 119   return x;
 120 }
 121
 122 /* Similar to recode_string_pool, but allocates the returned value on the heap
 123    instead of in a pool.  It is the caller's responsibility to free the
 124    returned value. */
 125 char *
 126 recode_string (const char *to, const char *from,
 127                const char *text, int length)
 128 {
 129   return recode_string_pool (to, from, text, length, NULL);
 130 }
 131
 132 /* Returns the length, in bytes, of the string that a similar recode_string()
 133    call would return. */
 134 size_t
 135 recode_string_len (const char *to, const char *from,
 136                    const char *text, int length)
 137 {
 138   char *s = recode_string (to, from, text, length);
 139   size_t len = strlen (s);
 140   free (s);
 141   return len;
 142 }
 143
 144 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 145    at OP, and appends a null terminator to the output.
 146
 147    Returns the output length if successful, -1 if the output buffer is too
 148    small. */
 149 static ssize_t
 150 try_recode (iconv_t conv, char fallbackchar,
 151             const char *in, size_t inbytes,
 152             char *out_, size_t outbytes)
 153 {
 154   char *out = out_;
 155   int i;
 156
 157   /* Put the converter into the initial shift state, in case there was any
 158      state information left over from its last usage. */
 159   iconv (conv, NULL, 0, NULL, 0);
 160
 161   /* Do two rounds of iconv() calls:
 162
 163      - The first round does the bulk of the conversion using the
 164        caller-supplied input data..
 165
 166      - The second round flushes any leftover output.  This has a real effect
 167        with input encodings that use combining diacritics, e.g. without the
 168        second round the last character tends to gets dropped when converting
 169        from windows-1258 to other encodings.
 170   */
 171   for (i = 0; i < 2; i++)
 172     {
 173       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 174       size_t *inbytesp = i ? NULL : &inbytes;
 175
 176       while (iconv (conv, inp, inbytesp, &out, &outbytes) == -1)
 177         switch (errno)
 178           {
 179           case EINVAL:
 180             if (outbytes < 2)
 181               return -E2BIG;
 182             if (!fallbackchar)
 183               return -EINVAL;
 184             *out++ = fallbackchar;
 185             *out = '\0';
 186             return out - out_;
 187
 188           case EILSEQ:
 189             if (outbytes == 0)
 190               return -E2BIG;
 191             if (!fallbackchar)
 192               return -EILSEQ;
 193             *out++ = fallbackchar;
 194             outbytes--;
 195             if (inp)
 196               {
 197                 in++;
 198                 inbytes--;
 199               }
 200             break;
 201
 202           case E2BIG:
 203             return -E2BIG;
 204
 205           default:
 206             /* should never happen */
 207             fprintf (stderr, "Character conversion error: %s\n",
 208                      strerror (errno));
 209             NOT_REACHED ();
 210             break;
 211           }
 212     }
 213
 214   if (outbytes == 0)
 215     return -E2BIG;
 216
 217   *out = '\0';
 218   return out - out_;
 219 }
 220
 221 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 222    dynamically allocated string in TO-encoding.  Any characters which cannot be
 223    converted will be represented by '?'.
 224
 225    LENGTH should be the length of the string or -1, if null terminated.
 226
 227    The returned string will be allocated on POOL.
 228
 229    This function's behaviour differs from that of g_convert_with_fallback
 230    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 231    the input string is not valid in the declared input encoding.  This function
 232    however perseveres even in the presence of badly encoded input. */
 233 char *
 234 recode_string_pool (const char *to, const char *from,
 235                     const char *text, int length, struct pool *pool)
 236 {
 237   struct substring out;
 238
 239   if ( text == NULL )
 240     return NULL;
 241
 242   if ( length == -1 )
 243      length = strlen (text);
 244
 245   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 246   return out.string;
 247 }
 248
 249 /* Returns the name of the encoding that should be used for file names.
 250
 251    This is meant to be the same encoding used by g_filename_from_uri() and
 252    g_filename_to_uri() in GLib. */
 253 static const char *
 254 filename_encoding (void)
 255 {
 256 #if defined _WIN32 || defined __WIN32__
 257   return "UTF-8";
 258 #else
 259   return locale_charset ();
 260 #endif
 261 }
 262
 263 static char *
 264 xconcat2 (const char *a, size_t a_len,
 265           const char *b, size_t b_len)
 266 {
 267   char *s = xmalloc (a_len + b_len + 1);
 268   memcpy (s, a, a_len);
 269   memcpy (s + a_len, b, b_len);
 270   s[a_len + b_len] = '\0';
 271   return s;
 272 }
 273
 274 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 275    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 276    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 277    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 278    HEAD and tries again, repeating as necessary until the concatenated result
 279    fits or until HEAD_LEN reaches 0.
 280
 281    [*] Actually this function drops grapheme clusters instead of characters, so
 282        that, e.g. a Unicode character followed by a combining accent character
 283        is either completely included or completely excluded from HEAD_LEN.  See
 284        UAX #29 at http://unicode.org/reports/tr29/ for more information on
 285        grapheme clusters.
 286
 287    A null ENCODING is treated as UTF-8.
 288
 289    Sometimes this function has to actually construct the concatenated string to
 290    measure its length.  When this happens, it sets *RESULTP to that
 291    null-terminated string, allocated with malloc(), for the caller to use if it
 292    needs it.  Otherwise, it sets *RESULTP to NULL.
 293
 294    Simple examples for encoding="UTF-8", max_len=6:
 295
 296        head="abc",  tail="xyz"     => 3
 297        head="abcd", tail="xyz"     => 3 ("d" dropped).
 298        head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 299        head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 300
 301    Examples for encoding="ISO-8859-1", max_len=6:
 302
 303        head="éèä",  tail="xyz"     => 6
 304          (each letter in head is only 1 byte in ISO-8859-1 even though they
 305           each take 2 bytes in UTF-8 encoding)
 306 */
 307 static size_t
 308 utf8_encoding_concat__ (const char *head, size_t head_len,
 309                         const char *tail, size_t tail_len,
 310                         const char *encoding, size_t max_len,
 311                         char **resultp)
 312 {
 313   *resultp = NULL;
 314   if (head_len == 0)
 315     return 0;
 316   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 317     {
 318       if (head_len + tail_len <= max_len)
 319         return head_len;
 320       else if (tail_len >= max_len)
 321         return 0;
 322       else
 323         {
 324           size_t copy_len;
 325           ucs4_t prev;
 326           size_t ofs;
 327           int mblen;
 328
 329           copy_len = 0;
 330           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 331                                 head_len);
 332                ofs <= max_len - tail_len;
 333                ofs += mblen)
 334             {
 335               ucs4_t next;
 336
 337               mblen = u8_mbtouc (&next,
 338                                  CHAR_CAST (const uint8_t *, head + ofs),
 339                                  head_len - ofs);
 340               if (uc_is_grapheme_break (prev, next))
 341                 copy_len = ofs;
 342
 343               prev = next;
 344             }
 345
 346           return copy_len;
 347         }
 348     }
 349   else
 350     {
 351       char *result;
 352
 353       result = (tail_len > 0
 354                 ? xconcat2 (head, head_len, tail, tail_len)
 355                 : CONST_CAST (char *, head));
 356       if (recode_string_len (encoding, "UTF-8", result,
 357                              head_len + tail_len) <= max_len)
 358         {
 359           *resultp = result != head ? result : NULL;
 360           return head_len;
 361         }
 362       else
 363         {
 364           bool correct_result = false;
 365           size_t copy_len;
 366           ucs4_t prev;
 367           size_t ofs;
 368           int mblen;
 369
 370           copy_len = 0;
 371           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 372                                 head_len);
 373                ofs <= head_len;
 374                ofs += mblen)
 375             {
 376               ucs4_t next;
 377
 378               mblen = u8_mbtouc (&next,
 379                                  CHAR_CAST (const uint8_t *, head + ofs),
 380                                  head_len - ofs);
 381               if (uc_is_grapheme_break (prev, next))
 382                 {
 383                   if (result != head)
 384                     {
 385                       memcpy (result, head, ofs);
 386                       memcpy (result + ofs, tail, tail_len);
 387                       result[ofs + tail_len] = '\0';
 388                     }
 389
 390                   if (recode_string_len (encoding, "UTF-8", result,
 391                                          ofs + tail_len) <= max_len)
 392                     {
 393                       correct_result = true;
 394                       copy_len = ofs;
 395                     }
 396                   else
 397                     correct_result = false;
 398                 }
 399
 400               prev = next;
 401             }
 402
 403           if (result != head)
 404             {
 405               if (correct_result)
 406                 *resultp = result;
 407               else
 408                 free (result);
 409             }
 410
 411           return copy_len;
 412         }
 413     }
 414 }
 415
 416 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 417    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 418    string are all encoded in UTF-8.  As many characters[*] from the beginning
 419    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 420    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 421    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 422
 423    [*] Actually this function drops grapheme clusters instead of characters, so
 424        that, e.g. a Unicode character followed by a combining accent character
 425        is either completely included or completely excluded from the returned
 426        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 427        information on grapheme clusters.
 428
 429    A null ENCODING is treated as UTF-8.
 430
 431    Simple examples for encoding="UTF-8", max_len=6:
 432
 433        head="abc",  tail="xyz"     => "abcxyz"
 434        head="abcd", tail="xyz"     => "abcxyz"
 435        head="abc",  tail="uvwxyz"  => "uvwxyz"
 436        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 437
 438    Examples for encoding="ISO-8859-1", max_len=6:
 439
 440        head="éèä",  tail="xyz"    => "éèäxyz"
 441          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 442           each take 2 bytes in UTF-8 encoding)
 443 */
 444 char *
 445 utf8_encoding_concat (const char *head, const char *tail,
 446                       const char *encoding, size_t max_len)
 447 {
 448   size_t tail_len = strlen (tail);
 449   size_t prefix_len;
 450   char *result;
 451
 452   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 453                                        encoding, max_len, &result);
 454   return (result != NULL
 455           ? result
 456           : xconcat2 (head, prefix_len, tail, tail_len));
 457 }
 458
 459 /* Returns the length, in bytes, of the string that would be returned by
 460    utf8_encoding_concat() if passed the same arguments, but the implementation
 461    is often more efficient. */
 462 size_t
 463 utf8_encoding_concat_len (const char *head, const char *tail,
 464                           const char *encoding, size_t max_len)
 465 {
 466   size_t tail_len = strlen (tail);
 467   size_t prefix_len;
 468   char *result;
 469
 470   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 471                                        encoding, max_len, &result);
 472   free (result);
 473   return prefix_len + tail_len;
 474 }
 475
 476 /* Returns an allocated, null-terminated string, owned by the caller,
 477    containing as many characters[*] from the beginning of S that would fit
 478    within MAX_LEN bytes if the returned string were to be re-encoded in
 479    ENCODING.  Both S and the returned string are encoded in UTF-8.
 480
 481    [*] Actually this function drops grapheme clusters instead of characters, so
 482        that, e.g. a Unicode character followed by a combining accent character
 483        is either completely included or completely excluded from the returned
 484        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 485        information on grapheme clusters.
 486
 487    A null ENCODING is treated as UTF-8.
 488 */
 489 char *
 490 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 491 {
 492   return utf8_encoding_concat (s, "", encoding, max_len);
 493 }
 494
 495 /* Returns the length, in bytes, of the string that would be returned by
 496    utf8_encoding_trunc() if passed the same arguments, but the implementation
 497    is often more efficient. */
 498 size_t
 499 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 500 {
 501   return utf8_encoding_concat_len (s, "", encoding, max_len);
 502 }
 503
 504 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 505    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 506    current locale. */
 507 char *
 508 utf8_to_filename (const char *filename)
 509 {
 510   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 511 }
 512
 513 /* Returns FILENAME converted from the filename encoding to UTF-8.
 514    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 515    current locale. */
 516 char *
 517 filename_to_utf8 (const char *filename)
 518 {
 519   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 520 }
 521
 522 static int
 523 recode_substring_pool__ (const char *to, const char *from,
 524                          struct substring text, char fallbackchar,
 525                          struct pool *pool, struct substring *out)
 526 {
 527   size_t bufsize;
 528   iconv_t conv ;
 529
 530   if (to == NULL)
 531     to = default_encoding;
 532
 533   if (from == NULL)
 534     from = default_encoding;
 535
 536   conv = create_iconv (to, from);
 537
 538   if ( (iconv_t) -1 == conv )
 539     {
 540       if (fallbackchar)
 541         {
 542           out->string = pool_malloc (pool, text.length + 1);
 543           out->length = text.length;
 544           memcpy (out->string, text.string, text.length);
 545           out->string[out->length] = '\0';
 546           return 0;
 547         }
 548       else
 549         return EPROTO;
 550     }
 551
 552   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 553     {
 554       char *output = pool_malloc (pool, bufsize);
 555       ssize_t retval;
 556
 557       retval = try_recode (conv, fallbackchar, text.string, text.length,
 558                            output, bufsize);
 559       if (retval >= 0)
 560         {
 561           *out = ss_buffer (output, retval);
 562           return 0;
 563         }
 564       pool_free (pool, output);
 565
 566       if (retval != -E2BIG)
 567         return -retval;
 568     }
 569
 570   NOT_REACHED ();
 571 }
 572
 573 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 574    dynamically allocated string in TO-encoding.  Any characters which cannot be
 575    converted will be represented by '?'.
 576
 577    The returned string will be null-terminated and allocated on POOL with
 578    pool_malloc().
 579
 580    This function's behaviour differs from that of g_convert_with_fallback
 581    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 582    the input string is not valid in the declared input encoding.  This function
 583    however perseveres even in the presence of badly encoded input. */
 584 struct substring
 585 recode_substring_pool (const char *to, const char *from,
 586                        struct substring text, struct pool *pool)
 587 {
 588   struct substring out;
 589
 590   recode_substring_pool__ (to, from, text, '?', pool, &out);
 591   return out;
 592 }
 593
 594 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 595    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 596    converted null-terminated string, allocated from POOL with pool_malloc(), is
 597    stored in *OUT.  On failure, returns a positive errno value.
 598
 599    The function fails with an error if any part of the input string is not
 600    valid in the declared input encoding. */
 601 int
 602 recode_pedantically (const char *to, const char *from,
 603                      struct substring text, struct pool *pool,
 604                      struct substring *out)
 605 {
 606   int error;
 607
 608   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 609   if (error)
 610     *out = ss_empty ();
 611   return error;
 612 }
 613 \f
 614 void
 615 i18n_init (void)
 616 {
 617   setlocale (LC_ALL, "");
 618   bindtextdomain (PACKAGE, relocate(locale_dir));
 619   textdomain (PACKAGE);
 620
 621   assert (default_encoding == NULL);
 622   default_encoding = xstrdup (locale_charset ());
 623
 624   hmapx_init (&map);
 625 }
 626
 627 const char *
 628 get_default_encoding (void)
 629 {
 630   return default_encoding;
 631 }
 632
 633 void
 634 set_default_encoding (const char *enc)
 635 {
 636   free (default_encoding);
 637   default_encoding = xstrdup (enc);
 638 }
 639
 640
 641 /* Attempts to set the encoding from a locale name
 642    returns true if successfull.
 643    This function does not (should not!) alter the current locale.
 644 */
 645 bool
 646 set_encoding_from_locale (const char *loc)
 647 {
 648   bool ok = true;
 649   char *c_encoding;
 650   char *loc_encoding;
 651   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 652
 653   setlocale (LC_CTYPE, "C");
 654   c_encoding = xstrdup (locale_charset ());
 655
 656   setlocale (LC_CTYPE, loc);
 657   loc_encoding = xstrdup (locale_charset ());
 658
 659
 660   if ( 0 == strcmp (loc_encoding, c_encoding))
 661     {
 662       ok = false;
 663     }
 664
 665   setlocale (LC_CTYPE, tmp);
 666
 667   free (tmp);
 668
 669   if (ok)
 670     {
 671       free (default_encoding);
 672       default_encoding = loc_encoding;
 673     }
 674   else
 675     free (loc_encoding);
 676
 677   free (c_encoding);
 678
 679   return ok;
 680 }
 681
 682 void
 683 i18n_done (void)
 684 {
 685   struct hmapx_node *node;
 686   struct converter *cvtr;
 687
 688   HMAPX_FOR_EACH (cvtr, node, &map)
 689     {
 690       free (cvtr->tocode);
 691       free (cvtr->fromcode);
 692       if (cvtr->conv != (iconv_t) -1)
 693         iconv_close (cvtr->conv);
 694       free (cvtr);
 695     }
 696
 697   hmapx_destroy (&map);
 698
 699   free (default_encoding);
 700   default_encoding = NULL;
 701 }
 702
 703
 704
 705 bool
 706 valid_encoding (const char *enc)
 707 {
 708   iconv_t conv = iconv_open (UTF8, enc);
 709
 710   if ( conv == (iconv_t) -1)
 711     return false;
 712
 713   iconv_close (conv);
 714
 715   return true;
 716 }
 717
 718
 719 /* Return the system local's idea of the
 720    decimal seperator character */
 721 char
 722 get_system_decimal (void)
 723 {
 724   char radix_char;
 725
 726 #if HAVE_NL_LANGINFO
 727   radix_char = nl_langinfo (RADIXCHAR)[0];
 728 #else
 729   {
 730     char buf[10];
 731     snprintf (buf, sizeof buf, "%f", 2.5);
 732     radix_char = buf[1];
 733   }
 734 #endif
 735
 736   return radix_char;
 737 }
 738
 739 const char *
 740 uc_name (ucs4_t uc, char buffer[16])
 741 {
 742   if (uc >= 0x20 && uc < 0x7f)
 743     snprintf (buffer, 16, "`%c'", uc);
 744   else
 745     snprintf (buffer, 16, "U+%04X", uc);
 746   return buffer;
 747 }
 748 \f
 749 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 750
 751 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 752    with lowercase and uppercase letters treated as equal, starting from
 753    BASIS. */
 754 unsigned int
 755 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 756 {
 757   uint8_t folded_buf[2048];
 758   size_t folded_len = sizeof folded_buf;
 759   uint8_t *folded_s;
 760   unsigned int hash;
 761
 762   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 763                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 764   if (folded_s != NULL)
 765     {
 766       hash = hash_bytes (folded_s, folded_len, basis);
 767       if (folded_s != folded_buf)
 768         free (folded_s);
 769     }
 770   else
 771     {
 772       if (errno == ENOMEM)
 773         xalloc_die ();
 774       hash = hash_bytes (s, n, basis);
 775     }
 776
 777   return hash;
 778 }
 779
 780 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 781    uppercase letters treated as equal, starting from BASIS. */
 782 unsigned int
 783 utf8_hash_case_string (const char *s, unsigned int basis)
 784 {
 785   return utf8_hash_case_bytes (s, strlen (s), basis);
 786 }
 787
 788 /* Compares UTF-8 strings A and B case-insensitively.
 789    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 790 int
 791 utf8_strcasecmp (const char *a, const char *b)
 792 {
 793   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 794 }
 795
 796 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 797    case-insensitively.
 798    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 799 int
 800 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 801 {
 802   int result;
 803
 804   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 805                   CHAR_CAST (const uint8_t *, b), bn,
 806                   NULL, UNINORM_NFKD, &result))
 807     {
 808       if (errno == ENOMEM)
 809         xalloc_die ();
 810
 811       result = memcmp (a, b, MIN (an, bn));
 812       if (result == 0)
 813         result = an < bn ? -1 : an > bn;
 814     }
 815
 816   return result;
 817 }
 818
 819 static char *
 820 utf8_casemap (const char *s,
 821               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 822                              uint8_t *, size_t *))
 823 {
 824   char *result;
 825   size_t size;
 826
 827   result = CHAR_CAST (char *,
 828                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 829                          NULL, NULL, NULL, &size));
 830   if (result == NULL)
 831     {
 832       if (errno == ENOMEM)
 833         xalloc_die ();
 834
 835       result = xstrdup (s);
 836     }
 837   return result;
 838 }
 839
 840 char *
 841 utf8_to_upper (const char *s)
 842 {
 843   return utf8_casemap (s, u8_toupper);
 844 }
 845
 846 char *
 847 utf8_to_lower (const char *s)
 848 {
 849   return utf8_casemap (s, u8_tolower);
 850 }
 851 \f
 852 bool
 853 get_encoding_info (struct encoding_info *e, const char *name)
 854 {
 855   const struct substring in = SS_LITERAL_INITIALIZER (
 856     "\t\n\v\f\r "
 857     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 858     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 859     "abcdefghijklmnopqrstuvwxyz{|}~");
 860
 861   struct substring out, cr, lf, space;
 862   bool ok;
 863
 864   memset (e, 0, sizeof *e);
 865
 866   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 867   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 868   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 869   ok = (cr.length >= 1
 870         && cr.length <= MAX_UNIT
 871         && cr.length == lf.length
 872         && cr.length == space.length);
 873   if (!ok)
 874     {
 875       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 876       ss_dealloc (&cr);
 877       ss_dealloc (&lf);
 878       ss_dealloc (&space);
 879       ss_alloc_substring (&cr, ss_cstr ("\r"));
 880       ss_alloc_substring (&lf, ss_cstr ("\n"));
 881       ss_alloc_substring (&space, ss_cstr (" "));
 882     }
 883
 884   e->unit = cr.length;
 885   memcpy (e->cr, cr.string, e->unit);
 886   memcpy (e->lf, lf.string, e->unit);
 887   memcpy (e->space, space.string, e->unit);
 888
 889   ss_dealloc (&cr);
 890   ss_dealloc (&lf);
 891   ss_dealloc (&space);
 892
 893   out = recode_substring_pool ("UTF-8", name, in, NULL);
 894   e->is_ascii_compatible = ss_equals (in, out);
 895   ss_dealloc (&out);
 896
 897   if (!e->is_ascii_compatible && e->unit == 1)
 898     {
 899       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 900       e->is_ebcdic_compatible = (out.length == 1
 901                                  && (uint8_t) out.string[0] == 0xc1);
 902       ss_dealloc (&out);
 903     }
 904   else
 905     e->is_ebcdic_compatible = false;
 906
 907   return ok;
 908 }
 909
 910 bool
 911 is_encoding_ascii_compatible (const char *encoding)
 912 {
 913   struct encoding_info e;
 914
 915   get_encoding_info (&e, encoding);
 916   return e.is_ascii_compatible;
 917 }
 918
 919 bool
 920 is_encoding_ebcdic_compatible (const char *encoding)
 921 {
 922   struct encoding_info e;
 923
 924   get_encoding_info (&e, encoding);
 925   return e.is_ebcdic_compatible;
 926 }
 927
 928 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 929    otherwise false. */
 930 bool
 931 is_encoding_supported (const char *encoding)
 932 {
 933   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
 934           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 935 }
 936
 937 /* Returns true if E is the name of a UTF-8 encoding.
 938
 939    XXX Possibly we should test not E as a string but its properties via
 940    iconv. */
 941 bool
 942 is_encoding_utf8 (const char *e)
 943 {
 944   return ((e[0] == 'u' || e[0] == 'U')
 945           && (e[1] == 't' || e[1] == 'T')
 946           && (e[2] == 'f' || e[2] == 'F')
 947           && ((e[3] == '8' && e[4] == '\0')
 948               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 949 }
 950 \f
/* Array of encoding categories, grown and appended to by add_category(). */
static struct encoding_category *categories;
/* Number of elements of CATEGORIES currently in use. */
static int n_categories;
 953
 954 static void SENTINEL (0)
 955 add_category (size_t *allocated_categories, const char *category, ...)
 956 {
 957   struct encoding_category *c;
 958   const char *encodings[16];
 959   va_list args;
 960   int i, n;
 961
 962   /* Count encoding arguments. */
 963   va_start (args, category);
 964   n = 0;
 965   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 966     {
 967       const char *encoding = encodings[n];
 968       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 969         n++;
 970     }
 971   assert (n < sizeof encodings / sizeof *encodings);
 972   va_end (args);
 973
 974   if (n == 0)
 975     return;
 976
 977   if (n_categories >= *allocated_categories)
 978     categories = x2nrealloc (categories,
 979                              allocated_categories, sizeof *categories);
 980
 981   c = &categories[n_categories++];
 982   c->category = category;
 983   c->encodings = xmalloc (n * sizeof *c->encodings);
 984   for (i = 0; i < n; i++)
 985     c->encodings[i] = encodings[i];
 986   c->n_encodings = n;
 987 }
 988
 989 static void
 990 init_encoding_categories (void)
 991 {
 992   static bool inited;
 993   size_t alloc;
 994
 995   if (inited)
 996     return;
 997   inited = true;
 998
 999   alloc = 0;
1000   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1001                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1002   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1003                 NULL_SENTINEL);
1004   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1005   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1006                 "Windows-1257", NULL_SENTINEL);
1007   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1008   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1009                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1010   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1011                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1012   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1013                 "EUC-TW", NULL_SENTINEL);
1014   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1015   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1016                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1017   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1018   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1019                 NULL_SENTINEL);
1020   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1021   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1022   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1023   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1024   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1025                 NULL_SENTINEL);
1026   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1027   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1028   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1029   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1030                 NULL_SENTINEL);
1031   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1032                 NULL_SENTINEL);
1033   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1034   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1035                 NULL_SENTINEL);
1036   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1037   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1038                 NULL_SENTINEL);
1039   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1040                 NULL_SENTINEL);
1041   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1042                 "Windows-1258", NULL_SENTINEL);
1043   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1044                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1045 }
1046
1047 /* Returns an array of "struct encoding_category" that contains only the
1048    categories and encodings that the system supports. */
1049 struct encoding_category *
1050 get_encoding_categories (void)
1051 {
1052   init_encoding_categories ();
1053   return categories;
1054 }
1055
1056 /* Returns the number of elements in the array returned by
1057    get_encoding_categories().  */
1058 size_t
1059 get_n_encoding_categories (void)
1060 {
1061   init_encoding_categories ();
1062   return n_categories;
1063 }