pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <libintl.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/compiler.h"
  35 #include "libpspp/hmapx.h"
  36 #include "libpspp/hash-functions.h"
  37 #include "libpspp/pool.h"
  38 #include "libpspp/str.h"
  39 #include "libpspp/version.h"
  40
  41 #include "gl/c-strcase.h"
  42 #include "gl/localcharset.h"
  43 #include "gl/minmax.h"
  44 #include "gl/xalloc.h"
  45 #include "gl/relocatable.h"
  46 #include "gl/xstrndup.h"
  47
  48 #include "gettext.h"
  49 #define _(msgid) gettext (msgid)
  50
  51 struct converter
  52  {
  53     char *tocode;
  54     char *fromcode;
  55     iconv_t conv;
  56     int error;
  57   };
  58
  59 static char *default_encoding;
  60 static struct hmapx map;
  61
  62 /* A wrapper around iconv_open */
  63 static struct converter *
  64 create_iconv__ (const char* tocode, const char* fromcode)
  65 {
  66   size_t hash;
  67   struct hmapx_node *node;
  68   struct converter *converter;
  69   assert (fromcode);
  70
  71   hash = hash_string (tocode, hash_string (fromcode, 0));
  72   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  73     if (!strcmp (tocode, converter->tocode)
  74         && !strcmp (fromcode, converter->fromcode))
  75       return converter;
  76
  77   converter = xmalloc (sizeof *converter);
  78   converter->tocode = xstrdup (tocode);
  79   converter->fromcode = xstrdup (fromcode);
  80   converter->conv = iconv_open (tocode, fromcode);
  81   converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
  82   hmapx_insert (&map, converter, hash);
  83
  84   return converter;
  85 }
  86
  87 static iconv_t
  88 create_iconv (const char* tocode, const char* fromcode)
  89 {
  90   struct converter *converter;
  91
  92   converter = create_iconv__ (tocode, fromcode);
  93
  94   /* I don't think it's safe to translate this string or to use messaging
  95      as the converters have not yet been set up */
  96   if (converter->error && strcmp (tocode, fromcode))
  97     {
  98       fprintf (stderr,
  99                "Warning: "
 100                "cannot create a converter for `%s' to `%s': %s\n",
 101                fromcode, tocode, strerror (converter->error));
 102       converter->error = 0;
 103     }
 104
 105   return converter->conv;
 106 }
 107
 108 /* Converts the single byte C from encoding FROM to TO, returning the first
 109    byte of the result.
 110
 111    This function probably shouldn't be used at all, but some code still does
 112    use it. */
 113 char
 114 recode_byte (const char *to, const char *from, char c)
 115 {
 116   char x;
 117   char *s = recode_string (to, from, &c, 1);
 118   x = s[0];
 119   free (s);
 120   return x;
 121 }
 122
 123 /* Similar to recode_string_pool, but allocates the returned value on the heap
 124    instead of in a pool.  It is the caller's responsibility to free the
 125    returned value. */
 126 char *
 127 recode_string (const char *to, const char *from,
 128                const char *text, int length)
 129 {
 130   return recode_string_pool (to, from, text, length, NULL);
 131 }
 132
 133 /* Returns the length, in bytes, of the string that a similar recode_string()
 134    call would return. */
 135 size_t
 136 recode_string_len (const char *to, const char *from,
 137                    const char *text, int length)
 138 {
 139   char *s = recode_string (to, from, text, length);
 140   size_t len = strlen (s);
 141   free (s);
 142   return len;
 143 }
 144
 145 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 146    at OP, and appends a null terminator to the output.
 147
 148    Returns the output length if successful, -1 if the output buffer is too
 149    small. */
 150 static ssize_t
 151 try_recode (iconv_t conv, char fallbackchar,
 152             const char *in, size_t inbytes,
 153             char *out_, size_t outbytes)
 154 {
 155   char *out = out_;
 156   int i;
 157
 158   /* Put the converter into the initial shift state, in case there was any
 159      state information left over from its last usage. */
 160   iconv (conv, NULL, 0, NULL, 0);
 161
 162   /* Do two rounds of iconv() calls:
 163
 164      - The first round does the bulk of the conversion using the
 165        caller-supplied input data..
 166
 167      - The second round flushes any leftover output.  This has a real effect
 168        with input encodings that use combining diacritics, e.g. without the
 169        second round the last character tends to gets dropped when converting
 170        from windows-1258 to other encodings.
 171   */
 172   for (i = 0; i < 2; i++)
 173     {
 174       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 175       size_t *inbytesp = i ? NULL : &inbytes;
 176
 177       while (iconv (conv, inp, inbytesp, &out, &outbytes) == -1)
 178         switch (errno)
 179           {
 180           case EINVAL:
 181             if (outbytes < 2)
 182               return -E2BIG;
 183             if (!fallbackchar)
 184               return -EINVAL;
 185             *out++ = fallbackchar;
 186             *out = '\0';
 187             return out - out_;
 188
 189           case EILSEQ:
 190             if (outbytes == 0)
 191               return -E2BIG;
 192             if (!fallbackchar)
 193               return -EILSEQ;
 194             *out++ = fallbackchar;
 195             outbytes--;
 196             if (inp)
 197               {
 198                 in++;
 199                 inbytes--;
 200               }
 201             break;
 202
 203           case E2BIG:
 204             return -E2BIG;
 205
 206           default:
 207             /* should never happen */
 208             fprintf (stderr, "Character conversion error: %s\n",
 209                      strerror (errno));
 210             NOT_REACHED ();
 211             break;
 212           }
 213     }
 214
 215   if (outbytes == 0)
 216     return -E2BIG;
 217
 218   *out = '\0';
 219   return out - out_;
 220 }
 221
 222 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 223    dynamically allocated string in TO-encoding.  Any characters which cannot be
 224    converted will be represented by '?'.
 225
 226    LENGTH should be the length of the string or -1, if null terminated.
 227
 228    The returned string will be allocated on POOL.
 229
 230    This function's behaviour differs from that of g_convert_with_fallback
 231    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 232    the input string is not valid in the declared input encoding.  This function
 233    however perseveres even in the presence of badly encoded input. */
 234 char *
 235 recode_string_pool (const char *to, const char *from,
 236                     const char *text, int length, struct pool *pool)
 237 {
 238   struct substring out;
 239
 240   if ( text == NULL )
 241     return NULL;
 242
 243   if ( length == -1 )
 244      length = strlen (text);
 245
 246   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 247   return out.string;
 248 }
 249
 250 /* Returns the name of the encoding that should be used for file names.
 251
 252    This is meant to be the same encoding used by g_filename_from_uri() and
 253    g_filename_to_uri() in GLib. */
 254 static const char *
 255 filename_encoding (void)
 256 {
 257 #if defined _WIN32 || defined __WIN32__
 258   return "UTF-8";
 259 #else
 260   return locale_charset ();
 261 #endif
 262 }
 263
 264 static char *
 265 xconcat2 (const char *a, size_t a_len,
 266           const char *b, size_t b_len)
 267 {
 268   char *s = xmalloc (a_len + b_len + 1);
 269   memcpy (s, a, a_len);
 270   memcpy (s + a_len, b, b_len);
 271   s[a_len + b_len] = '\0';
 272   return s;
 273 }
 274
 275 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 276    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 277    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 278    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 279    HEAD and tries again, repeating as necessary until the concatenated result
 280    fits or until HEAD_LEN reaches 0.
 281
 282    [*] Actually this function drops grapheme clusters instead of characters, so
 283        that, e.g. a Unicode character followed by a combining accent character
 284        is either completely included or completely excluded from HEAD_LEN.  See
 285        UAX #29 at http://unicode.org/reports/tr29/ for more information on
 286        grapheme clusters.
 287
 288    A null ENCODING is treated as UTF-8.
 289
 290    Sometimes this function has to actually construct the concatenated string to
 291    measure its length.  When this happens, it sets *RESULTP to that
 292    null-terminated string, allocated with malloc(), for the caller to use if it
 293    needs it.  Otherwise, it sets *RESULTP to NULL.
 294
 295    Simple examples for encoding="UTF-8", max_len=6:
 296
 297        head="abc",  tail="xyz"     => 3
 298        head="abcd", tail="xyz"     => 3 ("d" dropped).
 299        head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 300        head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 301
 302    Examples for encoding="ISO-8859-1", max_len=6:
 303
 304        head="éèä",  tail="xyz"     => 6
 305          (each letter in head is only 1 byte in ISO-8859-1 even though they
 306           each take 2 bytes in UTF-8 encoding)
 307 */
 308 static size_t
 309 utf8_encoding_concat__ (const char *head, size_t head_len,
 310                         const char *tail, size_t tail_len,
 311                         const char *encoding, size_t max_len,
 312                         char **resultp)
 313 {
 314   *resultp = NULL;
 315   if (head_len == 0)
 316     return 0;
 317   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 318     {
 319       if (head_len + tail_len <= max_len)
 320         return head_len;
 321       else if (tail_len >= max_len)
 322         return 0;
 323       else
 324         {
 325           size_t copy_len;
 326           ucs4_t prev;
 327           size_t ofs;
 328           int mblen;
 329
 330           copy_len = 0;
 331           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 332                                 head_len);
 333                ofs <= max_len - tail_len;
 334                ofs += mblen)
 335             {
 336               ucs4_t next;
 337
 338               mblen = u8_mbtouc (&next,
 339                                  CHAR_CAST (const uint8_t *, head + ofs),
 340                                  head_len - ofs);
 341               if (uc_is_grapheme_break (prev, next))
 342                 copy_len = ofs;
 343
 344               prev = next;
 345             }
 346
 347           return copy_len;
 348         }
 349     }
 350   else
 351     {
 352       char *result;
 353
 354       result = (tail_len > 0
 355                 ? xconcat2 (head, head_len, tail, tail_len)
 356                 : CONST_CAST (char *, head));
 357       if (recode_string_len (encoding, "UTF-8", result,
 358                              head_len + tail_len) <= max_len)
 359         {
 360           *resultp = result != head ? result : NULL;
 361           return head_len;
 362         }
 363       else
 364         {
 365           bool correct_result = false;
 366           size_t copy_len;
 367           ucs4_t prev;
 368           size_t ofs;
 369           int mblen;
 370
 371           copy_len = 0;
 372           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 373                                 head_len);
 374                ofs <= head_len;
 375                ofs += mblen)
 376             {
 377               ucs4_t next;
 378
 379               mblen = u8_mbtouc (&next,
 380                                  CHAR_CAST (const uint8_t *, head + ofs),
 381                                  head_len - ofs);
 382               if (uc_is_grapheme_break (prev, next))
 383                 {
 384                   if (result != head)
 385                     {
 386                       memcpy (result, head, ofs);
 387                       memcpy (result + ofs, tail, tail_len);
 388                       result[ofs + tail_len] = '\0';
 389                     }
 390
 391                   if (recode_string_len (encoding, "UTF-8", result,
 392                                          ofs + tail_len) <= max_len)
 393                     {
 394                       correct_result = true;
 395                       copy_len = ofs;
 396                     }
 397                   else
 398                     correct_result = false;
 399                 }
 400
 401               prev = next;
 402             }
 403
 404           if (result != head)
 405             {
 406               if (correct_result)
 407                 *resultp = result;
 408               else
 409                 free (result);
 410             }
 411
 412           return copy_len;
 413         }
 414     }
 415 }
 416
 417 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 418    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 419    string are all encoded in UTF-8.  As many characters[*] from the beginning
 420    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 421    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 422    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 423
 424    [*] Actually this function drops grapheme clusters instead of characters, so
 425        that, e.g. a Unicode character followed by a combining accent character
 426        is either completely included or completely excluded from the returned
 427        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 428        information on grapheme clusters.
 429
 430    A null ENCODING is treated as UTF-8.
 431
 432    Simple examples for encoding="UTF-8", max_len=6:
 433
 434        head="abc",  tail="xyz"     => "abcxyz"
 435        head="abcd", tail="xyz"     => "abcxyz"
 436        head="abc",  tail="uvwxyz"  => "uvwxyz"
 437        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 438
 439    Examples for encoding="ISO-8859-1", max_len=6:
 440
 441        head="éèä",  tail="xyz"    => "éèäxyz"
 442          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 443           each take 2 bytes in UTF-8 encoding)
 444 */
 445 char *
 446 utf8_encoding_concat (const char *head, const char *tail,
 447                       const char *encoding, size_t max_len)
 448 {
 449   size_t tail_len = strlen (tail);
 450   size_t prefix_len;
 451   char *result;
 452
 453   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 454                                        encoding, max_len, &result);
 455   return (result != NULL
 456           ? result
 457           : xconcat2 (head, prefix_len, tail, tail_len));
 458 }
 459
 460 /* Returns the length, in bytes, of the string that would be returned by
 461    utf8_encoding_concat() if passed the same arguments, but the implementation
 462    is often more efficient. */
 463 size_t
 464 utf8_encoding_concat_len (const char *head, const char *tail,
 465                           const char *encoding, size_t max_len)
 466 {
 467   size_t tail_len = strlen (tail);
 468   size_t prefix_len;
 469   char *result;
 470
 471   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 472                                        encoding, max_len, &result);
 473   free (result);
 474   return prefix_len + tail_len;
 475 }
 476
 477 /* Returns an allocated, null-terminated string, owned by the caller,
 478    containing as many characters[*] from the beginning of S that would fit
 479    within MAX_LEN bytes if the returned string were to be re-encoded in
 480    ENCODING.  Both S and the returned string are encoded in UTF-8.
 481
 482    [*] Actually this function drops grapheme clusters instead of characters, so
 483        that, e.g. a Unicode character followed by a combining accent character
 484        is either completely included or completely excluded from the returned
 485        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 486        information on grapheme clusters.
 487
 488    A null ENCODING is treated as UTF-8.
 489 */
 490 char *
 491 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 492 {
 493   return utf8_encoding_concat (s, "", encoding, max_len);
 494 }
 495
 496 /* Returns the length, in bytes, of the string that would be returned by
 497    utf8_encoding_trunc() if passed the same arguments, but the implementation
 498    is often more efficient. */
 499 size_t
 500 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 501 {
 502   return utf8_encoding_concat_len (s, "", encoding, max_len);
 503 }
 504
 505 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 506    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 507    current locale. */
 508 char *
 509 utf8_to_filename (const char *filename)
 510 {
 511   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 512 }
 513
 514 /* Returns FILENAME converted from the filename encoding to UTF-8.
 515    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 516    current locale. */
 517 char *
 518 filename_to_utf8 (const char *filename)
 519 {
 520   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 521 }
 522
 523 static int
 524 recode_substring_pool__ (const char *to, const char *from,
 525                          struct substring text, char fallbackchar,
 526                          struct pool *pool, struct substring *out)
 527 {
 528   size_t bufsize;
 529   iconv_t conv ;
 530
 531   if (to == NULL)
 532     to = default_encoding;
 533
 534   if (from == NULL)
 535     from = default_encoding;
 536
 537   conv = create_iconv (to, from);
 538
 539   if ( (iconv_t) -1 == conv )
 540     {
 541       if (fallbackchar)
 542         {
 543           out->string = pool_malloc (pool, text.length + 1);
 544           out->length = text.length;
 545           memcpy (out->string, text.string, text.length);
 546           out->string[out->length] = '\0';
 547           return 0;
 548         }
 549       else
 550         return EPROTO;
 551     }
 552
 553   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 554     {
 555       char *output = pool_malloc (pool, bufsize);
 556       ssize_t retval;
 557
 558       retval = try_recode (conv, fallbackchar, text.string, text.length,
 559                            output, bufsize);
 560       if (retval >= 0)
 561         {
 562           *out = ss_buffer (output, retval);
 563           return 0;
 564         }
 565       pool_free (pool, output);
 566
 567       if (retval != -E2BIG)
 568         return -retval;
 569     }
 570
 571   NOT_REACHED ();
 572 }
 573
 574 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 575    dynamically allocated string in TO-encoding.  Any characters which cannot be
 576    converted will be represented by '?'.
 577
 578    The returned string will be null-terminated and allocated on POOL with
 579    pool_malloc().
 580
 581    This function's behaviour differs from that of g_convert_with_fallback
 582    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 583    the input string is not valid in the declared input encoding.  This function
 584    however perseveres even in the presence of badly encoded input. */
 585 struct substring
 586 recode_substring_pool (const char *to, const char *from,
 587                        struct substring text, struct pool *pool)
 588 {
 589   struct substring out;
 590
 591   recode_substring_pool__ (to, from, text, '?', pool, &out);
 592   return out;
 593 }
 594
 595 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 596    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 597    converted null-terminated string, allocated from POOL with pool_malloc(), is
 598    stored in *OUT.  On failure, returns a positive errno value.
 599
 600    The function fails with an error if any part of the input string is not
 601    valid in the declared input encoding. */
 602 int
 603 recode_pedantically (const char *to, const char *from,
 604                      struct substring text, struct pool *pool,
 605                      struct substring *out)
 606 {
 607   int error;
 608
 609   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 610   if (error)
 611     *out = ss_empty ();
 612   return error;
 613 }
 614 \f
 615 void
 616 i18n_init (void)
 617 {
 618   setlocale (LC_ALL, "");
 619   bindtextdomain (PACKAGE, relocate(locale_dir));
 620   textdomain (PACKAGE);
 621
 622   assert (default_encoding == NULL);
 623   default_encoding = xstrdup (locale_charset ());
 624
 625   hmapx_init (&map);
 626 }
 627
 628 const char *
 629 get_default_encoding (void)
 630 {
 631   return default_encoding;
 632 }
 633
 634 void
 635 set_default_encoding (const char *enc)
 636 {
 637   free (default_encoding);
 638   default_encoding = xstrdup (enc);
 639 }
 640
 641
 642 /* Attempts to set the encoding from a locale name
 643    returns true if successfull.
 644    This function does not (should not!) alter the current locale.
 645 */
 646 bool
 647 set_encoding_from_locale (const char *loc)
 648 {
 649   bool ok = true;
 650   char *c_encoding;
 651   char *loc_encoding;
 652   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 653
 654   setlocale (LC_CTYPE, "C");
 655   c_encoding = xstrdup (locale_charset ());
 656
 657   setlocale (LC_CTYPE, loc);
 658   loc_encoding = xstrdup (locale_charset ());
 659
 660
 661   if ( 0 == strcmp (loc_encoding, c_encoding))
 662     {
 663       ok = false;
 664     }
 665
 666   setlocale (LC_CTYPE, tmp);
 667
 668   free (tmp);
 669
 670   if (ok)
 671     {
 672       free (default_encoding);
 673       default_encoding = loc_encoding;
 674     }
 675   else
 676     free (loc_encoding);
 677
 678   free (c_encoding);
 679
 680   return ok;
 681 }
 682
 683 void
 684 i18n_done (void)
 685 {
 686   struct hmapx_node *node;
 687   struct converter *cvtr;
 688
 689   HMAPX_FOR_EACH (cvtr, node, &map)
 690     {
 691       free (cvtr->tocode);
 692       free (cvtr->fromcode);
 693       if (cvtr->conv != (iconv_t) -1)
 694         iconv_close (cvtr->conv);
 695       free (cvtr);
 696     }
 697
 698   hmapx_destroy (&map);
 699
 700   free (default_encoding);
 701   default_encoding = NULL;
 702 }
 703
 704
 705
 706 bool
 707 valid_encoding (const char *enc)
 708 {
 709   iconv_t conv = iconv_open (UTF8, enc);
 710
 711   if ( conv == (iconv_t) -1)
 712     return false;
 713
 714   iconv_close (conv);
 715
 716   return true;
 717 }
 718
 719
 720 /* Return the system local's idea of the
 721    decimal seperator character */
 722 char
 723 get_system_decimal (void)
 724 {
 725   char radix_char;
 726
 727 #if HAVE_NL_LANGINFO
 728   radix_char = nl_langinfo (RADIXCHAR)[0];
 729 #else
 730   {
 731     char buf[10];
 732     snprintf (buf, sizeof buf, "%f", 2.5);
 733     radix_char = buf[1];
 734   }
 735 #endif
 736
 737   return radix_char;
 738 }
 739
 740 const char *
 741 uc_name (ucs4_t uc, char buffer[16])
 742 {
 743   if (uc >= 0x20 && uc < 0x7f)
 744     snprintf (buffer, 16, "`%c'", uc);
 745   else
 746     snprintf (buffer, 16, "U+%04X", uc);
 747   return buffer;
 748 }
 749 \f
 750 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 751
 752 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 753    with lowercase and uppercase letters treated as equal, starting from
 754    BASIS. */
 755 unsigned int
 756 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 757 {
 758   uint8_t folded_buf[2048];
 759   size_t folded_len = sizeof folded_buf;
 760   uint8_t *folded_s;
 761   unsigned int hash;
 762
 763   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 764                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 765   if (folded_s != NULL)
 766     {
 767       hash = hash_bytes (folded_s, folded_len, basis);
 768       if (folded_s != folded_buf)
 769         free (folded_s);
 770     }
 771   else
 772     {
 773       if (errno == ENOMEM)
 774         xalloc_die ();
 775       hash = hash_bytes (s, n, basis);
 776     }
 777
 778   return hash;
 779 }
 780
 781 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 782    uppercase letters treated as equal, starting from BASIS. */
 783 unsigned int
 784 utf8_hash_case_string (const char *s, unsigned int basis)
 785 {
 786   return utf8_hash_case_bytes (s, strlen (s), basis);
 787 }
 788
 789 /* Compares UTF-8 strings A and B case-insensitively.
 790    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 791 int
 792 utf8_strcasecmp (const char *a, const char *b)
 793 {
 794   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 795 }
 796
 797 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 798    case-insensitively.
 799    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 800 int
 801 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 802 {
 803   int result;
 804
 805   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 806                   CHAR_CAST (const uint8_t *, b), bn,
 807                   NULL, UNINORM_NFKD, &result))
 808     {
 809       if (errno == ENOMEM)
 810         xalloc_die ();
 811
 812       result = memcmp (a, b, MIN (an, bn));
 813       if (result == 0)
 814         result = an < bn ? -1 : an > bn;
 815     }
 816
 817   return result;
 818 }
 819
 820 static char *
 821 utf8_casemap (const char *s,
 822               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 823                              uint8_t *, size_t *))
 824 {
 825   char *result;
 826   size_t size;
 827
 828   result = CHAR_CAST (char *,
 829                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 830                          NULL, NULL, NULL, &size));
 831   if (result == NULL)
 832     {
 833       if (errno == ENOMEM)
 834         xalloc_die ();
 835
 836       result = xstrdup (s);
 837     }
 838   return result;
 839 }
 840
 841 char *
 842 utf8_to_upper (const char *s)
 843 {
 844   return utf8_casemap (s, u8_toupper);
 845 }
 846
 847 char *
 848 utf8_to_lower (const char *s)
 849 {
 850   return utf8_casemap (s, u8_tolower);
 851 }
 852 \f
 853 bool
 854 get_encoding_info (struct encoding_info *e, const char *name)
 855 {
 856   const struct substring in = SS_LITERAL_INITIALIZER (
 857     "\t\n\v\f\r "
 858     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 859     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 860     "abcdefghijklmnopqrstuvwxyz{|}~");
 861
 862   struct substring out, cr, lf, space;
 863   bool ok;
 864
 865   memset (e, 0, sizeof *e);
 866
 867   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 868   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 869   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 870   ok = (cr.length >= 1
 871         && cr.length <= MAX_UNIT
 872         && cr.length == lf.length
 873         && cr.length == space.length);
 874   if (!ok)
 875     {
 876       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 877       ss_dealloc (&cr);
 878       ss_dealloc (&lf);
 879       ss_dealloc (&space);
 880       ss_alloc_substring (&cr, ss_cstr ("\r"));
 881       ss_alloc_substring (&lf, ss_cstr ("\n"));
 882       ss_alloc_substring (&space, ss_cstr (" "));
 883     }
 884
 885   e->unit = cr.length;
 886   memcpy (e->cr, cr.string, e->unit);
 887   memcpy (e->lf, lf.string, e->unit);
 888   memcpy (e->space, space.string, e->unit);
 889
 890   ss_dealloc (&cr);
 891   ss_dealloc (&lf);
 892   ss_dealloc (&space);
 893
 894   out = recode_substring_pool ("UTF-8", name, in, NULL);
 895   e->is_ascii_compatible = ss_equals (in, out);
 896   ss_dealloc (&out);
 897
 898   if (!e->is_ascii_compatible && e->unit == 1)
 899     {
 900       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 901       e->is_ebcdic_compatible = (out.length == 1
 902                                  && (uint8_t) out.string[0] == 0xc1);
 903       ss_dealloc (&out);
 904     }
 905   else
 906     e->is_ebcdic_compatible = false;
 907
 908   return ok;
 909 }
 910
 911 bool
 912 is_encoding_ascii_compatible (const char *encoding)
 913 {
 914   struct encoding_info e;
 915
 916   get_encoding_info (&e, encoding);
 917   return e.is_ascii_compatible;
 918 }
 919
 920 bool
 921 is_encoding_ebcdic_compatible (const char *encoding)
 922 {
 923   struct encoding_info e;
 924
 925   get_encoding_info (&e, encoding);
 926   return e.is_ebcdic_compatible;
 927 }
 928
 929 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 930    otherwise false. */
 931 bool
 932 is_encoding_supported (const char *encoding)
 933 {
 934   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
 935           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 936 }
 937
 938 /* Returns true if E is the name of a UTF-8 encoding.
 939
 940    XXX Possibly we should test not E as a string but its properties via
 941    iconv. */
 942 bool
 943 is_encoding_utf8 (const char *e)
 944 {
 945   return ((e[0] == 'u' || e[0] == 'U')
 946           && (e[1] == 't' || e[1] == 'T')
 947           && (e[2] == 'f' || e[2] == 'F')
 948           && ((e[3] == '8' && e[4] == '\0')
 949               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 950 }
 951 \f
 952 static struct encoding_category *categories;
 953 static int n_categories;
 954
 955 static void SENTINEL (0)
 956 add_category (size_t *allocated_categories, const char *category, ...)
 957 {
 958   struct encoding_category *c;
 959   const char *encodings[16];
 960   va_list args;
 961   int i, n;
 962
 963   /* Count encoding arguments. */
 964   va_start (args, category);
 965   n = 0;
 966   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 967     {
 968       const char *encoding = encodings[n];
 969       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 970         n++;
 971     }
 972   assert (n < sizeof encodings / sizeof *encodings);
 973   va_end (args);
 974
 975   if (n == 0)
 976     return;
 977
 978   if (n_categories >= *allocated_categories)
 979     categories = x2nrealloc (categories,
 980                              allocated_categories, sizeof *categories);
 981
 982   c = &categories[n_categories++];
 983   c->category = category;
 984   c->encodings = xmalloc (n * sizeof *c->encodings);
 985   for (i = 0; i < n; i++)
 986     c->encodings[i] = encodings[i];
 987   c->n_encodings = n;
 988 }
 989
 990 static void
 991 init_encoding_categories (void)
 992 {
 993   static bool inited;
 994   size_t alloc;
 995
 996   if (inited)
 997     return;
 998   inited = true;
 999
1000   alloc = 0;
1001   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1002                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1003   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1004                 NULL_SENTINEL);
1005   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1006   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1007                 "Windows-1257", NULL_SENTINEL);
1008   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1009   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1010                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1011   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1012                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1013   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1014                 "EUC-TW", NULL_SENTINEL);
1015   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1016   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1017                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1018   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1019   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1020                 NULL_SENTINEL);
1021   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1022   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1023   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1024   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1025   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1026                 NULL_SENTINEL);
1027   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1028   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1029   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1030   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1031                 NULL_SENTINEL);
1032   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1033                 NULL_SENTINEL);
1034   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1035   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1036                 NULL_SENTINEL);
1037   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1038   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1039                 NULL_SENTINEL);
1040   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1041                 NULL_SENTINEL);
1042   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1043                 "Windows-1258", NULL_SENTINEL);
1044   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1045                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1046 }
1047
1048 /* Returns an array of "struct encoding_category" that contains only the
1049    categories and encodings that the system supports. */
1050 struct encoding_category *
1051 get_encoding_categories (void)
1052 {
1053   init_encoding_categories ();
1054   return categories;
1055 }
1056
1057 /* Returns the number of elements in the array returned by
1058    get_encoding_categories().  */
1059 size_t
1060 get_n_encoding_categories (void)
1061 {
1062   init_encoding_categories ();
1063   return n_categories;
1064 }