pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <libintl.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/compiler.h"
  35 #include "libpspp/hmapx.h"
  36 #include "libpspp/hash-functions.h"
  37 #include "libpspp/pool.h"
  38 #include "libpspp/str.h"
  39 #include "libpspp/version.h"
  40
  41 #include "gl/c-strcase.h"
  42 #include "gl/localcharset.h"
  43 #include "gl/minmax.h"
  44 #include "gl/xalloc.h"
  45 #include "gl/relocatable.h"
  46 #include "gl/xstrndup.h"
  47
  48 #include "gettext.h"
  49 #define _(msgid) gettext (msgid)
  50
  51 struct converter
  52  {
  53     char *tocode;
  54     char *fromcode;
  55     iconv_t conv;
  56     int error;
  57   };
  58
  59 static char *default_encoding;
  60 static struct hmapx map;
  61
  62 /* A wrapper around iconv_open */
  63 static struct converter *
  64 create_iconv__ (const char* tocode, const char* fromcode)
  65 {
  66   size_t hash;
  67   struct hmapx_node *node;
  68   struct converter *converter;
  69   assert (fromcode);
  70
  71   hash = hash_string (tocode, hash_string (fromcode, 0));
  72   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  73     if (!strcmp (tocode, converter->tocode)
  74         && !strcmp (fromcode, converter->fromcode))
  75       return converter;
  76
  77   converter = xmalloc (sizeof *converter);
  78   converter->tocode = xstrdup (tocode);
  79   converter->fromcode = xstrdup (fromcode);
  80   converter->conv = iconv_open (tocode, fromcode);
  81   converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
  82   hmapx_insert (&map, converter, hash);
  83
  84   return converter;
  85 }
  86
  87 static iconv_t
  88 create_iconv (const char* tocode, const char* fromcode)
  89 {
  90   struct converter *converter;
  91
  92   converter = create_iconv__ (tocode, fromcode);
  93
  94   /* I don't think it's safe to translate this string or to use messaging
  95      as the converters have not yet been set up */
  96   if (converter->error && strcmp (tocode, fromcode))
  97     {
  98       fprintf (stderr,
  99                "Warning: "
 100                "cannot create a converter for `%s' to `%s': %s\n",
 101                fromcode, tocode, strerror (converter->error));
 102       converter->error = 0;
 103     }
 104
 105   return converter->conv;
 106 }
 107
 108 /* Converts the single byte C from encoding FROM to TO, returning the first
 109    byte of the result.
 110
 111    This function probably shouldn't be used at all, but some code still does
 112    use it. */
 113 char
 114 recode_byte (const char *to, const char *from, char c)
 115 {
 116   char x;
 117   char *s = recode_string (to, from, &c, 1);
 118   x = s[0];
 119   free (s);
 120   return x;
 121 }
 122
 123 /* Similar to recode_string_pool, but allocates the returned value on the heap
 124    instead of in a pool.  It is the caller's responsibility to free the
 125    returned value. */
 126 char *
 127 recode_string (const char *to, const char *from,
 128                const char *text, int length)
 129 {
 130   return recode_string_pool (to, from, text, length, NULL);
 131 }
 132
 133 /* Returns the length, in bytes, of the string that a similar recode_string()
 134    call would return. */
 135 size_t
 136 recode_string_len (const char *to, const char *from,
 137                    const char *text, int length)
 138 {
 139   char *s = recode_string (to, from, text, length);
 140   size_t len = strlen (s);
 141   free (s);
 142   return len;
 143 }
 144
 145 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 146    at OP, and appends a null terminator to the output.
 147
 148    Returns the output length if successful, -1 if the output buffer is too
 149    small. */
 150 static ssize_t
 151 try_recode (iconv_t conv,
 152             const char *ip, size_t inbytes,
 153             char *op_, size_t outbytes)
 154 {
 155   /* FIXME: Need to ensure that this char is valid in the target encoding */
 156   const char fallbackchar = '?';
 157   char *op = op_;
 158
 159   /* Put the converter into the initial shift state, in case there was any
 160      state information left over from its last usage. */
 161   iconv (conv, NULL, 0, NULL, 0);
 162
 163   while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
 164                 &op, &outbytes) == -1)
 165     switch (errno)
 166       {
 167       case EINVAL:
 168         if (outbytes < 2)
 169           return -1;
 170         *op++ = fallbackchar;
 171         *op = '\0';
 172         return op - op_;
 173
 174       case EILSEQ:
 175         if (outbytes == 0)
 176           return -1;
 177         *op++ = fallbackchar;
 178         outbytes--;
 179         ip++;
 180         inbytes--;
 181         break;
 182
 183       case E2BIG:
 184         return -1;
 185
 186       default:
 187         /* should never happen */
 188         fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
 189         NOT_REACHED ();
 190         break;
 191       }
 192
 193   if (outbytes == 0)
 194     return -1;
 195
 196   *op = '\0';
 197   return op - op_;
 198 }
 199
 200 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 201    dynamically allocated string in TO-encoding.  Any characters which cannot be
 202    converted will be represented by '?'.
 203
 204    LENGTH should be the length of the string or -1, if null terminated.
 205
 206    The returned string will be allocated on POOL.
 207
 208    This function's behaviour differs from that of g_convert_with_fallback
 209    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 210    the input string is not valid in the declared input encoding.  This function
 211    however perseveres even in the presence of badly encoded input. */
 212 char *
 213 recode_string_pool (const char *to, const char *from,
 214                     const char *text, int length, struct pool *pool)
 215 {
 216   struct substring out;
 217
 218   if ( text == NULL )
 219     return NULL;
 220
 221   if ( length == -1 )
 222      length = strlen (text);
 223
 224   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 225   return out.string;
 226 }
 227
 228 /* Returns the name of the encoding that should be used for file names.
 229
 230    This is meant to be the same encoding used by g_filename_from_uri() and
 231    g_filename_to_uri() in GLib. */
 232 static const char *
 233 filename_encoding (void)
 234 {
 235 #if defined _WIN32 || defined __WIN32__
 236   return "UTF-8";
 237 #else
 238   return locale_charset ();
 239 #endif
 240 }
 241
 242 static char *
 243 xconcat2 (const char *a, size_t a_len,
 244           const char *b, size_t b_len)
 245 {
 246   char *s = xmalloc (a_len + b_len + 1);
 247   memcpy (s, a, a_len);
 248   memcpy (s + a_len, b, b_len);
 249   s[a_len + b_len] = '\0';
 250   return s;
 251 }
 252
 253 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 254    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 255    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 256    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 257    HEAD and tries again, repeating as necessary until the concatenated result
 258    fits or until HEAD_LEN reaches 0.
 259
 260    [*] Actually this function drops grapheme clusters instead of characters, so
 261        that, e.g. a Unicode character followed by a combining accent character
 262        is either completely included or completely excluded from HEAD_LEN.  See
 263        UAX #29 at http://unicode.org/reports/tr29/ for more information on
 264        grapheme clusters.
 265
 266    A null ENCODING is treated as UTF-8.
 267
 268    Sometimes this function has to actually construct the concatenated string to
 269    measure its length.  When this happens, it sets *RESULTP to that
 270    null-terminated string, allocated with malloc(), for the caller to use if it
 271    needs it.  Otherwise, it sets *RESULTP to NULL.
 272
 273    Simple examples for encoding="UTF-8", max_len=6:
 274
 275        head="abc",  tail="xyz"     => 3
 276        head="abcd", tail="xyz"     => 3 ("d" dropped).
 277        head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 278        head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 279
 280    Examples for encoding="ISO-8859-1", max_len=6:
 281
 282        head="éèä",  tail="xyz"     => 6
 283          (each letter in head is only 1 byte in ISO-8859-1 even though they
 284           each take 2 bytes in UTF-8 encoding)
 285 */
 286 static size_t
 287 utf8_encoding_concat__ (const char *head, size_t head_len,
 288                         const char *tail, size_t tail_len,
 289                         const char *encoding, size_t max_len,
 290                         char **resultp)
 291 {
 292   *resultp = NULL;
 293   if (head_len == 0)
 294     return 0;
 295   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 296     {
 297       if (head_len + tail_len <= max_len)
 298         return head_len;
 299       else if (tail_len >= max_len)
 300         return 0;
 301       else
 302         {
 303           size_t copy_len;
 304           ucs4_t prev;
 305           size_t ofs;
 306           int mblen;
 307
 308           copy_len = 0;
 309           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 310                                 head_len);
 311                ofs <= max_len - tail_len;
 312                ofs += mblen)
 313             {
 314               ucs4_t next;
 315
 316               mblen = u8_mbtouc (&next,
 317                                  CHAR_CAST (const uint8_t *, head + ofs),
 318                                  head_len - ofs);
 319               if (uc_is_grapheme_break (prev, next))
 320                 copy_len = ofs;
 321
 322               prev = next;
 323             }
 324
 325           return copy_len;
 326         }
 327     }
 328   else
 329     {
 330       char *result;
 331
 332       result = (tail_len > 0
 333                 ? xconcat2 (head, head_len, tail, tail_len)
 334                 : CONST_CAST (char *, head));
 335       if (recode_string_len (encoding, "UTF-8", result,
 336                              head_len + tail_len) <= max_len)
 337         {
 338           *resultp = result != head ? result : NULL;
 339           return head_len;
 340         }
 341       else
 342         {
 343           bool correct_result = false;
 344           size_t copy_len;
 345           ucs4_t prev;
 346           size_t ofs;
 347           int mblen;
 348
 349           copy_len = 0;
 350           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 351                                 head_len);
 352                ofs <= head_len;
 353                ofs += mblen)
 354             {
 355               ucs4_t next;
 356
 357               mblen = u8_mbtouc (&next,
 358                                  CHAR_CAST (const uint8_t *, head + ofs),
 359                                  head_len - ofs);
 360               if (uc_is_grapheme_break (prev, next))
 361                 {
 362                   if (result != head)
 363                     {
 364                       memcpy (result, head, ofs);
 365                       memcpy (result + ofs, tail, tail_len);
 366                       result[ofs + tail_len] = '\0';
 367                     }
 368
 369                   if (recode_string_len (encoding, "UTF-8", result,
 370                                          ofs + tail_len) <= max_len)
 371                     {
 372                       correct_result = true;
 373                       copy_len = ofs;
 374                     }
 375                   else
 376                     correct_result = false;
 377                 }
 378
 379               prev = next;
 380             }
 381
 382           if (result != head)
 383             {
 384               if (correct_result)
 385                 *resultp = result;
 386               else
 387                 free (result);
 388             }
 389
 390           return copy_len;
 391         }
 392     }
 393 }
 394
 395 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 396    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 397    string are all encoded in UTF-8.  As many characters[*] from the beginning
 398    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 399    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 400    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 401
 402    [*] Actually this function drops grapheme clusters instead of characters, so
 403        that, e.g. a Unicode character followed by a combining accent character
 404        is either completely included or completely excluded from the returned
 405        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 406        information on grapheme clusters.
 407
 408    A null ENCODING is treated as UTF-8.
 409
 410    Simple examples for encoding="UTF-8", max_len=6:
 411
 412        head="abc",  tail="xyz"     => "abcxyz"
 413        head="abcd", tail="xyz"     => "abcxyz"
 414        head="abc",  tail="uvwxyz"  => "uvwxyz"
 415        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 416
 417    Examples for encoding="ISO-8859-1", max_len=6:
 418
 419        head="éèä",  tail="xyz"    => "éèäxyz"
 420          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 421           each take 2 bytes in UTF-8 encoding)
 422 */
 423 char *
 424 utf8_encoding_concat (const char *head, const char *tail,
 425                       const char *encoding, size_t max_len)
 426 {
 427   size_t tail_len = strlen (tail);
 428   size_t prefix_len;
 429   char *result;
 430
 431   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 432                                        encoding, max_len, &result);
 433   return (result != NULL
 434           ? result
 435           : xconcat2 (head, prefix_len, tail, tail_len));
 436 }
 437
 438 /* Returns the length, in bytes, of the string that would be returned by
 439    utf8_encoding_concat() if passed the same arguments, but the implementation
 440    is often more efficient. */
 441 size_t
 442 utf8_encoding_concat_len (const char *head, const char *tail,
 443                           const char *encoding, size_t max_len)
 444 {
 445   size_t tail_len = strlen (tail);
 446   size_t prefix_len;
 447   char *result;
 448
 449   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 450                                        encoding, max_len, &result);
 451   free (result);
 452   return prefix_len + tail_len;
 453 }
 454
 455 /* Returns an allocated, null-terminated string, owned by the caller,
 456    containing as many characters[*] from the beginning of S that would fit
 457    within MAX_LEN bytes if the returned string were to be re-encoded in
 458    ENCODING.  Both S and the returned string are encoded in UTF-8.
 459
 460    [*] Actually this function drops grapheme clusters instead of characters, so
 461        that, e.g. a Unicode character followed by a combining accent character
 462        is either completely included or completely excluded from the returned
 463        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 464        information on grapheme clusters.
 465
 466    A null ENCODING is treated as UTF-8.
 467 */
 468 char *
 469 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 470 {
 471   return utf8_encoding_concat (s, "", encoding, max_len);
 472 }
 473
 474 /* Returns the length, in bytes, of the string that would be returned by
 475    utf8_encoding_trunc() if passed the same arguments, but the implementation
 476    is often more efficient. */
 477 size_t
 478 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 479 {
 480   return utf8_encoding_concat_len (s, "", encoding, max_len);
 481 }
 482
 483 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 484    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 485    current locale. */
 486 char *
 487 utf8_to_filename (const char *filename)
 488 {
 489   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 490 }
 491
 492 /* Returns FILENAME converted from the filename encoding to UTF-8.
 493    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 494    current locale. */
 495 char *
 496 filename_to_utf8 (const char *filename)
 497 {
 498   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 499 }
 500
 501 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 502    dynamically allocated string in TO-encoding.  Any characters which cannot be
 503    converted will be represented by '?'.
 504
 505    The returned string will be null-terminated and allocated on POOL.
 506
 507    This function's behaviour differs from that of g_convert_with_fallback
 508    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 509    the input string is not valid in the declared input encoding.  This function
 510    however perseveres even in the presence of badly encoded input. */
 511 struct substring
 512 recode_substring_pool (const char *to, const char *from,
 513                        struct substring text, struct pool *pool)
 514 {
 515   size_t outbufferlength;
 516   iconv_t conv ;
 517
 518   if (to == NULL)
 519     to = default_encoding;
 520
 521   if (from == NULL)
 522     from = default_encoding;
 523
 524   conv = create_iconv (to, from);
 525
 526   if ( (iconv_t) -1 == conv )
 527     {
 528       struct substring out;
 529       ss_alloc_substring_pool (&out, text, pool);
 530       return out;
 531     }
 532
 533   for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
 534     if ( outbufferlength > text.length)
 535       {
 536         char *output = pool_malloc (pool, outbufferlength);
 537         ssize_t output_len = try_recode (conv, text.string, text.length,
 538                                          output, outbufferlength);
 539         if (output_len >= 0)
 540           return ss_buffer (output, output_len);
 541         pool_free (pool, output);
 542       }
 543
 544   NOT_REACHED ();
 545 }
 546
 547 void
 548 i18n_init (void)
 549 {
 550   setlocale (LC_ALL, "");
 551   bindtextdomain (PACKAGE, relocate(locale_dir));
 552   textdomain (PACKAGE);
 553
 554   assert (default_encoding == NULL);
 555   default_encoding = xstrdup (locale_charset ());
 556
 557   hmapx_init (&map);
 558 }
 559
 560 const char *
 561 get_default_encoding (void)
 562 {
 563   return default_encoding;
 564 }
 565
 566 void
 567 set_default_encoding (const char *enc)
 568 {
 569   free (default_encoding);
 570   default_encoding = xstrdup (enc);
 571 }
 572
 573
 574 /* Attempts to set the encoding from a locale name
 575    returns true if successfull.
 576    This function does not (should not!) alter the current locale.
 577 */
 578 bool
 579 set_encoding_from_locale (const char *loc)
 580 {
 581   bool ok = true;
 582   char *c_encoding;
 583   char *loc_encoding;
 584   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 585
 586   setlocale (LC_CTYPE, "C");
 587   c_encoding = xstrdup (locale_charset ());
 588
 589   setlocale (LC_CTYPE, loc);
 590   loc_encoding = xstrdup (locale_charset ());
 591
 592
 593   if ( 0 == strcmp (loc_encoding, c_encoding))
 594     {
 595       ok = false;
 596     }
 597
 598   setlocale (LC_CTYPE, tmp);
 599
 600   free (tmp);
 601
 602   if (ok)
 603     {
 604       free (default_encoding);
 605       default_encoding = loc_encoding;
 606     }
 607   else
 608     free (loc_encoding);
 609
 610   free (c_encoding);
 611
 612   return ok;
 613 }
 614
 615 void
 616 i18n_done (void)
 617 {
 618   struct hmapx_node *node;
 619   struct converter *cvtr;
 620
 621   HMAPX_FOR_EACH (cvtr, node, &map)
 622     {
 623       free (cvtr->tocode);
 624       free (cvtr->fromcode);
 625       if (cvtr->conv != (iconv_t) -1)
 626         iconv_close (cvtr->conv);
 627       free (cvtr);
 628     }
 629
 630   hmapx_destroy (&map);
 631
 632   free (default_encoding);
 633   default_encoding = NULL;
 634 }
 635
 636
 637
 638 bool
 639 valid_encoding (const char *enc)
 640 {
 641   iconv_t conv = iconv_open (UTF8, enc);
 642
 643   if ( conv == (iconv_t) -1)
 644     return false;
 645
 646   iconv_close (conv);
 647
 648   return true;
 649 }
 650
 651
 652 /* Return the system local's idea of the
 653    decimal seperator character */
 654 char
 655 get_system_decimal (void)
 656 {
 657   char radix_char;
 658
 659 #if HAVE_NL_LANGINFO
 660   radix_char = nl_langinfo (RADIXCHAR)[0];
 661 #else
 662   {
 663     char buf[10];
 664     snprintf (buf, sizeof buf, "%f", 2.5);
 665     radix_char = buf[1];
 666   }
 667 #endif
 668
 669   return radix_char;
 670 }
 671
 672 const char *
 673 uc_name (ucs4_t uc, char buffer[16])
 674 {
 675   if (uc >= 0x20 && uc < 0x7f)
 676     snprintf (buffer, 16, "`%c'", uc);
 677   else
 678     snprintf (buffer, 16, "U+%04X", uc);
 679   return buffer;
 680 }
 681 \f
 682 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 683
 684 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 685    with lowercase and uppercase letters treated as equal, starting from
 686    BASIS. */
 687 unsigned int
 688 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 689 {
 690   uint8_t folded_buf[2048];
 691   size_t folded_len = sizeof folded_buf;
 692   uint8_t *folded_s;
 693   unsigned int hash;
 694
 695   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 696                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 697   if (folded_s != NULL)
 698     {
 699       hash = hash_bytes (folded_s, folded_len, basis);
 700       if (folded_s != folded_buf)
 701         free (folded_s);
 702     }
 703   else
 704     {
 705       if (errno == ENOMEM)
 706         xalloc_die ();
 707       hash = hash_bytes (s, n, basis);
 708     }
 709
 710   return hash;
 711 }
 712
 713 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 714    uppercase letters treated as equal, starting from BASIS. */
 715 unsigned int
 716 utf8_hash_case_string (const char *s, unsigned int basis)
 717 {
 718   return utf8_hash_case_bytes (s, strlen (s), basis);
 719 }
 720
 721 /* Compares UTF-8 strings A and B case-insensitively.
 722    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 723 int
 724 utf8_strcasecmp (const char *a, const char *b)
 725 {
 726   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 727 }
 728
 729 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 730    case-insensitively.
 731    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 732 int
 733 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 734 {
 735   int result;
 736
 737   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 738                   CHAR_CAST (const uint8_t *, b), bn,
 739                   NULL, UNINORM_NFKD, &result))
 740     {
 741       if (errno == ENOMEM)
 742         xalloc_die ();
 743
 744       result = memcmp (a, b, MIN (an, bn));
 745       if (result == 0)
 746         result = an < bn ? -1 : an > bn;
 747     }
 748
 749   return result;
 750 }
 751
 752 static char *
 753 utf8_casemap (const char *s,
 754               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 755                              uint8_t *, size_t *))
 756 {
 757   char *result;
 758   size_t size;
 759
 760   result = CHAR_CAST (char *,
 761                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 762                          NULL, NULL, NULL, &size));
 763   if (result == NULL)
 764     {
 765       if (errno == ENOMEM)
 766         xalloc_die ();
 767
 768       result = xstrdup (s);
 769     }
 770   return result;
 771 }
 772
 773 char *
 774 utf8_to_upper (const char *s)
 775 {
 776   return utf8_casemap (s, u8_toupper);
 777 }
 778
 779 char *
 780 utf8_to_lower (const char *s)
 781 {
 782   return utf8_casemap (s, u8_tolower);
 783 }
 784 \f
 785 bool
 786 get_encoding_info (struct encoding_info *e, const char *name)
 787 {
 788   const struct substring in = SS_LITERAL_INITIALIZER (
 789     "\t\n\v\f\r "
 790     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 791     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 792     "abcdefghijklmnopqrstuvwxyz{|}~");
 793
 794   struct substring out, cr, lf, space;
 795   bool ok;
 796
 797   memset (e, 0, sizeof *e);
 798
 799   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 800   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 801   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 802   ok = (cr.length >= 1
 803         && cr.length <= MAX_UNIT
 804         && cr.length == lf.length
 805         && cr.length == space.length);
 806   if (!ok)
 807     {
 808       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 809       ss_dealloc (&cr);
 810       ss_dealloc (&lf);
 811       ss_dealloc (&space);
 812       ss_alloc_substring (&cr, ss_cstr ("\r"));
 813       ss_alloc_substring (&lf, ss_cstr ("\n"));
 814       ss_alloc_substring (&space, ss_cstr (" "));
 815     }
 816
 817   e->unit = cr.length;
 818   memcpy (e->cr, cr.string, e->unit);
 819   memcpy (e->lf, lf.string, e->unit);
 820   memcpy (e->space, space.string, e->unit);
 821
 822   ss_dealloc (&cr);
 823   ss_dealloc (&lf);
 824   ss_dealloc (&space);
 825
 826   out = recode_substring_pool ("UTF-8", name, in, NULL);
 827   e->is_ascii_compatible = ss_equals (in, out);
 828   ss_dealloc (&out);
 829
 830   if (!e->is_ascii_compatible && e->unit == 1)
 831     {
 832       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 833       e->is_ebcdic_compatible = (out.length == 1
 834                                  && (uint8_t) out.string[0] == 0xc1);
 835       ss_dealloc (&out);
 836     }
 837   else
 838     e->is_ebcdic_compatible = false;
 839
 840   return ok;
 841 }
 842
 843 bool
 844 is_encoding_ascii_compatible (const char *encoding)
 845 {
 846   struct encoding_info e;
 847
 848   get_encoding_info (&e, encoding);
 849   return e.is_ascii_compatible;
 850 }
 851
 852 bool
 853 is_encoding_ebcdic_compatible (const char *encoding)
 854 {
 855   struct encoding_info e;
 856
 857   get_encoding_info (&e, encoding);
 858   return e.is_ebcdic_compatible;
 859 }
 860
 861 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 862    otherwise false. */
 863 bool
 864 is_encoding_supported (const char *encoding)
 865 {
 866   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
 867           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 868 }
 869
 870 /* Returns true if E is the name of a UTF-8 encoding.
 871
 872    XXX Possibly we should test not E as a string but its properties via
 873    iconv. */
 874 bool
 875 is_encoding_utf8 (const char *e)
 876 {
 877   return ((e[0] == 'u' || e[0] == 'U')
 878           && (e[1] == 't' || e[1] == 'T')
 879           && (e[2] == 'f' || e[2] == 'F')
 880           && ((e[3] == '8' && e[4] == '\0')
 881               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 882 }
 883 \f
 884 static struct encoding_category *categories;
 885 static int n_categories;
 886
 887 static void SENTINEL (0)
 888 add_category (size_t *allocated_categories, const char *category, ...)
 889 {
 890   struct encoding_category *c;
 891   const char *encodings[16];
 892   va_list args;
 893   int i, n;
 894
 895   /* Count encoding arguments. */
 896   va_start (args, category);
 897   n = 0;
 898   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 899     {
 900       const char *encoding = encodings[n];
 901       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 902         n++;
 903     }
 904   assert (n < sizeof encodings / sizeof *encodings);
 905   va_end (args);
 906
 907   if (n == 0)
 908     return;
 909
 910   if (n_categories >= *allocated_categories)
 911     categories = x2nrealloc (categories,
 912                              allocated_categories, sizeof *categories);
 913
 914   c = &categories[n_categories++];
 915   c->category = category;
 916   c->encodings = xmalloc (n * sizeof *c->encodings);
 917   for (i = 0; i < n; i++)
 918     c->encodings[i] = encodings[i];
 919   c->n_encodings = n;
 920 }
 921
 922 static void
 923 init_encoding_categories (void)
 924 {
 925   static bool inited;
 926   size_t alloc;
 927
 928   if (inited)
 929     return;
 930   inited = true;
 931
 932   alloc = 0;
 933   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
 934                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
 935   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
 936                 NULL_SENTINEL);
 937   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
 938   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
 939                 "Windows-1257", NULL_SENTINEL);
 940   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
 941   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
 942                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
 943   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
 944                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
 945   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
 946                 "EUC-TW", NULL_SENTINEL);
 947   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
 948   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
 949                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
 950   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
 951   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
 952                 NULL_SENTINEL);
 953   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
 954   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
 955   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
 956   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
 957   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
 958                 NULL_SENTINEL);
 959   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
 960   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
 961   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
 962   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
 963                 NULL_SENTINEL);
 964   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
 965                 NULL_SENTINEL);
 966   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
 967   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
 968                 NULL_SENTINEL);
 969   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
 970   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
 971                 NULL_SENTINEL);
 972   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
 973                 NULL_SENTINEL);
 974   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
 975                 "Windows-1258", NULL_SENTINEL);
 976   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
 977                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
 978 }
 979
 980 /* Returns an array of "struct encoding_category" that contains only the
 981    categories and encodings that the system supports. */
 982 struct encoding_category *
 983 get_encoding_categories (void)
 984 {
 985   init_encoding_categories ();
 986   return categories;
 987 }
 988
 989 /* Returns the number of elements in the array returned by
 990    get_encoding_categories().  */
 991 size_t
 992 get_n_encoding_categories (void)
 993 {
 994   init_encoding_categories ();
 995   return n_categories;
 996 }