pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <libintl.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/compiler.h"
  34 #include "libpspp/hmapx.h"
  35 #include "libpspp/hash-functions.h"
  36 #include "libpspp/pool.h"
  37 #include "libpspp/str.h"
  38 #include "libpspp/version.h"
  39
  40 #include "gl/c-strcase.h"
  41 #include "gl/localcharset.h"
  42 #include "gl/xalloc.h"
  43 #include "gl/relocatable.h"
  44 #include "gl/xstrndup.h"
  45
  46 #include "gettext.h"
  47 #define _(msgid) gettext (msgid)
  48
/* One cached iconv conversion descriptor for a (TOCODE, FROMCODE) pair.
   Instances are created by create_iconv__(), stored in 'map', and released
   by i18n_done(). */
struct converter
 {
    char *tocode;       /* Target encoding name (copy made with xstrdup()). */
    char *fromcode;     /* Source encoding name (copy made with xstrdup()). */
    iconv_t conv;       /* Result of iconv_open(); (iconv_t) -1 if it failed. */
    int error;          /* errno from a failed iconv_open(), otherwise 0.
                           create_iconv() resets it to 0 after reporting it. */
  };
  56
/* Name of the encoding substituted when a caller passes a null encoding name
   to recode_substring_pool().  Allocated with xstrdup(); set by i18n_init(),
   set_default_encoding() and set_encoding_from_locale(), freed by
   i18n_done(). */
static char *default_encoding;

/* Cache of 'struct converter's, hashed on the pair of encoding names (see
   create_iconv__()). */
static struct hmapx map;
  59
  60 /* A wrapper around iconv_open */
  61 static struct converter *
  62 create_iconv__ (const char* tocode, const char* fromcode)
  63 {
  64   size_t hash;
  65   struct hmapx_node *node;
  66   struct converter *converter;
  67   assert (fromcode);
  68
  69   hash = hash_string (tocode, hash_string (fromcode, 0));
  70   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  71     if (!strcmp (tocode, converter->tocode)
  72         && !strcmp (fromcode, converter->fromcode))
  73       return converter;
  74
  75   converter = xmalloc (sizeof *converter);
  76   converter->tocode = xstrdup (tocode);
  77   converter->fromcode = xstrdup (fromcode);
  78   converter->conv = iconv_open (tocode, fromcode);
  79   converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
  80   hmapx_insert (&map, converter, hash);
  81
  82   return converter;
  83 }
  84
  85 static iconv_t
  86 create_iconv (const char* tocode, const char* fromcode)
  87 {
  88   struct converter *converter;
  89
  90   converter = create_iconv__ (tocode, fromcode);
  91
  92   /* I don't think it's safe to translate this string or to use messaging
  93      as the converters have not yet been set up */
  94   if (converter->error && strcmp (tocode, fromcode))
  95     {
  96       fprintf (stderr,
  97                "Warning: "
  98                "cannot create a converter for `%s' to `%s': %s\n",
  99                fromcode, tocode, strerror (converter->error));
 100       converter->error = 0;
 101     }
 102
 103   return converter->conv;
 104 }
 105
 106 /* Converts the single byte C from encoding FROM to TO, returning the first
 107    byte of the result.
 108
 109    This function probably shouldn't be used at all, but some code still does
 110    use it. */
 111 char
 112 recode_byte (const char *to, const char *from, char c)
 113 {
 114   char x;
 115   char *s = recode_string (to, from, &c, 1);
 116   x = s[0];
 117   free (s);
 118   return x;
 119 }
 120
 121 /* Similar to recode_string_pool, but allocates the returned value on the heap
 122    instead of in a pool.  It is the caller's responsibility to free the
 123    returned value. */
 124 char *
 125 recode_string (const char *to, const char *from,
 126                const char *text, int length)
 127 {
 128   return recode_string_pool (to, from, text, length, NULL);
 129 }
 130
 131 /* Returns the length, in bytes, of the string that a similar recode_string()
 132    call would return. */
 133 size_t
 134 recode_string_len (const char *to, const char *from,
 135                    const char *text, int length)
 136 {
 137   char *s = recode_string (to, from, text, length);
 138   size_t len = strlen (s);
 139   free (s);
 140   return len;
 141 }
 142
 143 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 144    at OP, and appends a null terminator to the output.
 145
 146    Returns the output length if successful, -1 if the output buffer is too
 147    small. */
 148 static ssize_t
 149 try_recode (iconv_t conv,
 150             const char *ip, size_t inbytes,
 151             char *op_, size_t outbytes)
 152 {
 153   /* FIXME: Need to ensure that this char is valid in the target encoding */
 154   const char fallbackchar = '?';
 155   char *op = op_;
 156
 157   /* Put the converter into the initial shift state, in case there was any
 158      state information left over from its last usage. */
 159   iconv (conv, NULL, 0, NULL, 0);
 160
 161   while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
 162                 &op, &outbytes) == -1)
 163     switch (errno)
 164       {
 165       case EINVAL:
 166         if (outbytes < 2)
 167           return -1;
 168         *op++ = fallbackchar;
 169         *op = '\0';
 170         return op - op_;
 171
 172       case EILSEQ:
 173         if (outbytes == 0)
 174           return -1;
 175         *op++ = fallbackchar;
 176         outbytes--;
 177         ip++;
 178         inbytes--;
 179         break;
 180
 181       case E2BIG:
 182         return -1;
 183
 184       default:
 185         /* should never happen */
 186         fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
 187         NOT_REACHED ();
 188         break;
 189       }
 190
 191   if (outbytes == 0)
 192     return -1;
 193
 194   *op = '\0';
 195   return op - op_;
 196 }
 197
 198 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 199    dynamically allocated string in TO-encoding.  Any characters which cannot be
 200    converted will be represented by '?'.
 201
 202    LENGTH should be the length of the string or -1, if null terminated.
 203
 204    The returned string will be allocated on POOL.
 205
 206    This function's behaviour differs from that of g_convert_with_fallback
 207    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 208    the input string is not valid in the declared input encoding.  This function
 209    however perseveres even in the presence of badly encoded input. */
 210 char *
 211 recode_string_pool (const char *to, const char *from,
 212                     const char *text, int length, struct pool *pool)
 213 {
 214   struct substring out;
 215
 216   if ( text == NULL )
 217     return NULL;
 218
 219   if ( length == -1 )
 220      length = strlen (text);
 221
 222   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 223   return out.string;
 224 }
 225
 226 /* Returns the name of the encoding that should be used for file names.
 227
 228    This is meant to be the same encoding used by g_filename_from_uri() and
 229    g_filename_to_uri() in GLib. */
 230 static const char *
 231 filename_encoding (void)
 232 {
 233 #if defined _WIN32 || defined __WIN32__
 234   return "UTF-8";
 235 #else
 236   return locale_charset ();
 237 #endif
 238 }
 239
 240 static char *
 241 xconcat2 (const char *a, size_t a_len,
 242           const char *b, size_t b_len)
 243 {
 244   char *s = xmalloc (a_len + b_len + 1);
 245   memcpy (s, a, a_len);
 246   memcpy (s + a_len, b, b_len);
 247   s[a_len + b_len] = '\0';
 248   return s;
 249 }
 250
 251 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 252    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 253    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 254    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 255    HEAD and tries again, repeating as necessary until the concatenated result
 256    fits or until HEAD_LEN reaches 0.
 257
 258    [*] Actually this function drops grapheme clusters instead of characters, so
 259        that, e.g. a Unicode character followed by a combining accent character
 260        is either completely included or completely excluded from HEAD_LEN.  See
 261        UAX #29 at http://unicode.org/reports/tr29/ for more information on
 262        grapheme clusters.
 263
 264    A null ENCODING is treated as UTF-8.
 265
 266    Sometimes this function has to actually construct the concatenated string to
 267    measure its length.  When this happens, it sets *RESULTP to that
 268    null-terminated string, allocated with malloc(), for the caller to use if it
 269    needs it.  Otherwise, it sets *RESULTP to NULL.
 270
 271    Simple examples for encoding="UTF-8", max_len=6:
 272
 273        head="abc",  tail="xyz"     => 3
 274        head="abcd", tail="xyz"     => 3 ("d" dropped).
 275        head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 276        head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 277
 278    Examples for encoding="ISO-8859-1", max_len=6:
 279
 280        head="éèä",  tail="xyz"     => 6
 281          (each letter in head is only 1 byte in ISO-8859-1 even though they
 282           each take 2 bytes in UTF-8 encoding)
 283 */
 284 static size_t
 285 utf8_encoding_concat__ (const char *head, size_t head_len,
 286                         const char *tail, size_t tail_len,
 287                         const char *encoding, size_t max_len,
 288                         char **resultp)
 289 {
 290   *resultp = NULL;
 291   if (head_len == 0)
 292     return 0;
 293   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 294     {
 295       if (head_len + tail_len <= max_len)
 296         return head_len;
 297       else if (tail_len >= max_len)
 298         return 0;
 299       else
 300         {
 301           size_t copy_len;
 302           ucs4_t prev;
 303           size_t ofs;
 304           int mblen;
 305
 306           copy_len = 0;
 307           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 308                                 head_len);
 309                ofs <= max_len - tail_len;
 310                ofs += mblen)
 311             {
 312               ucs4_t next;
 313
 314               mblen = u8_mbtouc (&next,
 315                                  CHAR_CAST (const uint8_t *, head + ofs),
 316                                  head_len - ofs);
 317               if (uc_is_grapheme_break (prev, next))
 318                 copy_len = ofs;
 319
 320               prev = next;
 321             }
 322
 323           return copy_len;
 324         }
 325     }
 326   else
 327     {
 328       char *result;
 329
 330       result = (tail_len > 0
 331                 ? xconcat2 (head, head_len, tail, tail_len)
 332                 : CONST_CAST (char *, head));
 333       if (recode_string_len (encoding, "UTF-8", result,
 334                              head_len + tail_len) <= max_len)
 335         {
 336           *resultp = result != head ? result : NULL;
 337           return head_len;
 338         }
 339       else
 340         {
 341           bool correct_result = false;
 342           size_t copy_len;
 343           ucs4_t prev;
 344           size_t ofs;
 345           int mblen;
 346
 347           copy_len = 0;
 348           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 349                                 head_len);
 350                ofs <= head_len;
 351                ofs += mblen)
 352             {
 353               ucs4_t next;
 354
 355               mblen = u8_mbtouc (&next,
 356                                  CHAR_CAST (const uint8_t *, head + ofs),
 357                                  head_len - ofs);
 358               if (uc_is_grapheme_break (prev, next))
 359                 {
 360                   if (result != head)
 361                     {
 362                       memcpy (result, head, ofs);
 363                       memcpy (result + ofs, tail, tail_len);
 364                       result[ofs + tail_len] = '\0';
 365                     }
 366
 367                   if (recode_string_len (encoding, "UTF-8", result,
 368                                          ofs + tail_len) <= max_len)
 369                     {
 370                       correct_result = true;
 371                       copy_len = ofs;
 372                     }
 373                   else
 374                     correct_result = false;
 375                 }
 376
 377               prev = next;
 378             }
 379
 380           if (result != head)
 381             {
 382               if (correct_result)
 383                 *resultp = result;
 384               else
 385                 free (result);
 386             }
 387
 388           return copy_len;
 389         }
 390     }
 391 }
 392
 393 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 394    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 395    string are all encoded in UTF-8.  As many characters[*] from the beginning
 396    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 397    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 398    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 399
 400    [*] Actually this function drops grapheme clusters instead of characters, so
 401        that, e.g. a Unicode character followed by a combining accent character
 402        is either completely included or completely excluded from the returned
 403        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 404        information on grapheme clusters.
 405
 406    A null ENCODING is treated as UTF-8.
 407
 408    Simple examples for encoding="UTF-8", max_len=6:
 409
 410        head="abc",  tail="xyz"     => "abcxyz"
 411        head="abcd", tail="xyz"     => "abcxyz"
 412        head="abc",  tail="uvwxyz"  => "uvwxyz"
 413        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 414
 415    Examples for encoding="ISO-8859-1", max_len=6:
 416
 417        head="éèä",  tail="xyz"    => "éèäxyz"
 418          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 419           each take 2 bytes in UTF-8 encoding)
 420 */
 421 char *
 422 utf8_encoding_concat (const char *head, const char *tail,
 423                       const char *encoding, size_t max_len)
 424 {
 425   size_t tail_len = strlen (tail);
 426   size_t prefix_len;
 427   char *result;
 428
 429   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 430                                        encoding, max_len, &result);
 431   return (result != NULL
 432           ? result
 433           : xconcat2 (head, prefix_len, tail, tail_len));
 434 }
 435
 436 /* Returns the length, in bytes, of the string that would be returned by
 437    utf8_encoding_concat() if passed the same arguments, but the implementation
 438    is often more efficient. */
 439 size_t
 440 utf8_encoding_concat_len (const char *head, const char *tail,
 441                           const char *encoding, size_t max_len)
 442 {
 443   size_t tail_len = strlen (tail);
 444   size_t prefix_len;
 445   char *result;
 446
 447   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 448                                        encoding, max_len, &result);
 449   free (result);
 450   return prefix_len + tail_len;
 451 }
 452
 453 /* Returns an allocated, null-terminated string, owned by the caller,
 454    containing as many characters[*] from the beginning of S that would fit
 455    within MAX_LEN bytes if the returned string were to be re-encoded in
 456    ENCODING.  Both S and the returned string are encoded in UTF-8.
 457
 458    [*] Actually this function drops grapheme clusters instead of characters, so
 459        that, e.g. a Unicode character followed by a combining accent character
 460        is either completely included or completely excluded from the returned
 461        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 462        information on grapheme clusters.
 463
 464    A null ENCODING is treated as UTF-8.
 465 */
 466 char *
 467 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 468 {
 469   return utf8_encoding_concat (s, "", encoding, max_len);
 470 }
 471
 472 /* Returns the length, in bytes, of the string that would be returned by
 473    utf8_encoding_trunc() if passed the same arguments, but the implementation
 474    is often more efficient. */
 475 size_t
 476 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 477 {
 478   return utf8_encoding_concat_len (s, "", encoding, max_len);
 479 }
 480
 481 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 482    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 483    current locale. */
 484 char *
 485 utf8_to_filename (const char *filename)
 486 {
 487   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 488 }
 489
 490 /* Returns FILENAME converted from the filename encoding to UTF-8.
 491    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 492    current locale. */
 493 char *
 494 filename_to_utf8 (const char *filename)
 495 {
 496   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 497 }
 498
 499 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 500    dynamically allocated string in TO-encoding.  Any characters which cannot be
 501    converted will be represented by '?'.
 502
 503    The returned string will be null-terminated and allocated on POOL.
 504
 505    This function's behaviour differs from that of g_convert_with_fallback
 506    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 507    the input string is not valid in the declared input encoding.  This function
 508    however perseveres even in the presence of badly encoded input. */
 509 struct substring
 510 recode_substring_pool (const char *to, const char *from,
 511                        struct substring text, struct pool *pool)
 512 {
 513   size_t outbufferlength;
 514   iconv_t conv ;
 515
 516   if (to == NULL)
 517     to = default_encoding;
 518
 519   if (from == NULL)
 520     from = default_encoding;
 521
 522   conv = create_iconv (to, from);
 523
 524   if ( (iconv_t) -1 == conv )
 525     {
 526       struct substring out;
 527       ss_alloc_substring_pool (&out, text, pool);
 528       return out;
 529     }
 530
 531   for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
 532     if ( outbufferlength > text.length)
 533       {
 534         char *output = pool_malloc (pool, outbufferlength);
 535         ssize_t output_len = try_recode (conv, text.string, text.length,
 536                                          output, outbufferlength);
 537         if (output_len >= 0)
 538           return ss_buffer (output, output_len);
 539         pool_free (pool, output);
 540       }
 541
 542   NOT_REACHED ();
 543 }
 544
 545 void
 546 i18n_init (void)
 547 {
 548   setlocale (LC_CTYPE, "");
 549   setlocale (LC_COLLATE, "");
 550   setlocale (LC_MESSAGES, "");
 551 #if HAVE_LC_PAPER
 552   setlocale (LC_PAPER, "");
 553 #endif
 554   bindtextdomain (PACKAGE, relocate(locale_dir));
 555   textdomain (PACKAGE);
 556
 557   assert (default_encoding == NULL);
 558   default_encoding = xstrdup (locale_charset ());
 559
 560   hmapx_init (&map);
 561 }
 562
 563 const char *
 564 get_default_encoding (void)
 565 {
 566   return default_encoding;
 567 }
 568
 569 void
 570 set_default_encoding (const char *enc)
 571 {
 572   free (default_encoding);
 573   default_encoding = xstrdup (enc);
 574 }
 575
 576
 577 /* Attempts to set the encoding from a locale name
 578    returns true if successfull.
 579    This function does not (should not!) alter the current locale.
 580 */
 581 bool
 582 set_encoding_from_locale (const char *loc)
 583 {
 584   bool ok = true;
 585   char *c_encoding;
 586   char *loc_encoding;
 587   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 588
 589   setlocale (LC_CTYPE, "C");
 590   c_encoding = xstrdup (locale_charset ());
 591
 592   setlocale (LC_CTYPE, loc);
 593   loc_encoding = xstrdup (locale_charset ());
 594
 595
 596   if ( 0 == strcmp (loc_encoding, c_encoding))
 597     {
 598       ok = false;
 599     }
 600
 601
 602   setlocale (LC_CTYPE, tmp);
 603
 604   free (tmp);
 605
 606   if (ok)
 607     {
 608       free (default_encoding);
 609       default_encoding = loc_encoding;
 610     }
 611   else
 612     free (loc_encoding);
 613
 614   free (c_encoding);
 615
 616   return ok;
 617 }
 618
 619 void
 620 i18n_done (void)
 621 {
 622   struct hmapx_node *node;
 623   struct converter *cvtr;
 624
 625   HMAPX_FOR_EACH (cvtr, node, &map)
 626     {
 627       free (cvtr->tocode);
 628       free (cvtr->fromcode);
 629       if (cvtr->conv != (iconv_t) -1)
 630         iconv_close (cvtr->conv);
 631       free (cvtr);
 632     }
 633
 634   hmapx_destroy (&map);
 635
 636   free (default_encoding);
 637   default_encoding = NULL;
 638 }
 639
 640
 641
 642 bool
 643 valid_encoding (const char *enc)
 644 {
 645   iconv_t conv = iconv_open (UTF8, enc);
 646
 647   if ( conv == (iconv_t) -1)
 648     return false;
 649
 650   iconv_close (conv);
 651
 652   return true;
 653 }
 654
 655
 656 /* Return the system local's idea of the
 657    decimal seperator character */
 658 char
 659 get_system_decimal (void)
 660 {
 661   char radix_char;
 662
 663   char *ol = xstrdup (setlocale (LC_NUMERIC, NULL));
 664   setlocale (LC_NUMERIC, "");
 665
 666 #if HAVE_NL_LANGINFO
 667   radix_char = nl_langinfo (RADIXCHAR)[0];
 668 #else
 669   {
 670     char buf[10];
 671     snprintf (buf, sizeof buf, "%f", 2.5);
 672     radix_char = buf[1];
 673   }
 674 #endif
 675
 676   /* We MUST leave LC_NUMERIC untouched, since it would
 677      otherwise interfere with data_{in,out} */
 678   setlocale (LC_NUMERIC, ol);
 679   free (ol);
 680   return radix_char;
 681 }
 682
 683 const char *
 684 uc_name (ucs4_t uc, char buffer[16])
 685 {
 686   if (uc >= 0x20 && uc < 0x7f)
 687     snprintf (buffer, 16, "`%c'", uc);
 688   else
 689     snprintf (buffer, 16, "U+%04X", uc);
 690   return buffer;
 691 }
 692 \f
 693 bool
 694 get_encoding_info (struct encoding_info *e, const char *name)
 695 {
 696   const struct substring in = SS_LITERAL_INITIALIZER (
 697     "\t\n\v\f\r "
 698     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 699     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 700     "abcdefghijklmnopqrstuvwxyz{|}~");
 701
 702   struct substring out, cr, lf, space;
 703   bool ok;
 704
 705   memset (e, 0, sizeof *e);
 706
 707   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 708   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 709   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 710   ok = (cr.length >= 1
 711         && cr.length <= MAX_UNIT
 712         && cr.length == lf.length
 713         && cr.length == space.length);
 714   if (!ok)
 715     {
 716       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 717       ss_dealloc (&cr);
 718       ss_dealloc (&lf);
 719       ss_dealloc (&space);
 720       ss_alloc_substring (&cr, ss_cstr ("\r"));
 721       ss_alloc_substring (&lf, ss_cstr ("\n"));
 722       ss_alloc_substring (&space, ss_cstr (" "));
 723     }
 724
 725   e->unit = cr.length;
 726   memcpy (e->cr, cr.string, e->unit);
 727   memcpy (e->lf, lf.string, e->unit);
 728   memcpy (e->space, space.string, e->unit);
 729
 730   ss_dealloc (&cr);
 731   ss_dealloc (&lf);
 732   ss_dealloc (&space);
 733
 734   out = recode_substring_pool ("UTF-8", name, in, NULL);
 735   e->is_ascii_compatible = ss_equals (in, out);
 736   ss_dealloc (&out);
 737
 738   if (!e->is_ascii_compatible && e->unit == 1)
 739     {
 740       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 741       e->is_ebcdic_compatible = (out.length == 1
 742                                  && (uint8_t) out.string[0] == 0xc1);
 743       ss_dealloc (&out);
 744     }
 745   else
 746     e->is_ebcdic_compatible = false;
 747
 748   return ok;
 749 }
 750
 751 bool
 752 is_encoding_ascii_compatible (const char *encoding)
 753 {
 754   struct encoding_info e;
 755
 756   get_encoding_info (&e, encoding);
 757   return e.is_ascii_compatible;
 758 }
 759
 760 bool
 761 is_encoding_ebcdic_compatible (const char *encoding)
 762 {
 763   struct encoding_info e;
 764
 765   get_encoding_info (&e, encoding);
 766   return e.is_ebcdic_compatible;
 767 }
 768
 769 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 770    otherwise false. */
 771 bool
 772 is_encoding_supported (const char *encoding)
 773 {
 774   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
 775           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 776 }
 777
 778 /* Returns true if E is the name of a UTF-8 encoding.
 779
 780    XXX Possibly we should test not E as a string but its properties via
 781    iconv. */
 782 bool
 783 is_encoding_utf8 (const char *e)
 784 {
 785   return ((e[0] == 'u' || e[0] == 'U')
 786           && (e[1] == 't' || e[1] == 'T')
 787           && (e[2] == 'f' || e[2] == 'F')
 788           && ((e[3] == '8' && e[4] == '\0')
 789               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 790 }
 791 \f
/* Array of encoding categories built by init_encoding_categories() (through
   add_category()), and the number of elements of it that are in use. */
static struct encoding_category *categories;
static int n_categories;
 794
 795 static void SENTINEL (0)
 796 add_category (size_t *allocated_categories, const char *category, ...)
 797 {
 798   struct encoding_category *c;
 799   const char *encodings[16];
 800   va_list args;
 801   int i, n;
 802
 803   /* Count encoding arguments. */
 804   va_start (args, category);
 805   n = 0;
 806   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 807     {
 808       const char *encoding = encodings[n];
 809       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 810         n++;
 811     }
 812   assert (n < sizeof encodings / sizeof *encodings);
 813   va_end (args);
 814
 815   if (n == 0)
 816     return;
 817
 818   if (n_categories >= *allocated_categories)
 819     categories = x2nrealloc (categories,
 820                              allocated_categories, sizeof *categories);
 821
 822   c = &categories[n_categories++];
 823   c->category = category;
 824   c->encodings = xmalloc (n * sizeof *c->encodings);
 825   for (i = 0; i < n; i++)
 826     c->encodings[i] = encodings[i];
 827   c->n_encodings = n;
 828 }
 829
/* Builds the 'categories' array on the first call; later calls do nothing.

   Each add_category() call below proposes one category together with its
   candidate encodings.  add_category() keeps only the candidates that the
   system supports and skips categories that end up empty, so the resulting
   array depends on the system.  Categories are appended in the order of the
   calls.

   Category names wrapped in _() are passed through gettext().
   NOTE(review): "Unicode" is the only category name not wrapped in _() --
   confirm that this is intentional. */
static void
init_encoding_categories (void)
{
  static bool inited;           /* True once the array has been built. */
  size_t alloc;                 /* Elements allocated for 'categories'. */

  if (inited)
    return;
  inited = true;

  alloc = 0;
  add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
                "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
  add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
                NULL_SENTINEL);
  add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
  add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
                "Windows-1257", NULL_SENTINEL);
  add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
  add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
                "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
  add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
                "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
  add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
                "EUC-TW", NULL_SENTINEL);
  add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
                "KOI8-R", "MacCyrillic", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
                NULL_SENTINEL);
  add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
  add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
  add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
  add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
  add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
                NULL_SENTINEL);
  add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
  /* NOTE(review): "MacDevangari" may be a misspelling of "MacDevanagari"; if
     so, this category could never be reported as supported -- verify against
     the encoding names that iconv accepts. */
  add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
  add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
  add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
                NULL_SENTINEL);
  add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
                NULL_SENTINEL);
  add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
  add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
                NULL_SENTINEL);
  add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
  add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
                NULL_SENTINEL);
  add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
                NULL_SENTINEL);
  add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
                "Windows-1258", NULL_SENTINEL);
  add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
                "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
}
 887
 888 /* Returns an array of "struct encoding_category" that contains only the
 889    categories and encodings that the system supports. */
 890 struct encoding_category *
 891 get_encoding_categories (void)
 892 {
 893   init_encoding_categories ();
 894   return categories;
 895 }
 896
 897 /* Returns the number of elements in the array returned by
 898    get_encoding_categories().  */
 899 size_t
 900 get_n_encoding_categories (void)
 901 {
 902   init_encoding_categories ();
 903   return n_categories;
 904 }