pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <libintl.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/compiler.h"
  35 #include "libpspp/hmapx.h"
  36 #include "libpspp/hash-functions.h"
  37 #include "libpspp/pool.h"
  38 #include "libpspp/str.h"
  39 #include "libpspp/version.h"
  40
  41 #include "gl/c-strcase.h"
  42 #include "gl/localcharset.h"
  43 #include "gl/minmax.h"
  44 #include "gl/xalloc.h"
  45 #include "gl/relocatable.h"
  46 #include "gl/xstrndup.h"
  47
  48 #include "gettext.h"
  49 #define _(msgid) gettext (msgid)
  50
  51 struct converter
  52  {
  53     char *tocode;
  54     char *fromcode;
  55     iconv_t conv;
  56     int error;
  57   };
  58
  59 static char *default_encoding;
  60 static struct hmapx map;
  61
  62 /* A wrapper around iconv_open */
  63 static struct converter *
  64 create_iconv__ (const char* tocode, const char* fromcode)
  65 {
  66   size_t hash;
  67   struct hmapx_node *node;
  68   struct converter *converter;
  69   assert (fromcode);
  70
  71   hash = hash_string (tocode, hash_string (fromcode, 0));
  72   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  73     if (!strcmp (tocode, converter->tocode)
  74         && !strcmp (fromcode, converter->fromcode))
  75       return converter;
  76
  77   converter = xmalloc (sizeof *converter);
  78   converter->tocode = xstrdup (tocode);
  79   converter->fromcode = xstrdup (fromcode);
  80   converter->conv = iconv_open (tocode, fromcode);
  81   converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
  82   hmapx_insert (&map, converter, hash);
  83
  84   return converter;
  85 }
  86
  87 static iconv_t
  88 create_iconv (const char* tocode, const char* fromcode)
  89 {
  90   struct converter *converter;
  91
  92   converter = create_iconv__ (tocode, fromcode);
  93
  94   /* I don't think it's safe to translate this string or to use messaging
  95      as the converters have not yet been set up */
  96   if (converter->error && strcmp (tocode, fromcode))
  97     {
  98       fprintf (stderr,
  99                "Warning: "
 100                "cannot create a converter for `%s' to `%s': %s\n",
 101                fromcode, tocode, strerror (converter->error));
 102       converter->error = 0;
 103     }
 104
 105   return converter->conv;
 106 }
 107
 108 /* Converts the single byte C from encoding FROM to TO, returning the first
 109    byte of the result.
 110
 111    This function probably shouldn't be used at all, but some code still does
 112    use it. */
 113 char
 114 recode_byte (const char *to, const char *from, char c)
 115 {
 116   char x;
 117   char *s = recode_string (to, from, &c, 1);
 118   x = s[0];
 119   free (s);
 120   return x;
 121 }
 122
 123 /* Similar to recode_string_pool, but allocates the returned value on the heap
 124    instead of in a pool.  It is the caller's responsibility to free the
 125    returned value. */
 126 char *
 127 recode_string (const char *to, const char *from,
 128                const char *text, int length)
 129 {
 130   return recode_string_pool (to, from, text, length, NULL);
 131 }
 132
 133 /* Returns the length, in bytes, of the string that a similar recode_string()
 134    call would return. */
 135 size_t
 136 recode_string_len (const char *to, const char *from,
 137                    const char *text, int length)
 138 {
 139   char *s = recode_string (to, from, text, length);
 140   size_t len = strlen (s);
 141   free (s);
 142   return len;
 143 }
 144
 145 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 146    at OP, and appends a null terminator to the output.
 147
 148    Returns the output length if successful, -1 if the output buffer is too
 149    small. */
 150 static ssize_t
 151 try_recode (iconv_t conv,
 152             const char *ip, size_t inbytes,
 153             char *op_, size_t outbytes)
 154 {
 155   /* FIXME: Need to ensure that this char is valid in the target encoding */
 156   const char fallbackchar = '?';
 157   char *op = op_;
 158
 159   /* Put the converter into the initial shift state, in case there was any
 160      state information left over from its last usage. */
 161   iconv (conv, NULL, 0, NULL, 0);
 162
 163   while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
 164                 &op, &outbytes) == -1)
 165     switch (errno)
 166       {
 167       case EINVAL:
 168         if (outbytes < 2)
 169           return -1;
 170         *op++ = fallbackchar;
 171         *op = '\0';
 172         return op - op_;
 173
 174       case EILSEQ:
 175         if (outbytes == 0)
 176           return -1;
 177         *op++ = fallbackchar;
 178         outbytes--;
 179         ip++;
 180         inbytes--;
 181         break;
 182
 183       case E2BIG:
 184         return -1;
 185
 186       default:
 187         /* should never happen */
 188         fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
 189         NOT_REACHED ();
 190         break;
 191       }
 192
 193   if (outbytes == 0)
 194     return -1;
 195
 196   *op = '\0';
 197   return op - op_;
 198 }
 199
 200 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 201    dynamically allocated string in TO-encoding.  Any characters which cannot be
 202    converted will be represented by '?'.
 203
 204    LENGTH should be the length of the string or -1, if null terminated.
 205
 206    The returned string will be allocated on POOL.
 207
 208    This function's behaviour differs from that of g_convert_with_fallback
 209    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 210    the input string is not valid in the declared input encoding.  This function
 211    however perseveres even in the presence of badly encoded input. */
 212 char *
 213 recode_string_pool (const char *to, const char *from,
 214                     const char *text, int length, struct pool *pool)
 215 {
 216   struct substring out;
 217
 218   if ( text == NULL )
 219     return NULL;
 220
 221   if ( length == -1 )
 222      length = strlen (text);
 223
 224   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 225   return out.string;
 226 }
 227
 228 /* Returns the name of the encoding that should be used for file names.
 229
 230    This is meant to be the same encoding used by g_filename_from_uri() and
 231    g_filename_to_uri() in GLib. */
 232 static const char *
 233 filename_encoding (void)
 234 {
 235 #if defined _WIN32 || defined __WIN32__
 236   return "UTF-8";
 237 #else
 238   return locale_charset ();
 239 #endif
 240 }
 241
 242 static char *
 243 xconcat2 (const char *a, size_t a_len,
 244           const char *b, size_t b_len)
 245 {
 246   char *s = xmalloc (a_len + b_len + 1);
 247   memcpy (s, a, a_len);
 248   memcpy (s + a_len, b, b_len);
 249   s[a_len + b_len] = '\0';
 250   return s;
 251 }
 252
 253 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 254    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 255    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 256    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 257    HEAD and tries again, repeating as necessary until the concatenated result
 258    fits or until HEAD_LEN reaches 0.
 259
 260    [*] Actually this function drops grapheme clusters instead of characters, so
 261        that, e.g. a Unicode character followed by a combining accent character
 262        is either completely included or completely excluded from HEAD_LEN.  See
 263        UAX #29 at http://unicode.org/reports/tr29/ for more information on
 264        grapheme clusters.
 265
 266    A null ENCODING is treated as UTF-8.
 267
 268    Sometimes this function has to actually construct the concatenated string to
 269    measure its length.  When this happens, it sets *RESULTP to that
 270    null-terminated string, allocated with malloc(), for the caller to use if it
 271    needs it.  Otherwise, it sets *RESULTP to NULL.
 272
 273    Simple examples for encoding="UTF-8", max_len=6:
 274
 275        head="abc",  tail="xyz"     => 3
 276        head="abcd", tail="xyz"     => 3 ("d" dropped).
 277        head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 278        head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 279
 280    Examples for encoding="ISO-8859-1", max_len=6:
 281
 282        head="éèä",  tail="xyz"     => 6
 283          (each letter in head is only 1 byte in ISO-8859-1 even though they
 284           each take 2 bytes in UTF-8 encoding)
 285 */
 286 static size_t
 287 utf8_encoding_concat__ (const char *head, size_t head_len,
 288                         const char *tail, size_t tail_len,
 289                         const char *encoding, size_t max_len,
 290                         char **resultp)
 291 {
 292   *resultp = NULL;
 293   if (head_len == 0)
 294     return 0;
 295   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 296     {
 297       if (head_len + tail_len <= max_len)
 298         return head_len;
 299       else if (tail_len >= max_len)
 300         return 0;
 301       else
 302         {
 303           size_t copy_len;
 304           ucs4_t prev;
 305           size_t ofs;
 306           int mblen;
 307
 308           copy_len = 0;
 309           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 310                                 head_len);
 311                ofs <= max_len - tail_len;
 312                ofs += mblen)
 313             {
 314               ucs4_t next;
 315
 316               mblen = u8_mbtouc (&next,
 317                                  CHAR_CAST (const uint8_t *, head + ofs),
 318                                  head_len - ofs);
 319               if (uc_is_grapheme_break (prev, next))
 320                 copy_len = ofs;
 321
 322               prev = next;
 323             }
 324
 325           return copy_len;
 326         }
 327     }
 328   else
 329     {
 330       char *result;
 331
 332       result = (tail_len > 0
 333                 ? xconcat2 (head, head_len, tail, tail_len)
 334                 : CONST_CAST (char *, head));
 335       if (recode_string_len (encoding, "UTF-8", result,
 336                              head_len + tail_len) <= max_len)
 337         {
 338           *resultp = result != head ? result : NULL;
 339           return head_len;
 340         }
 341       else
 342         {
 343           bool correct_result = false;
 344           size_t copy_len;
 345           ucs4_t prev;
 346           size_t ofs;
 347           int mblen;
 348
 349           copy_len = 0;
 350           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 351                                 head_len);
 352                ofs <= head_len;
 353                ofs += mblen)
 354             {
 355               ucs4_t next;
 356
 357               mblen = u8_mbtouc (&next,
 358                                  CHAR_CAST (const uint8_t *, head + ofs),
 359                                  head_len - ofs);
 360               if (uc_is_grapheme_break (prev, next))
 361                 {
 362                   if (result != head)
 363                     {
 364                       memcpy (result, head, ofs);
 365                       memcpy (result + ofs, tail, tail_len);
 366                       result[ofs + tail_len] = '\0';
 367                     }
 368
 369                   if (recode_string_len (encoding, "UTF-8", result,
 370                                          ofs + tail_len) <= max_len)
 371                     {
 372                       correct_result = true;
 373                       copy_len = ofs;
 374                     }
 375                   else
 376                     correct_result = false;
 377                 }
 378
 379               prev = next;
 380             }
 381
 382           if (result != head)
 383             {
 384               if (correct_result)
 385                 *resultp = result;
 386               else
 387                 free (result);
 388             }
 389
 390           return copy_len;
 391         }
 392     }
 393 }
 394
 395 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 396    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 397    string are all encoded in UTF-8.  As many characters[*] from the beginning
 398    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 399    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 400    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 401
 402    [*] Actually this function drops grapheme clusters instead of characters, so
 403        that, e.g. a Unicode character followed by a combining accent character
 404        is either completely included or completely excluded from the returned
 405        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 406        information on grapheme clusters.
 407
 408    A null ENCODING is treated as UTF-8.
 409
 410    Simple examples for encoding="UTF-8", max_len=6:
 411
 412        head="abc",  tail="xyz"     => "abcxyz"
 413        head="abcd", tail="xyz"     => "abcxyz"
 414        head="abc",  tail="uvwxyz"  => "uvwxyz"
 415        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 416
 417    Examples for encoding="ISO-8859-1", max_len=6:
 418
 419        head="éèä",  tail="xyz"    => "éèäxyz"
 420          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 421           each take 2 bytes in UTF-8 encoding)
 422 */
 423 char *
 424 utf8_encoding_concat (const char *head, const char *tail,
 425                       const char *encoding, size_t max_len)
 426 {
 427   size_t tail_len = strlen (tail);
 428   size_t prefix_len;
 429   char *result;
 430
 431   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 432                                        encoding, max_len, &result);
 433   return (result != NULL
 434           ? result
 435           : xconcat2 (head, prefix_len, tail, tail_len));
 436 }
 437
 438 /* Returns the length, in bytes, of the string that would be returned by
 439    utf8_encoding_concat() if passed the same arguments, but the implementation
 440    is often more efficient. */
 441 size_t
 442 utf8_encoding_concat_len (const char *head, const char *tail,
 443                           const char *encoding, size_t max_len)
 444 {
 445   size_t tail_len = strlen (tail);
 446   size_t prefix_len;
 447   char *result;
 448
 449   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 450                                        encoding, max_len, &result);
 451   free (result);
 452   return prefix_len + tail_len;
 453 }
 454
 455 /* Returns an allocated, null-terminated string, owned by the caller,
 456    containing as many characters[*] from the beginning of S that would fit
 457    within MAX_LEN bytes if the returned string were to be re-encoded in
 458    ENCODING.  Both S and the returned string are encoded in UTF-8.
 459
 460    [*] Actually this function drops grapheme clusters instead of characters, so
 461        that, e.g. a Unicode character followed by a combining accent character
 462        is either completely included or completely excluded from the returned
 463        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 464        information on grapheme clusters.
 465
 466    A null ENCODING is treated as UTF-8.
 467 */
 468 char *
 469 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 470 {
 471   return utf8_encoding_concat (s, "", encoding, max_len);
 472 }
 473
 474 /* Returns the length, in bytes, of the string that would be returned by
 475    utf8_encoding_trunc() if passed the same arguments, but the implementation
 476    is often more efficient. */
 477 size_t
 478 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 479 {
 480   return utf8_encoding_concat_len (s, "", encoding, max_len);
 481 }
 482
 483 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 484    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 485    current locale. */
 486 char *
 487 utf8_to_filename (const char *filename)
 488 {
 489   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 490 }
 491
 492 /* Returns FILENAME converted from the filename encoding to UTF-8.
 493    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 494    current locale. */
 495 char *
 496 filename_to_utf8 (const char *filename)
 497 {
 498   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 499 }
 500
 501 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 502    dynamically allocated string in TO-encoding.  Any characters which cannot be
 503    converted will be represented by '?'.
 504
 505    The returned string will be null-terminated and allocated on POOL with
 506    pool_malloc().
 507
 508    This function's behaviour differs from that of g_convert_with_fallback
 509    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 510    the input string is not valid in the declared input encoding.  This function
 511    however perseveres even in the presence of badly encoded input. */
 512 struct substring
 513 recode_substring_pool (const char *to, const char *from,
 514                        struct substring text, struct pool *pool)
 515 {
 516   size_t outbufferlength;
 517   iconv_t conv ;
 518
 519   if (to == NULL)
 520     to = default_encoding;
 521
 522   if (from == NULL)
 523     from = default_encoding;
 524
 525   conv = create_iconv (to, from);
 526
 527   if ( (iconv_t) -1 == conv )
 528     {
 529       struct substring out;
 530
 531       out.string = pool_malloc (pool, text.length + 1);
 532       out.length = text.length;
 533       memcpy (out.string, text.string, text.length);
 534       out.string[out.length] = '\0';
 535       return out;
 536     }
 537
 538   for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
 539     if ( outbufferlength > text.length)
 540       {
 541         char *output = pool_malloc (pool, outbufferlength);
 542         ssize_t output_len = try_recode (conv, text.string, text.length,
 543                                          output, outbufferlength);
 544         if (output_len >= 0)
 545           return ss_buffer (output, output_len);
 546         pool_free (pool, output);
 547       }
 548
 549   NOT_REACHED ();
 550 }
 551
 552 void
 553 i18n_init (void)
 554 {
 555   setlocale (LC_ALL, "");
 556   bindtextdomain (PACKAGE, relocate(locale_dir));
 557   textdomain (PACKAGE);
 558
 559   assert (default_encoding == NULL);
 560   default_encoding = xstrdup (locale_charset ());
 561
 562   hmapx_init (&map);
 563 }
 564
 565 const char *
 566 get_default_encoding (void)
 567 {
 568   return default_encoding;
 569 }
 570
 571 void
 572 set_default_encoding (const char *enc)
 573 {
 574   free (default_encoding);
 575   default_encoding = xstrdup (enc);
 576 }
 577
 578
 579 /* Attempts to set the encoding from a locale name
 580    returns true if successfull.
 581    This function does not (should not!) alter the current locale.
 582 */
 583 bool
 584 set_encoding_from_locale (const char *loc)
 585 {
 586   bool ok = true;
 587   char *c_encoding;
 588   char *loc_encoding;
 589   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 590
 591   setlocale (LC_CTYPE, "C");
 592   c_encoding = xstrdup (locale_charset ());
 593
 594   setlocale (LC_CTYPE, loc);
 595   loc_encoding = xstrdup (locale_charset ());
 596
 597
 598   if ( 0 == strcmp (loc_encoding, c_encoding))
 599     {
 600       ok = false;
 601     }
 602
 603   setlocale (LC_CTYPE, tmp);
 604
 605   free (tmp);
 606
 607   if (ok)
 608     {
 609       free (default_encoding);
 610       default_encoding = loc_encoding;
 611     }
 612   else
 613     free (loc_encoding);
 614
 615   free (c_encoding);
 616
 617   return ok;
 618 }
 619
 620 void
 621 i18n_done (void)
 622 {
 623   struct hmapx_node *node;
 624   struct converter *cvtr;
 625
 626   HMAPX_FOR_EACH (cvtr, node, &map)
 627     {
 628       free (cvtr->tocode);
 629       free (cvtr->fromcode);
 630       if (cvtr->conv != (iconv_t) -1)
 631         iconv_close (cvtr->conv);
 632       free (cvtr);
 633     }
 634
 635   hmapx_destroy (&map);
 636
 637   free (default_encoding);
 638   default_encoding = NULL;
 639 }
 640
 641
 642
 643 bool
 644 valid_encoding (const char *enc)
 645 {
 646   iconv_t conv = iconv_open (UTF8, enc);
 647
 648   if ( conv == (iconv_t) -1)
 649     return false;
 650
 651   iconv_close (conv);
 652
 653   return true;
 654 }
 655
 656
 657 /* Return the system local's idea of the
 658    decimal seperator character */
 659 char
 660 get_system_decimal (void)
 661 {
 662   char radix_char;
 663
 664 #if HAVE_NL_LANGINFO
 665   radix_char = nl_langinfo (RADIXCHAR)[0];
 666 #else
 667   {
 668     char buf[10];
 669     snprintf (buf, sizeof buf, "%f", 2.5);
 670     radix_char = buf[1];
 671   }
 672 #endif
 673
 674   return radix_char;
 675 }
 676
 677 const char *
 678 uc_name (ucs4_t uc, char buffer[16])
 679 {
 680   if (uc >= 0x20 && uc < 0x7f)
 681     snprintf (buffer, 16, "`%c'", uc);
 682   else
 683     snprintf (buffer, 16, "U+%04X", uc);
 684   return buffer;
 685 }
 686 \f
 687 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 688
 689 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 690    with lowercase and uppercase letters treated as equal, starting from
 691    BASIS. */
 692 unsigned int
 693 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 694 {
 695   uint8_t folded_buf[2048];
 696   size_t folded_len = sizeof folded_buf;
 697   uint8_t *folded_s;
 698   unsigned int hash;
 699
 700   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 701                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 702   if (folded_s != NULL)
 703     {
 704       hash = hash_bytes (folded_s, folded_len, basis);
 705       if (folded_s != folded_buf)
 706         free (folded_s);
 707     }
 708   else
 709     {
 710       if (errno == ENOMEM)
 711         xalloc_die ();
 712       hash = hash_bytes (s, n, basis);
 713     }
 714
 715   return hash;
 716 }
 717
 718 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 719    uppercase letters treated as equal, starting from BASIS. */
 720 unsigned int
 721 utf8_hash_case_string (const char *s, unsigned int basis)
 722 {
 723   return utf8_hash_case_bytes (s, strlen (s), basis);
 724 }
 725
 726 /* Compares UTF-8 strings A and B case-insensitively.
 727    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 728 int
 729 utf8_strcasecmp (const char *a, const char *b)
 730 {
 731   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 732 }
 733
 734 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 735    case-insensitively.
 736    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 737 int
 738 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 739 {
 740   int result;
 741
 742   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 743                   CHAR_CAST (const uint8_t *, b), bn,
 744                   NULL, UNINORM_NFKD, &result))
 745     {
 746       if (errno == ENOMEM)
 747         xalloc_die ();
 748
 749       result = memcmp (a, b, MIN (an, bn));
 750       if (result == 0)
 751         result = an < bn ? -1 : an > bn;
 752     }
 753
 754   return result;
 755 }
 756
 757 static char *
 758 utf8_casemap (const char *s,
 759               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 760                              uint8_t *, size_t *))
 761 {
 762   char *result;
 763   size_t size;
 764
 765   result = CHAR_CAST (char *,
 766                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 767                          NULL, NULL, NULL, &size));
 768   if (result == NULL)
 769     {
 770       if (errno == ENOMEM)
 771         xalloc_die ();
 772
 773       result = xstrdup (s);
 774     }
 775   return result;
 776 }
 777
 778 char *
 779 utf8_to_upper (const char *s)
 780 {
 781   return utf8_casemap (s, u8_toupper);
 782 }
 783
 784 char *
 785 utf8_to_lower (const char *s)
 786 {
 787   return utf8_casemap (s, u8_tolower);
 788 }
 789 \f
 790 bool
 791 get_encoding_info (struct encoding_info *e, const char *name)
 792 {
 793   const struct substring in = SS_LITERAL_INITIALIZER (
 794     "\t\n\v\f\r "
 795     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 796     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 797     "abcdefghijklmnopqrstuvwxyz{|}~");
 798
 799   struct substring out, cr, lf, space;
 800   bool ok;
 801
 802   memset (e, 0, sizeof *e);
 803
 804   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 805   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 806   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 807   ok = (cr.length >= 1
 808         && cr.length <= MAX_UNIT
 809         && cr.length == lf.length
 810         && cr.length == space.length);
 811   if (!ok)
 812     {
 813       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 814       ss_dealloc (&cr);
 815       ss_dealloc (&lf);
 816       ss_dealloc (&space);
 817       ss_alloc_substring (&cr, ss_cstr ("\r"));
 818       ss_alloc_substring (&lf, ss_cstr ("\n"));
 819       ss_alloc_substring (&space, ss_cstr (" "));
 820     }
 821
 822   e->unit = cr.length;
 823   memcpy (e->cr, cr.string, e->unit);
 824   memcpy (e->lf, lf.string, e->unit);
 825   memcpy (e->space, space.string, e->unit);
 826
 827   ss_dealloc (&cr);
 828   ss_dealloc (&lf);
 829   ss_dealloc (&space);
 830
 831   out = recode_substring_pool ("UTF-8", name, in, NULL);
 832   e->is_ascii_compatible = ss_equals (in, out);
 833   ss_dealloc (&out);
 834
 835   if (!e->is_ascii_compatible && e->unit == 1)
 836     {
 837       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 838       e->is_ebcdic_compatible = (out.length == 1
 839                                  && (uint8_t) out.string[0] == 0xc1);
 840       ss_dealloc (&out);
 841     }
 842   else
 843     e->is_ebcdic_compatible = false;
 844
 845   return ok;
 846 }
 847
 848 bool
 849 is_encoding_ascii_compatible (const char *encoding)
 850 {
 851   struct encoding_info e;
 852
 853   get_encoding_info (&e, encoding);
 854   return e.is_ascii_compatible;
 855 }
 856
 857 bool
 858 is_encoding_ebcdic_compatible (const char *encoding)
 859 {
 860   struct encoding_info e;
 861
 862   get_encoding_info (&e, encoding);
 863   return e.is_ebcdic_compatible;
 864 }
 865
 866 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 867    otherwise false. */
 868 bool
 869 is_encoding_supported (const char *encoding)
 870 {
 871   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
 872           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 873 }
 874
 875 /* Returns true if E is the name of a UTF-8 encoding.
 876
 877    XXX Possibly we should test not E as a string but its properties via
 878    iconv. */
 879 bool
 880 is_encoding_utf8 (const char *e)
 881 {
 882   return ((e[0] == 'u' || e[0] == 'U')
 883           && (e[1] == 't' || e[1] == 'T')
 884           && (e[2] == 'f' || e[2] == 'F')
 885           && ((e[3] == '8' && e[4] == '\0')
 886               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 887 }
 888 \f
 889 static struct encoding_category *categories;
 890 static int n_categories;
 891
 892 static void SENTINEL (0)
 893 add_category (size_t *allocated_categories, const char *category, ...)
 894 {
 895   struct encoding_category *c;
 896   const char *encodings[16];
 897   va_list args;
 898   int i, n;
 899
 900   /* Count encoding arguments. */
 901   va_start (args, category);
 902   n = 0;
 903   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 904     {
 905       const char *encoding = encodings[n];
 906       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 907         n++;
 908     }
 909   assert (n < sizeof encodings / sizeof *encodings);
 910   va_end (args);
 911
 912   if (n == 0)
 913     return;
 914
 915   if (n_categories >= *allocated_categories)
 916     categories = x2nrealloc (categories,
 917                              allocated_categories, sizeof *categories);
 918
 919   c = &categories[n_categories++];
 920   c->category = category;
 921   c->encodings = xmalloc (n * sizeof *c->encodings);
 922   for (i = 0; i < n; i++)
 923     c->encodings[i] = encodings[i];
 924   c->n_encodings = n;
 925 }
 926
 927 static void
 928 init_encoding_categories (void)
 929 {
 930   static bool inited;
 931   size_t alloc;
 932
 933   if (inited)
 934     return;
 935   inited = true;
 936
 937   alloc = 0;
 938   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
 939                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
 940   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
 941                 NULL_SENTINEL);
 942   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
 943   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
 944                 "Windows-1257", NULL_SENTINEL);
 945   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
 946   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
 947                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
 948   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
 949                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
 950   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
 951                 "EUC-TW", NULL_SENTINEL);
 952   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
 953   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
 954                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
 955   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
 956   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
 957                 NULL_SENTINEL);
 958   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
 959   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
 960   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
 961   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
 962   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
 963                 NULL_SENTINEL);
 964   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
 965   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
 966   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
 967   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
 968                 NULL_SENTINEL);
 969   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
 970                 NULL_SENTINEL);
 971   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
 972   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
 973                 NULL_SENTINEL);
 974   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
 975   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
 976                 NULL_SENTINEL);
 977   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
 978                 NULL_SENTINEL);
 979   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
 980                 "Windows-1258", NULL_SENTINEL);
 981   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
 982                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
 983 }
 984
 985 /* Returns an array of "struct encoding_category" that contains only the
 986    categories and encodings that the system supports. */
 987 struct encoding_category *
 988 get_encoding_categories (void)
 989 {
 990   init_encoding_categories ();
 991   return categories;
 992 }
 993
 994 /* Returns the number of elements in the array returned by
 995    get_encoding_categories().  */
 996 size_t
 997 get_n_encoding_categories (void)
 998 {
 999   init_encoding_categories ();
1000   return n_categories;
1001 }