pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <libintl.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/compiler.h"
  35 #include "libpspp/hmapx.h"
  36 #include "libpspp/hash-functions.h"
  37 #include "libpspp/pool.h"
  38 #include "libpspp/str.h"
  39 #include "libpspp/version.h"
  40
  41 #include "gl/c-strcase.h"
  42 #include "gl/localcharset.h"
  43 #include "gl/minmax.h"
  44 #include "gl/xalloc.h"
  45 #include "gl/relocatable.h"
  46 #include "gl/xstrndup.h"
  47
  48 #include "gettext.h"
  49 #define _(msgid) gettext (msgid)
  50
/* One cached iconv conversion, identified by its pair of encoding names.
   Instances are created by create_iconv__() and released by i18n_done(). */
struct converter
 {
    char *tocode;               /* Target encoding name (owned copy). */
    char *fromcode;             /* Source encoding name (owned copy). */
    iconv_t conv;               /* Result of iconv_open (tocode, fromcode);
                                   (iconv_t) -1 if opening failed. */
    int error;                  /* errno from a failed iconv_open(), else 0.
                                   create_iconv() clears it once it has
                                   reported the failure. */
  };

/* Encoding substituted for a null TO or FROM in recode_substring_pool().
   Set by i18n_init(), replaced by set_default_encoding() and
   set_encoding_from_locale(), freed by i18n_done(). */
static char *default_encoding;

/* All "struct converter"s created so far, hashed on both encoding names. */
static struct hmapx map;
  61
  62 /* A wrapper around iconv_open */
  63 static struct converter *
  64 create_iconv__ (const char* tocode, const char* fromcode)
  65 {
  66   size_t hash;
  67   struct hmapx_node *node;
  68   struct converter *converter;
  69   assert (fromcode);
  70
  71   hash = hash_string (tocode, hash_string (fromcode, 0));
  72   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  73     if (!strcmp (tocode, converter->tocode)
  74         && !strcmp (fromcode, converter->fromcode))
  75       return converter;
  76
  77   converter = xmalloc (sizeof *converter);
  78   converter->tocode = xstrdup (tocode);
  79   converter->fromcode = xstrdup (fromcode);
  80   converter->conv = iconv_open (tocode, fromcode);
  81   converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
  82   hmapx_insert (&map, converter, hash);
  83
  84   return converter;
  85 }
  86
  87 static iconv_t
  88 create_iconv (const char* tocode, const char* fromcode)
  89 {
  90   struct converter *converter;
  91
  92   converter = create_iconv__ (tocode, fromcode);
  93
  94   /* I don't think it's safe to translate this string or to use messaging
  95      as the converters have not yet been set up */
  96   if (converter->error && strcmp (tocode, fromcode))
  97     {
  98       fprintf (stderr,
  99                "Warning: "
 100                "cannot create a converter for `%s' to `%s': %s\n",
 101                fromcode, tocode, strerror (converter->error));
 102       converter->error = 0;
 103     }
 104
 105   return converter->conv;
 106 }
 107
 108 /* Converts the single byte C from encoding FROM to TO, returning the first
 109    byte of the result.
 110
 111    This function probably shouldn't be used at all, but some code still does
 112    use it. */
 113 char
 114 recode_byte (const char *to, const char *from, char c)
 115 {
 116   char x;
 117   char *s = recode_string (to, from, &c, 1);
 118   x = s[0];
 119   free (s);
 120   return x;
 121 }
 122
 123 /* Similar to recode_string_pool, but allocates the returned value on the heap
 124    instead of in a pool.  It is the caller's responsibility to free the
 125    returned value. */
 126 char *
 127 recode_string (const char *to, const char *from,
 128                const char *text, int length)
 129 {
 130   return recode_string_pool (to, from, text, length, NULL);
 131 }
 132
 133 /* Returns the length, in bytes, of the string that a similar recode_string()
 134    call would return. */
 135 size_t
 136 recode_string_len (const char *to, const char *from,
 137                    const char *text, int length)
 138 {
 139   char *s = recode_string (to, from, text, length);
 140   size_t len = strlen (s);
 141   free (s);
 142   return len;
 143 }
 144
 145 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 146    at OP, and appends a null terminator to the output.
 147
 148    Returns the output length if successful, -1 if the output buffer is too
 149    small. */
 150 static ssize_t
 151 try_recode (iconv_t conv,
 152             const char *in, size_t inbytes,
 153             char *out_, size_t outbytes)
 154 {
 155   /* FIXME: Need to ensure that this char is valid in the target encoding */
 156   const char fallbackchar = '?';
 157   char *out = out_;
 158   int i;
 159
 160   /* Put the converter into the initial shift state, in case there was any
 161      state information left over from its last usage. */
 162   iconv (conv, NULL, 0, NULL, 0);
 163
 164   /* Do two rounds of iconv() calls:
 165
 166      - The first round does the bulk of the conversion using the
 167        caller-supplied input data..
 168
 169      - The second round flushes any leftover output.  This has a real effect
 170        with input encodings that use combining diacritics, e.g. without the
 171        second round the last character tends to gets dropped when converting
 172        from windows-1258 to other encodings.
 173   */
 174   for (i = 0; i < 2; i++)
 175     {
 176       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 177       size_t *inbytesp = i ? NULL : &inbytes;
 178
 179       while (iconv (conv, inp, inbytesp, &out, &outbytes) == -1)
 180         switch (errno)
 181           {
 182           case EINVAL:
 183             if (outbytes < 2)
 184               return -1;
 185             *out++ = fallbackchar;
 186             *out = '\0';
 187             return out - out_;
 188
 189           case EILSEQ:
 190             if (outbytes == 0)
 191               return -1;
 192             *out++ = fallbackchar;
 193             outbytes--;
 194             if (inp)
 195               {
 196                 in++;
 197                 inbytes--;
 198               }
 199             break;
 200
 201           case E2BIG:
 202             return -1;
 203
 204           default:
 205             /* should never happen */
 206             fprintf (stderr, "Character conversion error: %s\n",
 207                      strerror (errno));
 208             NOT_REACHED ();
 209             break;
 210           }
 211     }
 212
 213   if (outbytes == 0)
 214     return -1;
 215
 216   *out = '\0';
 217   return out - out_;
 218 }
 219
 220 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 221    dynamically allocated string in TO-encoding.  Any characters which cannot be
 222    converted will be represented by '?'.
 223
 224    LENGTH should be the length of the string or -1, if null terminated.
 225
 226    The returned string will be allocated on POOL.
 227
 228    This function's behaviour differs from that of g_convert_with_fallback
 229    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 230    the input string is not valid in the declared input encoding.  This function
 231    however perseveres even in the presence of badly encoded input. */
 232 char *
 233 recode_string_pool (const char *to, const char *from,
 234                     const char *text, int length, struct pool *pool)
 235 {
 236   struct substring out;
 237
 238   if ( text == NULL )
 239     return NULL;
 240
 241   if ( length == -1 )
 242      length = strlen (text);
 243
 244   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 245   return out.string;
 246 }
 247
 248 /* Returns the name of the encoding that should be used for file names.
 249
 250    This is meant to be the same encoding used by g_filename_from_uri() and
 251    g_filename_to_uri() in GLib. */
 252 static const char *
 253 filename_encoding (void)
 254 {
 255 #if defined _WIN32 || defined __WIN32__
 256   return "UTF-8";
 257 #else
 258   return locale_charset ();
 259 #endif
 260 }
 261
 262 static char *
 263 xconcat2 (const char *a, size_t a_len,
 264           const char *b, size_t b_len)
 265 {
 266   char *s = xmalloc (a_len + b_len + 1);
 267   memcpy (s, a, a_len);
 268   memcpy (s + a_len, b, b_len);
 269   s[a_len + b_len] = '\0';
 270   return s;
 271 }
 272
/* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
   TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
   ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
   it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
   HEAD and tries again, repeating as necessary until the concatenated result
   fits or until HEAD_LEN reaches 0.

   [*] Actually this function drops grapheme clusters instead of characters, so
       that, e.g. a Unicode character followed by a combining accent character
       is either completely included or completely excluded from HEAD_LEN.  See
       UAX #29 at http://unicode.org/reports/tr29/ for more information on
       grapheme clusters.

   A null ENCODING is treated as UTF-8.

   Sometimes this function has to actually construct the concatenated string to
   measure its length.  When this happens, it sets *RESULTP to that
   null-terminated string, allocated with malloc(), for the caller to use if it
   needs it.  Otherwise, it sets *RESULTP to NULL.

   Simple examples for encoding="UTF-8", max_len=6:

       head="abc",  tail="xyz"     => 3
       head="abcd", tail="xyz"     => 3 ("d" dropped).
       head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
       head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).

   Examples for encoding="ISO-8859-1", max_len=6:

       head="éèä",  tail="xyz"     => 6
         (each letter in head is only 1 byte in ISO-8859-1 even though they
          each take 2 bytes in UTF-8 encoding)
*/
static size_t
utf8_encoding_concat__ (const char *head, size_t head_len,
                        const char *tail, size_t tail_len,
                        const char *encoding, size_t max_len,
                        char **resultp)
{
  /* Default: no concatenated string is handed back to the caller. */
  *resultp = NULL;
  if (head_len == 0)
    return 0;
  else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
    {
      /* Target is UTF-8 itself, so lengths can be compared directly without
         any recoding. */
      if (head_len + tail_len <= max_len)
        return head_len;
      else if (tail_len >= max_len)
        return 0;               /* TAIL alone uses up the whole budget. */
      else
        {
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Walk HEAD one character at a time while the offset still fits in
             the max_len - tail_len bytes left over for it.  COPY_LEN ends up
             as the last offset that is a grapheme cluster boundary. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= max_len - tail_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                copy_len = ofs;

              prev = next;
            }

          return copy_len;
        }
    }
  else
    {
      char *result;

      /* With an empty TAIL the concatenation is HEAD itself, so RESULT just
         aliases HEAD instead of copying it.  Every write to or free of RESULT
         below is guarded by "result != head". */
      result = (tail_len > 0
                ? xconcat2 (head, head_len, tail, tail_len)
                : CONST_CAST (char *, head));
      if (recode_string_len (encoding, "UTF-8", result,
                             head_len + tail_len) <= max_len)
        {
          /* Everything fits.  Hand back the string only if we built one. */
          *resultp = result != head ? result : NULL;
          return head_len;
        }
      else
        {
          /* CORRECT_RESULT is true only while the contents of RESULT are the
             concatenation for the current COPY_LEN, that is, while the most
             recently tested prefix was one that fit. */
          bool correct_result = false;
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Try each grapheme cluster boundary in HEAD, shortest prefix
             first, keeping the last prefix length whose recoded concatenation
             with TAIL fits in MAX_LEN. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= head_len;
               ofs += mblen)
            {
              ucs4_t next;

              /* NOTE(review): on the final iteration ofs == head_len, so the
                 length passed here is 0 and the byte examined is
                 head[head_len] -- presumably the null terminator, since the
                 callers in this file pass strlen (head).  Confirm this is
                 acceptable to u8_mbtouc(). */
              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                {
                  if (result != head)
                    {
                      /* Rebuild the candidate: OFS bytes of HEAD, then TAIL. */
                      memcpy (result, head, ofs);
                      memcpy (result + ofs, tail, tail_len);
                      result[ofs + tail_len] = '\0';
                    }

                  if (recode_string_len (encoding, "UTF-8", result,
                                         ofs + tail_len) <= max_len)
                    {
                      correct_result = true;
                      copy_len = ofs;
                    }
                  else
                    correct_result = false;
                }

              prev = next;
            }

          if (result != head)
            {
              /* Give the buffer to the caller only if it holds the answer;
                 otherwise it is stale and must be released here. */
              if (correct_result)
                *resultp = result;
              else
                free (result);
            }

          return copy_len;
        }
    }
}
 414
 415 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 416    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 417    string are all encoded in UTF-8.  As many characters[*] from the beginning
 418    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 419    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 420    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 421
 422    [*] Actually this function drops grapheme clusters instead of characters, so
 423        that, e.g. a Unicode character followed by a combining accent character
 424        is either completely included or completely excluded from the returned
 425        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 426        information on grapheme clusters.
 427
 428    A null ENCODING is treated as UTF-8.
 429
 430    Simple examples for encoding="UTF-8", max_len=6:
 431
 432        head="abc",  tail="xyz"     => "abcxyz"
 433        head="abcd", tail="xyz"     => "abcxyz"
 434        head="abc",  tail="uvwxyz"  => "uvwxyz"
 435        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 436
 437    Examples for encoding="ISO-8859-1", max_len=6:
 438
 439        head="éèä",  tail="xyz"    => "éèäxyz"
 440          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 441           each take 2 bytes in UTF-8 encoding)
 442 */
 443 char *
 444 utf8_encoding_concat (const char *head, const char *tail,
 445                       const char *encoding, size_t max_len)
 446 {
 447   size_t tail_len = strlen (tail);
 448   size_t prefix_len;
 449   char *result;
 450
 451   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 452                                        encoding, max_len, &result);
 453   return (result != NULL
 454           ? result
 455           : xconcat2 (head, prefix_len, tail, tail_len));
 456 }
 457
 458 /* Returns the length, in bytes, of the string that would be returned by
 459    utf8_encoding_concat() if passed the same arguments, but the implementation
 460    is often more efficient. */
 461 size_t
 462 utf8_encoding_concat_len (const char *head, const char *tail,
 463                           const char *encoding, size_t max_len)
 464 {
 465   size_t tail_len = strlen (tail);
 466   size_t prefix_len;
 467   char *result;
 468
 469   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 470                                        encoding, max_len, &result);
 471   free (result);
 472   return prefix_len + tail_len;
 473 }
 474
 475 /* Returns an allocated, null-terminated string, owned by the caller,
 476    containing as many characters[*] from the beginning of S that would fit
 477    within MAX_LEN bytes if the returned string were to be re-encoded in
 478    ENCODING.  Both S and the returned string are encoded in UTF-8.
 479
 480    [*] Actually this function drops grapheme clusters instead of characters, so
 481        that, e.g. a Unicode character followed by a combining accent character
 482        is either completely included or completely excluded from the returned
 483        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 484        information on grapheme clusters.
 485
 486    A null ENCODING is treated as UTF-8.
 487 */
 488 char *
 489 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 490 {
 491   return utf8_encoding_concat (s, "", encoding, max_len);
 492 }
 493
 494 /* Returns the length, in bytes, of the string that would be returned by
 495    utf8_encoding_trunc() if passed the same arguments, but the implementation
 496    is often more efficient. */
 497 size_t
 498 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 499 {
 500   return utf8_encoding_concat_len (s, "", encoding, max_len);
 501 }
 502
 503 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 504    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 505    current locale. */
 506 char *
 507 utf8_to_filename (const char *filename)
 508 {
 509   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 510 }
 511
 512 /* Returns FILENAME converted from the filename encoding to UTF-8.
 513    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 514    current locale. */
 515 char *
 516 filename_to_utf8 (const char *filename)
 517 {
 518   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 519 }
 520
 521 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 522    dynamically allocated string in TO-encoding.  Any characters which cannot be
 523    converted will be represented by '?'.
 524
 525    The returned string will be null-terminated and allocated on POOL with
 526    pool_malloc().
 527
 528    This function's behaviour differs from that of g_convert_with_fallback
 529    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 530    the input string is not valid in the declared input encoding.  This function
 531    however perseveres even in the presence of badly encoded input. */
 532 struct substring
 533 recode_substring_pool (const char *to, const char *from,
 534                        struct substring text, struct pool *pool)
 535 {
 536   size_t outbufferlength;
 537   iconv_t conv ;
 538
 539   if (to == NULL)
 540     to = default_encoding;
 541
 542   if (from == NULL)
 543     from = default_encoding;
 544
 545   conv = create_iconv (to, from);
 546
 547   if ( (iconv_t) -1 == conv )
 548     {
 549       struct substring out;
 550
 551       out.string = pool_malloc (pool, text.length + 1);
 552       out.length = text.length;
 553       memcpy (out.string, text.string, text.length);
 554       out.string[out.length] = '\0';
 555       return out;
 556     }
 557
 558   for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
 559     if ( outbufferlength > text.length)
 560       {
 561         char *output = pool_malloc (pool, outbufferlength);
 562         ssize_t output_len = try_recode (conv, text.string, text.length,
 563                                          output, outbufferlength);
 564         if (output_len >= 0)
 565           return ss_buffer (output, output_len);
 566         pool_free (pool, output);
 567       }
 568
 569   NOT_REACHED ();
 570 }
 571
 572 void
 573 i18n_init (void)
 574 {
 575   setlocale (LC_ALL, "");
 576   bindtextdomain (PACKAGE, relocate(locale_dir));
 577   textdomain (PACKAGE);
 578
 579   assert (default_encoding == NULL);
 580   default_encoding = xstrdup (locale_charset ());
 581
 582   hmapx_init (&map);
 583 }
 584
 585 const char *
 586 get_default_encoding (void)
 587 {
 588   return default_encoding;
 589 }
 590
 591 void
 592 set_default_encoding (const char *enc)
 593 {
 594   free (default_encoding);
 595   default_encoding = xstrdup (enc);
 596 }
 597
 598
 599 /* Attempts to set the encoding from a locale name
 600    returns true if successfull.
 601    This function does not (should not!) alter the current locale.
 602 */
 603 bool
 604 set_encoding_from_locale (const char *loc)
 605 {
 606   bool ok = true;
 607   char *c_encoding;
 608   char *loc_encoding;
 609   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 610
 611   setlocale (LC_CTYPE, "C");
 612   c_encoding = xstrdup (locale_charset ());
 613
 614   setlocale (LC_CTYPE, loc);
 615   loc_encoding = xstrdup (locale_charset ());
 616
 617
 618   if ( 0 == strcmp (loc_encoding, c_encoding))
 619     {
 620       ok = false;
 621     }
 622
 623   setlocale (LC_CTYPE, tmp);
 624
 625   free (tmp);
 626
 627   if (ok)
 628     {
 629       free (default_encoding);
 630       default_encoding = loc_encoding;
 631     }
 632   else
 633     free (loc_encoding);
 634
 635   free (c_encoding);
 636
 637   return ok;
 638 }
 639
 640 void
 641 i18n_done (void)
 642 {
 643   struct hmapx_node *node;
 644   struct converter *cvtr;
 645
 646   HMAPX_FOR_EACH (cvtr, node, &map)
 647     {
 648       free (cvtr->tocode);
 649       free (cvtr->fromcode);
 650       if (cvtr->conv != (iconv_t) -1)
 651         iconv_close (cvtr->conv);
 652       free (cvtr);
 653     }
 654
 655   hmapx_destroy (&map);
 656
 657   free (default_encoding);
 658   default_encoding = NULL;
 659 }
 660
 661
 662
 663 bool
 664 valid_encoding (const char *enc)
 665 {
 666   iconv_t conv = iconv_open (UTF8, enc);
 667
 668   if ( conv == (iconv_t) -1)
 669     return false;
 670
 671   iconv_close (conv);
 672
 673   return true;
 674 }
 675
 676
 677 /* Return the system local's idea of the
 678    decimal seperator character */
 679 char
 680 get_system_decimal (void)
 681 {
 682   char radix_char;
 683
 684 #if HAVE_NL_LANGINFO
 685   radix_char = nl_langinfo (RADIXCHAR)[0];
 686 #else
 687   {
 688     char buf[10];
 689     snprintf (buf, sizeof buf, "%f", 2.5);
 690     radix_char = buf[1];
 691   }
 692 #endif
 693
 694   return radix_char;
 695 }
 696
 697 const char *
 698 uc_name (ucs4_t uc, char buffer[16])
 699 {
 700   if (uc >= 0x20 && uc < 0x7f)
 701     snprintf (buffer, 16, "`%c'", uc);
 702   else
 703     snprintf (buffer, 16, "U+%04X", uc);
 704   return buffer;
 705 }
 706 \f
 707 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 708
 709 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 710    with lowercase and uppercase letters treated as equal, starting from
 711    BASIS. */
 712 unsigned int
 713 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 714 {
 715   uint8_t folded_buf[2048];
 716   size_t folded_len = sizeof folded_buf;
 717   uint8_t *folded_s;
 718   unsigned int hash;
 719
 720   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 721                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 722   if (folded_s != NULL)
 723     {
 724       hash = hash_bytes (folded_s, folded_len, basis);
 725       if (folded_s != folded_buf)
 726         free (folded_s);
 727     }
 728   else
 729     {
 730       if (errno == ENOMEM)
 731         xalloc_die ();
 732       hash = hash_bytes (s, n, basis);
 733     }
 734
 735   return hash;
 736 }
 737
 738 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 739    uppercase letters treated as equal, starting from BASIS. */
 740 unsigned int
 741 utf8_hash_case_string (const char *s, unsigned int basis)
 742 {
 743   return utf8_hash_case_bytes (s, strlen (s), basis);
 744 }
 745
 746 /* Compares UTF-8 strings A and B case-insensitively.
 747    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 748 int
 749 utf8_strcasecmp (const char *a, const char *b)
 750 {
 751   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 752 }
 753
 754 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 755    case-insensitively.
 756    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 757 int
 758 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 759 {
 760   int result;
 761
 762   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 763                   CHAR_CAST (const uint8_t *, b), bn,
 764                   NULL, UNINORM_NFKD, &result))
 765     {
 766       if (errno == ENOMEM)
 767         xalloc_die ();
 768
 769       result = memcmp (a, b, MIN (an, bn));
 770       if (result == 0)
 771         result = an < bn ? -1 : an > bn;
 772     }
 773
 774   return result;
 775 }
 776
 777 static char *
 778 utf8_casemap (const char *s,
 779               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 780                              uint8_t *, size_t *))
 781 {
 782   char *result;
 783   size_t size;
 784
 785   result = CHAR_CAST (char *,
 786                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 787                          NULL, NULL, NULL, &size));
 788   if (result == NULL)
 789     {
 790       if (errno == ENOMEM)
 791         xalloc_die ();
 792
 793       result = xstrdup (s);
 794     }
 795   return result;
 796 }
 797
 798 char *
 799 utf8_to_upper (const char *s)
 800 {
 801   return utf8_casemap (s, u8_toupper);
 802 }
 803
 804 char *
 805 utf8_to_lower (const char *s)
 806 {
 807   return utf8_casemap (s, u8_tolower);
 808 }
 809 \f
 810 bool
 811 get_encoding_info (struct encoding_info *e, const char *name)
 812 {
 813   const struct substring in = SS_LITERAL_INITIALIZER (
 814     "\t\n\v\f\r "
 815     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 816     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 817     "abcdefghijklmnopqrstuvwxyz{|}~");
 818
 819   struct substring out, cr, lf, space;
 820   bool ok;
 821
 822   memset (e, 0, sizeof *e);
 823
 824   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 825   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 826   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 827   ok = (cr.length >= 1
 828         && cr.length <= MAX_UNIT
 829         && cr.length == lf.length
 830         && cr.length == space.length);
 831   if (!ok)
 832     {
 833       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 834       ss_dealloc (&cr);
 835       ss_dealloc (&lf);
 836       ss_dealloc (&space);
 837       ss_alloc_substring (&cr, ss_cstr ("\r"));
 838       ss_alloc_substring (&lf, ss_cstr ("\n"));
 839       ss_alloc_substring (&space, ss_cstr (" "));
 840     }
 841
 842   e->unit = cr.length;
 843   memcpy (e->cr, cr.string, e->unit);
 844   memcpy (e->lf, lf.string, e->unit);
 845   memcpy (e->space, space.string, e->unit);
 846
 847   ss_dealloc (&cr);
 848   ss_dealloc (&lf);
 849   ss_dealloc (&space);
 850
 851   out = recode_substring_pool ("UTF-8", name, in, NULL);
 852   e->is_ascii_compatible = ss_equals (in, out);
 853   ss_dealloc (&out);
 854
 855   if (!e->is_ascii_compatible && e->unit == 1)
 856     {
 857       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 858       e->is_ebcdic_compatible = (out.length == 1
 859                                  && (uint8_t) out.string[0] == 0xc1);
 860       ss_dealloc (&out);
 861     }
 862   else
 863     e->is_ebcdic_compatible = false;
 864
 865   return ok;
 866 }
 867
 868 bool
 869 is_encoding_ascii_compatible (const char *encoding)
 870 {
 871   struct encoding_info e;
 872
 873   get_encoding_info (&e, encoding);
 874   return e.is_ascii_compatible;
 875 }
 876
 877 bool
 878 is_encoding_ebcdic_compatible (const char *encoding)
 879 {
 880   struct encoding_info e;
 881
 882   get_encoding_info (&e, encoding);
 883   return e.is_ebcdic_compatible;
 884 }
 885
 886 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 887    otherwise false. */
 888 bool
 889 is_encoding_supported (const char *encoding)
 890 {
 891   return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
 892           && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
 893 }
 894
 895 /* Returns true if E is the name of a UTF-8 encoding.
 896
 897    XXX Possibly we should test not E as a string but its properties via
 898    iconv. */
 899 bool
 900 is_encoding_utf8 (const char *e)
 901 {
 902   return ((e[0] == 'u' || e[0] == 'U')
 903           && (e[1] == 't' || e[1] == 'T')
 904           && (e[2] == 'f' || e[2] == 'F')
 905           && ((e[3] == '8' && e[4] == '\0')
 906               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 907 }
 908 \f
 909 static struct encoding_category *categories;
 910 static int n_categories;
 911
 912 static void SENTINEL (0)
 913 add_category (size_t *allocated_categories, const char *category, ...)
 914 {
 915   struct encoding_category *c;
 916   const char *encodings[16];
 917   va_list args;
 918   int i, n;
 919
 920   /* Count encoding arguments. */
 921   va_start (args, category);
 922   n = 0;
 923   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 924     {
 925       const char *encoding = encodings[n];
 926       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 927         n++;
 928     }
 929   assert (n < sizeof encodings / sizeof *encodings);
 930   va_end (args);
 931
 932   if (n == 0)
 933     return;
 934
 935   if (n_categories >= *allocated_categories)
 936     categories = x2nrealloc (categories,
 937                              allocated_categories, sizeof *categories);
 938
 939   c = &categories[n_categories++];
 940   c->category = category;
 941   c->encodings = xmalloc (n * sizeof *c->encodings);
 942   for (i = 0; i < n; i++)
 943     c->encodings[i] = encodings[i];
 944   c->n_encodings = n;
 945 }
 946
/* Builds the 'categories' array on the first call; later calls do nothing.

   Each add_category() call names one category and lists its candidate
   encodings.  add_category() keeps only the encodings that are supported and
   skips the category entirely if none is, so the resulting array depends on
   the system's iconv.  Categories appear in the array in the order of the
   calls below.

   All category names except "Unicode" are passed through _() for
   translation. */
static void
init_encoding_categories (void)
{
  static bool inited;
  size_t alloc;

  if (inited)
    return;
  inited = true;

  /* Capacity of 'categories', maintained by add_category(). */
  alloc = 0;
  add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
                "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
  add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
                NULL_SENTINEL);
  add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
  add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
                "Windows-1257", NULL_SENTINEL);
  add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
  add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
                "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
  add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
                "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
  add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
                "EUC-TW", NULL_SENTINEL);
  add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
                "KOI8-R", "MacCyrillic", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
  add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
                NULL_SENTINEL);
  add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
  add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
  add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
  add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
  add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
                NULL_SENTINEL);
  add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
  /* NOTE(review): "MacDevangari" looks like a misspelling of "MacDevanagari".
     If so, this name is never recognized and the Hindi category is always
     dropped -- confirm against the encoding names iconv accepts. */
  add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
  add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
  add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
                NULL_SENTINEL);
  add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
                NULL_SENTINEL);
  add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
  add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
                NULL_SENTINEL);
  add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
  add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
                NULL_SENTINEL);
  add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
                NULL_SENTINEL);
  add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
                "Windows-1258", NULL_SENTINEL);
  add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
                "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
}
1004
1005 /* Returns an array of "struct encoding_category" that contains only the
1006    categories and encodings that the system supports. */
1007 struct encoding_category *
1008 get_encoding_categories (void)
1009 {
1010   init_encoding_categories ();
1011   return categories;
1012 }
1013
1014 /* Returns the number of elements in the array returned by
1015    get_encoding_categories().  */
1016 size_t
1017 get_n_encoding_categories (void)
1018 {
1019   init_encoding_categories ();
1020   return n_categories;
1021 }