pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <locale.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <unicase.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/compiler.h"
  34 #include "libpspp/hmapx.h"
  35 #include "libpspp/hash-functions.h"
  36 #include "libpspp/pool.h"
  37 #include "libpspp/str.h"
  38 #include "libpspp/version.h"
  39
  40 #include "gl/c-ctype.h"
  41 #include "gl/c-strcase.h"
  42 #include "gl/localcharset.h"
  43 #include <gl/localename.h>
  44 #include "gl/minmax.h"
  45 #include "gl/xalloc.h"
  46 #include "gl/relocatable.h"
  47 #include "gl/xstrndup.h"
  48
  49 #include "gettext.h"
  50 #define _(msgid) gettext (msgid)
  51
/* A cached iconv conversion from encoding FROMCODE to encoding TOCODE.
   Instances are created by create_iconv() and released by i18n_done(). */
struct converter
{
  char *tocode;                 /* Target encoding name (owned copy). */
  char *fromcode;               /* Source encoding name (owned copy). */
  iconv_t conv;                 /* Descriptor returned by iconv_open(). */
  int null_char_width;          /* Bytes iconv emits for an ASCII NUL
                                   converted to TOCODE. */
};
  59
/* Encoding name substituted when a caller passes a null encoding.  Set by
   i18n_init(), replaced by set_default_encoding() and
   set_encoding_from_locale(), freed by i18n_done(). */
static char *default_encoding;

/* Cache of "struct converter"s, keyed on a hash of the encoding names.  A
   null data pointer records a pair for which create_iconv() failed. */
static struct hmapx map;
  62
  63 /* A wrapper around iconv_open */
  64 static struct converter *
  65 create_iconv (const char* tocode, const char* fromcode)
  66 {
  67   size_t hash;
  68   struct hmapx_node *node;
  69   struct converter *converter;
  70   assert (fromcode);
  71
  72   hash = hash_string (tocode, hash_string (fromcode, 0));
  73   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  74     {
  75       if (!converter)
  76         return NULL;
  77
  78       if (!strcmp (tocode, converter->tocode)
  79           && !strcmp (fromcode, converter->fromcode))
  80         return converter;
  81     }
  82
  83   converter = xmalloc (sizeof *converter);
  84   converter->tocode = xstrdup (tocode);
  85   converter->fromcode = xstrdup (fromcode);
  86   converter->conv = iconv_open (tocode, fromcode);
  87   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
  88   /* I don't think it's safe to translate this string or to use messaging
  89      as the converters have not yet been set up */
  90   if (error && strcmp (tocode, fromcode))
  91     {
  92       fprintf (stderr,
  93                "Warning: "
  94                "cannot create a converter for `%s' to `%s': %s\n",
  95                fromcode, tocode, strerror (error));
  96
  97       free (converter->tocode);
  98       free (converter->fromcode);
  99       free (converter);
 100
 101       hmapx_insert (&map, NULL, hash);
 102       return NULL;
 103     }
 104
 105   /* Find out how many bytes there are in a null char in the target
 106      encoding */
 107   iconv_t bconv = iconv_open (tocode, "ASCII");
 108   if (bconv != (iconv_t) -1)
 109     {
 110       ICONV_CONST char inbuf[1] = "";
 111       ICONV_CONST char *inptr = inbuf;
 112       size_t inbytes = sizeof inbuf;
 113
 114       char outbuf[8];
 115       char *outptr = outbuf;
 116       size_t outbytes = sizeof outbuf;
 117       if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
 118         converter->null_char_width = outptr - outbuf;
 119       iconv_close (bconv);
 120     }
 121
 122   hmapx_insert (&map, converter, hash);
 123
 124   return converter;
 125 }
 126
 127
 128 /* Converts the single byte C from encoding FROM to TO, returning the first
 129    byte of the result.
 130
 131    This function probably shouldn't be used at all, but some code still does
 132    use it. */
 133 char
 134 recode_byte (const char *to, const char *from, char c)
 135 {
 136   char x;
 137   char *s = recode_string (to, from, &c, 1);
 138   x = s[0];
 139   free (s);
 140   return x;
 141 }
 142
 143 /* Similar to recode_string_pool, but allocates the returned value on the heap
 144    instead of in a pool.  It is the caller's responsibility to free the
 145    returned value. */
 146 char *
 147 recode_string (const char *to, const char *from,
 148                const char *text, int length)
 149 {
 150   return recode_string_pool (to, from, text, length, NULL);
 151 }
 152
 153 /* Returns the length, in bytes, of the string that a similar recode_string()
 154    call would return. */
 155 size_t
 156 recode_string_len (const char *to, const char *from,
 157                    const char *text, int length)
 158 {
 159   char *s = recode_string (to, from, text, length);
 160   size_t len = strlen (s);
 161   free (s);
 162   return len;
 163 }
 164
 165 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 166    at OP, and appends a null terminator to the output.
 167
 168    Returns the output length if successful, -1 if the output buffer is too
 169    small. */
 170 static ssize_t
 171 try_recode (struct converter *cvtr, char fallbackchar,
 172             const char *in, size_t inbytes,
 173             char *out_, size_t outbytes)
 174 {
 175   char *out = out_;
 176   int i, j;
 177
 178   int null_bytes = cvtr->null_char_width;
 179
 180   /* Put the converter into the initial shift state, in case there was any
 181      state information left over from its last usage. */
 182   iconv (cvtr->conv, NULL, 0, NULL, 0);
 183
 184   /* Do two rounds of iconv() calls:
 185
 186      - The first round does the bulk of the conversion using the
 187      caller-supplied input data..
 188
 189      - The second round flushes any leftover output.  This has a real effect
 190      with input encodings that use combining diacritics, e.g. without the
 191      second round the last character tends to gets dropped when converting
 192      from windows-1258 to other encodings.
 193   */
 194   for (i = 0; i < 2; i++)
 195     {
 196       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 197       size_t *inbytesp = i ? NULL : &inbytes;
 198
 199       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 200         switch (errno)
 201           {
 202           case EINVAL:
 203             if (outbytes < null_bytes + 1)
 204               return -E2BIG;
 205             if (!fallbackchar)
 206               return -EINVAL;
 207             *out++ = fallbackchar;
 208             for (j = 0 ; j < null_bytes ; ++j)
 209               *out++ = '\0';
 210             return out - 1 - out_;
 211
 212           case EILSEQ:
 213             if (outbytes == 0)
 214               return -E2BIG;
 215             if (!fallbackchar)
 216               return -EILSEQ;
 217             *out++ = fallbackchar;
 218             outbytes--;
 219             if (inp)
 220               {
 221                 in++;
 222                 inbytes--;
 223               }
 224             break;
 225
 226           case E2BIG:
 227             return -E2BIG;
 228
 229           default:
 230             /* should never happen */
 231             fprintf (stderr, "Character conversion error: %s\n",
 232                      strerror (errno));
 233             NOT_REACHED ();
 234             break;
 235           }
 236     }
 237
 238   if (outbytes <= null_bytes - 1)
 239     return -E2BIG;
 240
 241   for (i = 0 ; i < null_bytes ; ++i)
 242     *out++ = '\0';
 243
 244   return out - 1 - out_;
 245 }
 246
 247 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 248    dynamically allocated string in TO-encoding.  Any characters which cannot be
 249    converted will be represented by '?'.
 250
 251    LENGTH should be the length of the string or -1, if null terminated.
 252
 253    The returned string will be allocated on POOL.
 254
 255    This function's behaviour differs from that of g_convert_with_fallback
 256    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 257    the input string is not valid in the declared input encoding.  This function
 258    however perseveres even in the presence of badly encoded input. */
 259 char *
 260 recode_string_pool (const char *to, const char *from,
 261                     const char *text, int length, struct pool *pool)
 262 {
 263   struct substring out;
 264
 265   if (text == NULL)
 266     return NULL;
 267
 268   if (length == -1)
 269     length = strlen (text);
 270
 271   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 272   return out.string;
 273 }
 274
 275 /* Returns the name of the encoding that should be used for file names.
 276
 277    This is meant to be the same encoding used by g_filename_from_uri() and
 278    g_filename_to_uri() in GLib. */
 279 static const char *
 280 filename_encoding (void)
 281 {
 282 #if defined _WIN32 || defined __WIN32__
 283   return "UTF-8";
 284 #else
 285   return locale_charset ();
 286 #endif
 287 }
 288
 289 static char *
 290 xconcat2 (const char *a, size_t a_len,
 291           const char *b, size_t b_len)
 292 {
 293   char *s = xmalloc (a_len + b_len + 1);
 294   memcpy (s, a, a_len);
 295   memcpy (s + a_len, b, b_len);
 296   s[a_len + b_len] = '\0';
 297   return s;
 298 }
 299
/* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
   TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
   ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
   it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
   HEAD and tries again, repeating as necessary until the concatenated result
   fits or until HEAD_LEN reaches 0.

   [*] Actually this function drops grapheme clusters instead of characters, so
   that, e.g. a Unicode character followed by a combining accent character
   is either completely included or completely excluded from HEAD_LEN.  See
   UAX #29 at http://unicode.org/reports/tr29/ for more information on
   grapheme clusters.

   A null ENCODING is treated as UTF-8.

   Sometimes this function has to actually construct the concatenated string to
   measure its length.  When this happens, it sets *RESULTP to that
   null-terminated string, allocated with malloc(), for the caller to use if it
   needs it.  Otherwise, it sets *RESULTP to NULL.

   Simple examples for encoding="UTF-8", max_len=6:

   head="abc",  tail="xyz"     => 3
   head="abcd", tail="xyz"     => 3 ("d" dropped).
   head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
   head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).

   Examples for encoding="ISO-8859-1", max_len=6:

   head="éèä",  tail="xyz"     => 6
   (each letter in head is only 1 byte in ISO-8859-1 even though they
   each take 2 bytes in UTF-8 encoding)
*/
static size_t
utf8_encoding_concat__ (const char *head, size_t head_len,
                        const char *tail, size_t tail_len,
                        const char *encoding, size_t max_len,
                        char **resultp)
{
  *resultp = NULL;
  if (head_len == 0)
    return 0;
  else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
    {
      /* The target encoding is UTF-8, so lengths can be computed directly
         from the input without recoding anything. */
      if (head_len + tail_len <= max_len)
        return head_len;
      else if (tail_len >= max_len)
        return 0;
      else
        {
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Walk HEAD one character at a time for as long as the prefix
             before OFS still leaves room for TAIL, remembering in COPY_LEN
             the last offset that falls on a grapheme cluster boundary. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= max_len - tail_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                copy_len = ofs;

              prev = next;
            }

          return copy_len;
        }
    }
  else
    {
      char *result;

      /* With an empty TAIL, HEAD itself is the concatenation, so no copy is
         made; "result != head" below distinguishes the two cases. */
      result = (tail_len > 0
                ? xconcat2 (head, head_len, tail, tail_len)
                : CONST_CAST (char *, head));
      if (recode_string_len (encoding, "UTF-8", result,
                             head_len + tail_len) <= max_len)
        {
          /* Everything fits.  Hand over the concatenation if one was
             built. */
          *resultp = result != head ? result : NULL;
          return head_len;
        }
      else
        {
          /* True only while RESULT holds the concatenation for the current
             value of COPY_LEN, i.e. while RESULT is what the caller
             wants. */
          bool correct_result = false;
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Try every grapheme cluster boundary in HEAD in increasing
             order, recoding the candidate each time, and keep the last
             prefix length that fits.

             NOTE(review): the loop also runs with ofs == head_len, so
             u8_mbtouc() is passed head + head_len with a length of 0.  This
             presumably relies on HEAD being null-terminated there --
             confirm. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= head_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                {
                  if (result != head)
                    {
                      /* Rebuild the candidate in place: the first OFS bytes
                         of HEAD followed by all of TAIL. */
                      memcpy (result, head, ofs);
                      memcpy (result + ofs, tail, tail_len);
                      result[ofs + tail_len] = '\0';
                    }

                  if (recode_string_len (encoding, "UTF-8", result,
                                         ofs + tail_len) <= max_len)
                    {
                      correct_result = true;
                      copy_len = ofs;
                    }
                  else
                    correct_result = false;
                }

              prev = next;
            }

          /* Give the buffer to the caller only if it still matches
             COPY_LEN; otherwise release it. */
          if (result != head)
            {
              if (correct_result)
                *resultp = result;
              else
                free (result);
            }

          return copy_len;
        }
    }
}
 441
 442 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 443    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 444    string are all encoded in UTF-8.  As many characters[*] from the beginning
 445    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 446    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 447    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 448
 449    [*] Actually this function drops grapheme clusters instead of characters, so
 450    that, e.g. a Unicode character followed by a combining accent character
 451    is either completely included or completely excluded from the returned
 452    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 453    information on grapheme clusters.
 454
 455    A null ENCODING is treated as UTF-8.
 456
 457    Simple examples for encoding="UTF-8", max_len=6:
 458
 459    head="abc",  tail="xyz"     => "abcxyz"
 460    head="abcd", tail="xyz"     => "abcxyz"
 461    head="abc",  tail="uvwxyz"  => "uvwxyz"
 462    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 463
 464    Examples for encoding="ISO-8859-1", max_len=6:
 465
 466    head="éèä",  tail="xyz"    => "éèäxyz"
 467    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 468    each take 2 bytes in UTF-8 encoding)
 469 */
 470 char *
 471 utf8_encoding_concat (const char *head, const char *tail,
 472                       const char *encoding, size_t max_len)
 473 {
 474   size_t tail_len = strlen (tail);
 475   size_t prefix_len;
 476   char *result;
 477
 478   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 479                                        encoding, max_len, &result);
 480   return (result != NULL
 481           ? result
 482           : xconcat2 (head, prefix_len, tail, tail_len));
 483 }
 484
 485 /* Returns the length, in bytes, of the string that would be returned by
 486    utf8_encoding_concat() if passed the same arguments, but the implementation
 487    is often more efficient. */
 488 size_t
 489 utf8_encoding_concat_len (const char *head, const char *tail,
 490                           const char *encoding, size_t max_len)
 491 {
 492   size_t tail_len = strlen (tail);
 493   size_t prefix_len;
 494   char *result;
 495
 496   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 497                                        encoding, max_len, &result);
 498   free (result);
 499   return prefix_len + tail_len;
 500 }
 501
 502 /* Returns an allocated, null-terminated string, owned by the caller,
 503    containing as many characters[*] from the beginning of S that would fit
 504    within MAX_LEN bytes if the returned string were to be re-encoded in
 505    ENCODING.  Both S and the returned string are encoded in UTF-8.
 506
 507    [*] Actually this function drops grapheme clusters instead of characters, so
 508    that, e.g. a Unicode character followed by a combining accent character
 509    is either completely included or completely excluded from the returned
 510    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 511    information on grapheme clusters.
 512
 513    A null ENCODING is treated as UTF-8.
 514 */
 515 char *
 516 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 517 {
 518   return utf8_encoding_concat (s, "", encoding, max_len);
 519 }
 520
 521 /* Returns the length, in bytes, of the string that would be returned by
 522    utf8_encoding_trunc() if passed the same arguments, but the implementation
 523    is often more efficient. */
 524 size_t
 525 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 526 {
 527   return utf8_encoding_concat_len (s, "", encoding, max_len);
 528 }
 529
 530 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 531    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 532    current locale. */
 533 char *
 534 utf8_to_filename (const char *filename)
 535 {
 536   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 537 }
 538
 539 /* Returns FILENAME converted from the filename encoding to UTF-8.
 540    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 541    current locale. */
 542 char *
 543 filename_to_utf8 (const char *filename)
 544 {
 545   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 546 }
 547
 548 static int
 549 recode_substring_pool__ (const char *to, const char *from,
 550                          struct substring text, char fallbackchar,
 551                          struct pool *pool, struct substring *out)
 552 {
 553   size_t bufsize;
 554   struct converter *conv;
 555
 556   if (to == NULL)
 557     to = default_encoding;
 558
 559   if (from == NULL)
 560     from = default_encoding;
 561
 562   conv = create_iconv (to, from);
 563
 564   if (NULL == conv)
 565     {
 566       if (fallbackchar)
 567         {
 568           out->string = pool_malloc (pool, text.length + 1);
 569           out->length = text.length;
 570           memcpy (out->string, text.string, text.length);
 571           out->string[out->length] = '\0';
 572           return 0;
 573         }
 574       else
 575         return EPROTO;
 576     }
 577
 578   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 579     {
 580       char *output = pool_malloc (pool, bufsize);
 581       ssize_t retval;
 582
 583       retval = try_recode (conv, fallbackchar, text.string, text.length,
 584                            output, bufsize);
 585       if (retval >= 0)
 586         {
 587           *out = ss_buffer (output, retval);
 588           return 0;
 589         }
 590       pool_free (pool, output);
 591
 592       if (retval != -E2BIG)
 593         return -retval;
 594     }
 595
 596   NOT_REACHED ();
 597 }
 598
 599 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 600    dynamically allocated string in TO-encoding.  Any characters which cannot be
 601    converted will be represented by '?'.
 602
 603    The returned string will be null-terminated and allocated on POOL with
 604    pool_malloc().
 605
 606    This function's behaviour differs from that of g_convert_with_fallback
 607    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 608    the input string is not valid in the declared input encoding.  This function
 609    however perseveres even in the presence of badly encoded input. */
 610 struct substring
 611 recode_substring_pool (const char *to, const char *from,
 612                        struct substring text, struct pool *pool)
 613 {
 614   struct substring out;
 615
 616   recode_substring_pool__ (to, from, text, '?', pool, &out);
 617   return out;
 618 }
 619
 620 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 621    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 622    converted null-terminated string, allocated from POOL with pool_malloc(), is
 623    stored in *OUT.  On failure, returns a positive errno value.
 624
 625    The function fails with an error if any part of the input string is not
 626    valid in the declared input encoding. */
 627 int
 628 recode_pedantically (const char *to, const char *from,
 629                      struct substring text, struct pool *pool,
 630                      struct substring *out)
 631 {
 632   int error;
 633
 634   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 635   if (error)
 636     *out = ss_empty ();
 637   return error;
 638 }
 639 \f
 640 void
 641 i18n_init (void)
 642 {
 643   setlocale (LC_ALL, "");
 644   bindtextdomain (PACKAGE, relocate(locale_dir));
 645   textdomain (PACKAGE);
 646
 647   assert (default_encoding == NULL);
 648   default_encoding = xstrdup (locale_charset ());
 649
 650   hmapx_init (&map);
 651 }
 652
 653 const char *
 654 get_default_encoding (void)
 655 {
 656   return default_encoding;
 657 }
 658
 659 void
 660 set_default_encoding (const char *enc)
 661 {
 662   free (default_encoding);
 663   default_encoding = xstrdup (enc);
 664 }
 665
 666 /* Return the ISO two letter code for the current LC_MESSAGES
 667    locale category.  */
 668 char *
 669 get_language (void)
 670 {
 671   const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
 672   if (0 == strcmp (localename, "C"))
 673     return NULL;
 674   char *ln = xstrdup (localename);
 675   char *end = strchr (ln, '_');
 676   if (end)
 677     *end = '\0';
 678   return ln;
 679 }
 680
 681
 682 /* Attempts to set the encoding from a locale name
 683    returns true if successful.
 684    This function does not (should not!) alter the current locale.
 685 */
 686 bool
 687 set_encoding_from_locale (const char *loc)
 688 {
 689   bool ok = true;
 690   char *c_encoding;
 691   char *loc_encoding;
 692   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 693
 694   setlocale (LC_CTYPE, "C");
 695   c_encoding = xstrdup (locale_charset ());
 696
 697   setlocale (LC_CTYPE, loc);
 698   loc_encoding = xstrdup (locale_charset ());
 699
 700
 701   if (0 == strcmp (loc_encoding, c_encoding))
 702     {
 703       ok = false;
 704     }
 705
 706   setlocale (LC_CTYPE, tmp);
 707
 708   free (tmp);
 709
 710   if (ok)
 711     {
 712       free (default_encoding);
 713       default_encoding = loc_encoding;
 714     }
 715   else
 716     free (loc_encoding);
 717
 718   free (c_encoding);
 719
 720   return ok;
 721 }
 722
 723 void
 724 i18n_done (void)
 725 {
 726   struct hmapx_node *node;
 727   struct converter *cvtr;
 728
 729   HMAPX_FOR_EACH (cvtr, node, &map)
 730     {
 731       if (cvtr == NULL)
 732         continue;
 733       free (cvtr->tocode);
 734       free (cvtr->fromcode);
 735       if (cvtr->conv != (iconv_t) -1)
 736         iconv_close (cvtr->conv);
 737       free (cvtr);
 738     }
 739
 740   hmapx_destroy (&map);
 741
 742   free (default_encoding);
 743   default_encoding = NULL;
 744 }
 745
 746
 747
 748 bool
 749 valid_encoding (const char *enc)
 750 {
 751   iconv_t conv = iconv_open (UTF8, enc);
 752
 753   if (conv == (iconv_t) -1)
 754     return false;
 755
 756   iconv_close (conv);
 757
 758   return true;
 759 }
 760
 761
 762 /* Return the system local's idea of the
 763    decimal separator character */
 764 char
 765 get_system_decimal (void)
 766 {
 767   char radix_char;
 768
 769 #if HAVE_NL_LANGINFO
 770   radix_char = nl_langinfo (RADIXCHAR)[0];
 771 #else
 772   {
 773     char buf[10];
 774     snprintf (buf, sizeof buf, "%f", 2.5);
 775     radix_char = buf[1];
 776   }
 777 #endif
 778
 779   return radix_char;
 780 }
 781
 782 const char *
 783 uc_name (ucs4_t uc, char buffer[16])
 784 {
 785   if (uc >= 0x20 && uc < 0x7f)
 786     snprintf (buffer, 16, "`%c'", uc);
 787   else
 788     snprintf (buffer, 16, "U+%04X", uc);
 789   return buffer;
 790 }
 791 \f
 792 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 793
 794 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 795    with lowercase and uppercase letters treated as equal, starting from
 796    BASIS. */
 797 unsigned int
 798 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 799 {
 800   uint8_t folded_buf[2048];
 801   size_t folded_len = sizeof folded_buf;
 802   uint8_t *folded_s;
 803   unsigned int hash;
 804
 805   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 806                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 807   if (folded_s != NULL)
 808     {
 809       hash = hash_bytes (folded_s, folded_len, basis);
 810       if (folded_s != folded_buf)
 811         free (folded_s);
 812     }
 813   else
 814     {
 815       if (errno == ENOMEM)
 816         xalloc_die ();
 817       hash = hash_bytes (s, n, basis);
 818     }
 819
 820   return hash;
 821 }
 822
 823 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 824    uppercase letters treated as equal, starting from BASIS. */
 825 unsigned int
 826 utf8_hash_case_string (const char *s, unsigned int basis)
 827 {
 828   return utf8_hash_case_bytes (s, strlen (s), basis);
 829 }
 830
 831 /* Compares UTF-8 strings A and B case-insensitively.
 832    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 833 int
 834 utf8_strcasecmp (const char *a, const char *b)
 835 {
 836   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 837 }
 838
 839 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 840    case-insensitively.
 841    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 842 int
 843 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 844 {
 845   int result;
 846
 847   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 848                   CHAR_CAST (const uint8_t *, b), bn,
 849                   NULL, UNINORM_NFKD, &result))
 850     {
 851       if (errno == ENOMEM)
 852         xalloc_die ();
 853
 854       result = memcmp (a, b, MIN (an, bn));
 855       if (result == 0)
 856         result = an < bn ? -1 : an > bn;
 857     }
 858
 859   return result;
 860 }
 861
 862 static bool
 863 is_all_digits (const uint8_t *s, size_t len)
 864 {
 865   for (size_t i = 0; i < len; i++)
 866     if (!c_isdigit (s[i]))
 867       return false;
 868   return true;
 869 }
 870
 871 /* Compares UTF-8 strings A and B case-insensitively.  If the strings end in a
 872    number, then they are compared numerically.  Returns a negative value if A <
 873    B, zero if A == B, positive if A > B. */
 874 int
 875 utf8_strverscasecmp (const char *a, const char *b)
 876 {
 877   /* Normalize A. */
 878   uint8_t a_stub[64];
 879   size_t a_len = sizeof a_stub;
 880   uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
 881                                  UNINORM_NFKD, a_stub, &a_len);
 882
 883   /* Normalize B. */
 884   uint8_t b_stub[64];
 885   size_t b_len = sizeof b_stub;
 886   uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
 887                                  UNINORM_NFKD, b_stub, &b_len);
 888
 889   int result;
 890   if (!a_norm || !b_norm)
 891     {
 892       result = strcmp (a, b);
 893       goto exit;
 894     }
 895
 896   size_t len = MIN (a_len, b_len);
 897   for (size_t i = 0; i < len; i++)
 898     if (a_norm[i] != b_norm[i])
 899       {
 900         /* If both strings end in digits, compare them numerically. */
 901         if (is_all_digits (&a_norm[i], a_len - i)
 902             && is_all_digits (&b_norm[i], b_len - i))
 903           {
 904             /* Start by stripping leading zeros, since those don't matter for
 905                numerical comparison. */
 906             size_t ap, bp;
 907             for (ap = i; ap < a_len; ap++)
 908               if (a_norm[ap] != '0')
 909                 break;
 910             for (bp = i; bp < b_len; bp++)
 911               if (b_norm[bp] != '0')
 912                 break;
 913
 914             /* The number with more digits, if there is one, is larger. */
 915             size_t a_digits = a_len - ap;
 916             size_t b_digits = b_len - bp;
 917             if (a_digits != b_digits)
 918               result = a_digits > b_digits ? 1 : -1;
 919             else
 920               result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
 921           }
 922         else
 923           result = a_norm[i] > b_norm[i] ? 1 : -1;
 924         goto exit;
 925       }
 926   result = a_len < b_len ? -1 : a_len > b_len;
 927
 928 exit:
 929   if (a_norm != a_stub)
 930     free (a_norm);
 931   if (b_norm != b_stub)
 932     free (b_norm);
 933   return result;
 934 }
 935
 936 static char *
 937 utf8_casemap (const char *s,
 938               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 939                              uint8_t *, size_t *))
 940 {
 941   char *result;
 942   size_t size;
 943
 944   result = CHAR_CAST (char *,
 945                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 946                          NULL, NULL, NULL, &size));
 947   if (result == NULL)
 948     {
 949       if (errno == ENOMEM)
 950         xalloc_die ();
 951
 952       result = xstrdup (s);
 953     }
 954   return result;
 955 }
 956
 957 char *
 958 utf8_to_upper (const char *s)
 959 {
 960   return utf8_casemap (s, u8_toupper);
 961 }
 962
 963 char *
 964 utf8_to_lower (const char *s)
 965 {
 966   return utf8_casemap (s, u8_tolower);
 967 }
 968
 969 char *
 970 utf8_to_title (const char *s)
 971 {
 972   return utf8_casemap (s, u8_totitle);
 973 }
 974 \f
 975 bool
 976 get_encoding_info (struct encoding_info *e, const char *name)
 977 {
 978   const struct substring in = SS_LITERAL_INITIALIZER (
 979                                                       "\t\n\v\f\r "
 980                                                       "!\"#$%&'()*+,-./0123456789:;<=>?@"
 981                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 982                                                       "abcdefghijklmnopqrstuvwxyz{|}~");
 983
 984   struct substring out, cr, lf, space;
 985   bool ok;
 986
 987   memset (e, 0, sizeof *e);
 988
 989   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 990   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 991   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 992   ok = (cr.length >= 1
 993         && cr.length <= MAX_UNIT
 994         && cr.length == lf.length
 995         && cr.length == space.length);
 996   if (!ok)
 997     {
 998       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 999       ss_dealloc (&cr);
1000       ss_dealloc (&lf);
1001       ss_dealloc (&space);
1002       ss_alloc_substring (&cr, ss_cstr ("\r"));
1003       ss_alloc_substring (&lf, ss_cstr ("\n"));
1004       ss_alloc_substring (&space, ss_cstr (" "));
1005     }
1006
1007   e->unit = cr.length;
1008   memcpy (e->cr, cr.string, e->unit);
1009   memcpy (e->lf, lf.string, e->unit);
1010   memcpy (e->space, space.string, e->unit);
1011
1012   ss_dealloc (&cr);
1013   ss_dealloc (&lf);
1014   ss_dealloc (&space);
1015
1016   out = recode_substring_pool ("UTF-8", name, in, NULL);
1017   e->is_ascii_compatible = ss_equals (in, out);
1018   ss_dealloc (&out);
1019
1020   if (!e->is_ascii_compatible && e->unit == 1)
1021     {
1022       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1023       e->is_ebcdic_compatible = (out.length == 1
1024                                  && (uint8_t) out.string[0] == 0xc1);
1025       ss_dealloc (&out);
1026     }
1027   else
1028     e->is_ebcdic_compatible = false;
1029
1030   return ok;
1031 }
1032
1033 bool
1034 is_encoding_ascii_compatible (const char *encoding)
1035 {
1036   struct encoding_info e;
1037
1038   get_encoding_info (&e, encoding);
1039   return e.is_ascii_compatible;
1040 }
1041
1042 bool
1043 is_encoding_ebcdic_compatible (const char *encoding)
1044 {
1045   struct encoding_info e;
1046
1047   get_encoding_info (&e, encoding);
1048   return e.is_ebcdic_compatible;
1049 }
1050
/* Returns true if iconv can convert ENCODING to and from UTF-8,
   otherwise false.

   Both directions are checked with create_iconv(); the second direction is
   tried only if the first one succeeds. */
bool
is_encoding_supported (const char *encoding)
{
  if (!create_iconv ("UTF-8", encoding))
    return false;

  return create_iconv (encoding, "UTF-8") ? true : false;
}
1059
/* Returns true if E is the name of a UTF-8 encoding, that is, if E is
   "UTF-8" or "UTF8" with the letters in any mix of upper and lower case.

   XXX Possibly we should test not E as a string but its properties via
   iconv. */
bool
is_encoding_utf8 (const char *e)
{
  static const char lower[] = "utf";
  static const char upper[] = "UTF";

  /* Match the three-letter prefix without regard to case.  A mismatch,
     including an early null terminator, stops before any later byte is
     read. */
  for (int i = 0; i < 3; i++)
    if (e[i] != lower[i] && e[i] != upper[i])
      return false;

  /* Allow one optional hyphen, then require exactly "8". */
  const char *rest = e + 3;
  if (*rest == '-')
    rest++;
  return rest[0] == '8' && rest[1] == '\0';
}
1073 \f
/* Array of encoding categories, grown and filled in by add_category() and
   handed out by get_encoding_categories(). */
static struct encoding_category *categories;

/* Number of elements of 'categories' that are in use. */
static int n_categories;
1076
1077 static void SENTINEL (0)
1078   add_category (size_t *allocated_categories, const char *category, ...)
1079 {
1080   struct encoding_category *c;
1081   const char *encodings[16];
1082   va_list args;
1083   int i, n;
1084
1085   /* Count encoding arguments. */
1086   va_start (args, category);
1087   n = 0;
1088   while ((encodings[n] = va_arg (args, const char *)) != NULL)
1089     {
1090       const char *encoding = encodings[n];
1091       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1092         n++;
1093     }
1094   assert (n < sizeof encodings / sizeof *encodings);
1095   va_end (args);
1096
1097   if (n == 0)
1098     return;
1099
1100   if (n_categories >= *allocated_categories)
1101     categories = x2nrealloc (categories,
1102                              allocated_categories, sizeof *categories);
1103
1104   c = &categories[n_categories++];
1105   c->category = category;
1106   c->encodings = xmalloc (n * sizeof *c->encodings);
1107   for (i = 0; i < n; i++)
1108     c->encodings[i] = encodings[i];
1109   c->n_encodings = n;
1110 }
1111
1112 static void
1113 init_encoding_categories (void)
1114 {
1115   static bool inited;
1116   size_t alloc;
1117
1118   if (inited)
1119     return;
1120   inited = true;
1121
1122   alloc = 0;
1123   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1124                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1125   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1126                 NULL_SENTINEL);
1127   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1128   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1129                 "Windows-1257", NULL_SENTINEL);
1130   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1131   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1132                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1133   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1134                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1135   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1136                 "EUC-TW", NULL_SENTINEL);
1137   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1138   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1139                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1140   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1141   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1142                 NULL_SENTINEL);
1143   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1144   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1145   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1146   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1147   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1148                 NULL_SENTINEL);
1149   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1150   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1151   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1152   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1153                 NULL_SENTINEL);
1154   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1155                 NULL_SENTINEL);
1156   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1157   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1158                 NULL_SENTINEL);
1159   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1160   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1161                 NULL_SENTINEL);
1162   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1163                 NULL_SENTINEL);
1164   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1165                 "Windows-1258", NULL_SENTINEL);
1166   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1167                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1168 }
1169
1170 /* Returns an array of "struct encoding_category" that contains only the
1171    categories and encodings that the system supports. */
1172 struct encoding_category *
1173 get_encoding_categories (void)
1174 {
1175   init_encoding_categories ();
1176   return categories;
1177 }
1178
1179 /* Returns the number of elements in the array returned by
1180    get_encoding_categories().  */
1181 size_t
1182 get_n_encoding_categories (void)
1183 {
1184   init_encoding_categories ();
1185   return n_categories;
1186 }