pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <locale.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <unicase.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/compiler.h"
  34 #include "libpspp/hmapx.h"
  35 #include "libpspp/hash-functions.h"
  36 #include "libpspp/pool.h"
  37 #include "libpspp/str.h"
  38 #include "libpspp/version.h"
  39
  40 #include "gl/c-ctype.h"
  41 #include "gl/c-strcase.h"
  42 #include "gl/localcharset.h"
  43 #include <gl/localename.h>
  44 #include "gl/minmax.h"
  45 #include "gl/xalloc.h"
  46 #include "gl/relocatable.h"
  47 #include "gl/xstrndup.h"
  48
  49 #include "gettext.h"
  50 #define _(msgid) gettext (msgid)
  51
  52 struct converter
  53 {
  54   char *tocode;
  55   char *fromcode;
  56   iconv_t conv;
  57   int null_char_width;
  58 };
  59
  60 static char *default_encoding;
  61 static struct hmapx map;
  62
  63 /* A wrapper around iconv_open */
  64 static struct converter *
  65 create_iconv (const char* tocode, const char* fromcode)
  66 {
  67   size_t hash;
  68   struct hmapx_node *node;
  69   struct converter *converter;
  70   assert (fromcode);
  71
  72   hash = hash_string (tocode, hash_string (fromcode, 0));
  73   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  74     {
  75       if (!converter)
  76         return NULL;
  77
  78       if (!strcmp (tocode, converter->tocode)
  79           && !strcmp (fromcode, converter->fromcode))
  80         return converter;
  81     }
  82
  83   converter = xmalloc (sizeof *converter);
  84   converter->tocode = xstrdup (tocode);
  85   converter->fromcode = xstrdup (fromcode);
  86   converter->conv = iconv_open (tocode, fromcode);
  87   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
  88   /* I don't think it's safe to translate this string or to use messaging
  89      as the converters have not yet been set up */
  90   if (error && strcmp (tocode, fromcode))
  91     {
  92       fprintf (stderr,
  93                "Warning: "
  94                "cannot create a converter for `%s' to `%s': %s\n",
  95                fromcode, tocode, strerror (error));
  96
  97       free (converter->tocode);
  98       free (converter->fromcode);
  99       free (converter);
 100
 101       hmapx_insert (&map, NULL, hash);
 102       return NULL;
 103     }
 104
 105   /* Find out how many bytes there are in a null char in the target
 106      encoding */
 107   iconv_t bconv = iconv_open (tocode, "ASCII");
 108   if (bconv != (iconv_t) -1)
 109     {
 110       ICONV_CONST char inbuf[1] = "";
 111       ICONV_CONST char *inptr = inbuf;
 112       size_t inbytes = sizeof inbuf;
 113
 114       char outbuf[8];
 115       char *outptr = outbuf;
 116       size_t outbytes = sizeof outbuf;
 117       if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
 118         converter->null_char_width = outptr - outbuf;
 119       iconv_close (bconv);
 120     }
 121
 122   hmapx_insert (&map, converter, hash);
 123
 124   return converter;
 125 }
 126
 127
 128 /* Converts the single byte C from encoding FROM to TO, returning the first
 129    byte of the result.
 130
 131    This function probably shouldn't be used at all, but some code still does
 132    use it. */
 133 char
 134 recode_byte (const char *to, const char *from, char c)
 135 {
 136   char x;
 137   char *s = recode_string (to, from, &c, 1);
 138   x = s[0];
 139   free (s);
 140   return x;
 141 }
 142
 143 /* Similar to recode_string_pool, but allocates the returned value on the heap
 144    instead of in a pool.  It is the caller's responsibility to free the
 145    returned value. */
 146 char *
 147 recode_string (const char *to, const char *from,
 148                const char *text, int length)
 149 {
 150   return recode_string_pool (to, from, text, length, NULL);
 151 }
 152
 153 /* Returns the length, in bytes, of the string that a similar recode_string()
 154    call would return. */
 155 size_t
 156 recode_string_len (const char *to, const char *from,
 157                    const char *text, int length)
 158 {
 159   char *s = recode_string (to, from, text, length);
 160   size_t len = strlen (s);
 161   free (s);
 162   return len;
 163 }
 164
 165 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 166    at OP, and appends a null terminator to the output.
 167
 168    Returns the output length if successful, -1 if the output buffer is too
 169    small. */
 170 static ssize_t
 171 try_recode (struct converter *cvtr, char fallbackchar,
 172             const char *in, size_t inbytes,
 173             char *out_, size_t outbytes)
 174 {
 175   char *out = out_;
 176   int i, j;
 177
 178   int null_bytes = cvtr->null_char_width;
 179
 180   /* Put the converter into the initial shift state, in case there was any
 181      state information left over from its last usage. */
 182   iconv (cvtr->conv, NULL, 0, NULL, 0);
 183
 184   /* Do two rounds of iconv() calls:
 185
 186      - The first round does the bulk of the conversion using the
 187      caller-supplied input data..
 188
 189      - The second round flushes any leftover output.  This has a real effect
 190      with input encodings that use combining diacritics, e.g. without the
 191      second round the last character tends to gets dropped when converting
 192      from windows-1258 to other encodings.
 193   */
 194   for (i = 0; i < 2; i++)
 195     {
 196       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 197       size_t *inbytesp = i ? NULL : &inbytes;
 198
 199       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 200         switch (errno)
 201           {
 202           case EINVAL:
 203             if (outbytes < null_bytes + 1)
 204               return -E2BIG;
 205             if (!fallbackchar)
 206               return -EINVAL;
 207             *out++ = fallbackchar;
 208             for (j = 0 ; j < null_bytes ; ++j)
 209               *out++ = '\0';
 210             return out - 1 - out_;
 211
 212           case EILSEQ:
 213             if (outbytes == 0)
 214               return -E2BIG;
 215             if (!fallbackchar)
 216               return -EILSEQ;
 217             *out++ = fallbackchar;
 218             outbytes--;
 219             if (inp)
 220               {
 221                 in++;
 222                 inbytes--;
 223               }
 224             break;
 225
 226           case E2BIG:
 227             return -E2BIG;
 228
 229           default:
 230             /* should never happen */
 231             fprintf (stderr, "Character conversion error: %s\n",
 232                      strerror (errno));
 233             NOT_REACHED ();
 234             break;
 235           }
 236     }
 237
 238   if (outbytes <= null_bytes - 1)
 239     return -E2BIG;
 240
 241   for (i = 0 ; i < null_bytes ; ++i)
 242     *out++ = '\0';
 243
 244   return out - 1 - out_;
 245 }
 246
 247 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 248    dynamically allocated string in TO-encoding.  Any characters which cannot be
 249    converted will be represented by '?'.
 250
 251    LENGTH should be the length of the string or -1, if null terminated.
 252
 253    The returned string will be allocated on POOL.
 254
 255    This function's behaviour differs from that of g_convert_with_fallback
 256    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 257    the input string is not valid in the declared input encoding.  This function
 258    however perseveres even in the presence of badly encoded input. */
 259 char *
 260 recode_string_pool (const char *to, const char *from,
 261                     const char *text, int length, struct pool *pool)
 262 {
 263   struct substring out;
 264
 265   if (text == NULL)
 266     return NULL;
 267
 268   if (length == -1)
 269     length = strlen (text);
 270
 271   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 272   return out.string;
 273 }
 274
 275 /* Returns the name of the encoding that should be used for file names.
 276
 277    This is meant to be the same encoding used by g_filename_from_uri() and
 278    g_filename_to_uri() in GLib. */
 279 static const char *
 280 filename_encoding (void)
 281 {
 282 #if defined _WIN32 || defined __WIN32__
 283   return "UTF-8";
 284 #else
 285   return locale_charset ();
 286 #endif
 287 }
 288
 289 static char *
 290 xconcat2 (const char *a, size_t a_len,
 291           const char *b, size_t b_len)
 292 {
 293   char *s = xmalloc (a_len + b_len + 1);
 294   memcpy (s, a, a_len);
 295   memcpy (s + a_len, b, b_len);
 296   s[a_len + b_len] = '\0';
 297   return s;
 298 }
 299
 300 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 301    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 302    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 303    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 304    HEAD and tries again, repeating as necessary until the concatenated result
 305    fits or until HEAD_LEN reaches 0.
 306
 307    [*] Actually this function drops grapheme clusters instead of characters, so
 308    that, e.g. a Unicode character followed by a combining accent character
 309    is either completely included or completely excluded from HEAD_LEN.  See
 310    UAX #29 at http://unicode.org/reports/tr29/ for more information on
 311    grapheme clusters.
 312
 313    A null ENCODING is treated as UTF-8.
 314
 315    Sometimes this function has to actually construct the concatenated string to
 316    measure its length.  When this happens, it sets *RESULTP to that
 317    null-terminated string, allocated with malloc(), for the caller to use if it
 318    needs it.  Otherwise, it sets *RESULTP to NULL.
 319
 320    Simple examples for encoding="UTF-8", max_len=6:
 321
 322    head="abc",  tail="xyz"     => 3
 323    head="abcd", tail="xyz"     => 3 ("d" dropped).
 324    head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 325    head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 326
 327    Examples for encoding="ISO-8859-1", max_len=6:
 328
 329    head="éèä",  tail="xyz"     => 6
 330    (each letter in head is only 1 byte in ISO-8859-1 even though they
 331    each take 2 bytes in UTF-8 encoding)
 332 */
 333 static size_t
 334 utf8_encoding_concat__ (const char *head, size_t head_len,
 335                         const char *tail, size_t tail_len,
 336                         const char *encoding, size_t max_len,
 337                         char **resultp)
 338 {
 339   *resultp = NULL;
 340   if (head_len == 0)
 341     return 0;
 342   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 343     {
 344       if (head_len + tail_len <= max_len)
 345         return head_len;
 346       else if (tail_len >= max_len)
 347         return 0;
 348       else
 349         {
 350           size_t copy_len;
 351           ucs4_t prev;
 352           size_t ofs;
 353           int mblen;
 354
 355           copy_len = 0;
 356           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 357                                 head_len);
 358                ofs <= max_len - tail_len;
 359                ofs += mblen)
 360             {
 361               ucs4_t next;
 362
 363               mblen = u8_mbtouc (&next,
 364                                  CHAR_CAST (const uint8_t *, head + ofs),
 365                                  head_len - ofs);
 366               if (uc_is_grapheme_break (prev, next))
 367                 copy_len = ofs;
 368
 369               prev = next;
 370             }
 371
 372           return copy_len;
 373         }
 374     }
 375   else
 376     {
 377       char *result;
 378
 379       result = (tail_len > 0
 380                 ? xconcat2 (head, head_len, tail, tail_len)
 381                 : CONST_CAST (char *, head));
 382       if (recode_string_len (encoding, "UTF-8", result,
 383                              head_len + tail_len) <= max_len)
 384         {
 385           *resultp = result != head ? result : NULL;
 386           return head_len;
 387         }
 388       else
 389         {
 390           bool correct_result = false;
 391           size_t copy_len;
 392           ucs4_t prev;
 393           size_t ofs;
 394           int mblen;
 395
 396           copy_len = 0;
 397           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 398                                 head_len);
 399                ofs <= head_len;
 400                ofs += mblen)
 401             {
 402               ucs4_t next;
 403
 404               mblen = u8_mbtouc (&next,
 405                                  CHAR_CAST (const uint8_t *, head + ofs),
 406                                  head_len - ofs);
 407               if (uc_is_grapheme_break (prev, next))
 408                 {
 409                   if (result != head)
 410                     {
 411                       memcpy (result, head, ofs);
 412                       memcpy (result + ofs, tail, tail_len);
 413                       result[ofs + tail_len] = '\0';
 414                     }
 415
 416                   if (recode_string_len (encoding, "UTF-8", result,
 417                                          ofs + tail_len) <= max_len)
 418                     {
 419                       correct_result = true;
 420                       copy_len = ofs;
 421                     }
 422                   else
 423                     correct_result = false;
 424                 }
 425
 426               prev = next;
 427             }
 428
 429           if (result != head)
 430             {
 431               if (correct_result)
 432                 *resultp = result;
 433               else
 434                 free (result);
 435             }
 436
 437           return copy_len;
 438         }
 439     }
 440 }
 441
 442 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 443    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 444    string are all encoded in UTF-8.  As many characters[*] from the beginning
 445    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 446    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 447    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 448
 449    [*] Actually this function drops grapheme clusters instead of characters, so
 450    that, e.g. a Unicode character followed by a combining accent character
 451    is either completely included or completely excluded from the returned
 452    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 453    information on grapheme clusters.
 454
 455    A null ENCODING is treated as UTF-8.
 456
 457    Simple examples for encoding="UTF-8", max_len=6:
 458
 459    head="abc",  tail="xyz"     => "abcxyz"
 460    head="abcd", tail="xyz"     => "abcxyz"
 461    head="abc",  tail="uvwxyz"  => "uvwxyz"
 462    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 463
 464    Examples for encoding="ISO-8859-1", max_len=6:
 465
 466    head="éèä",  tail="xyz"    => "éèäxyz"
 467    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 468    each take 2 bytes in UTF-8 encoding)
 469 */
 470 char *
 471 utf8_encoding_concat (const char *head, const char *tail,
 472                       const char *encoding, size_t max_len)
 473 {
 474   size_t tail_len = strlen (tail);
 475   size_t prefix_len;
 476   char *result;
 477
 478   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 479                                        encoding, max_len, &result);
 480   return (result != NULL
 481           ? result
 482           : xconcat2 (head, prefix_len, tail, tail_len));
 483 }
 484
 485 /* Returns the length, in bytes, of the string that would be returned by
 486    utf8_encoding_concat() if passed the same arguments, but the implementation
 487    is often more efficient. */
 488 size_t
 489 utf8_encoding_concat_len (const char *head, const char *tail,
 490                           const char *encoding, size_t max_len)
 491 {
 492   size_t tail_len = strlen (tail);
 493   size_t prefix_len;
 494   char *result;
 495
 496   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 497                                        encoding, max_len, &result);
 498   free (result);
 499   return prefix_len + tail_len;
 500 }
 501
 502 /* Returns an allocated, null-terminated string, owned by the caller,
 503    containing as many characters[*] from the beginning of S that would fit
 504    within MAX_LEN bytes if the returned string were to be re-encoded in
 505    ENCODING.  Both S and the returned string are encoded in UTF-8.
 506
 507    [*] Actually this function drops grapheme clusters instead of characters, so
 508    that, e.g. a Unicode character followed by a combining accent character
 509    is either completely included or completely excluded from the returned
 510    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 511    information on grapheme clusters.
 512
 513    A null ENCODING is treated as UTF-8.
 514 */
 515 char *
 516 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 517 {
 518   return utf8_encoding_concat (s, "", encoding, max_len);
 519 }
 520
 521 /* Returns the length, in bytes, of the string that would be returned by
 522    utf8_encoding_trunc() if passed the same arguments, but the implementation
 523    is often more efficient. */
 524 size_t
 525 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 526 {
 527   return utf8_encoding_concat_len (s, "", encoding, max_len);
 528 }
 529
 530 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 531    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 532    current locale. */
 533 char *
 534 utf8_to_filename (const char *filename)
 535 {
 536   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 537 }
 538
 539 /* Returns FILENAME converted from the filename encoding to UTF-8.
 540    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 541    current locale. */
 542 char *
 543 filename_to_utf8 (const char *filename)
 544 {
 545   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 546 }
 547
 548 static int
 549 recode_substring_pool__ (const char *to, const char *from,
 550                          struct substring text, char fallbackchar,
 551                          struct pool *pool, struct substring *out)
 552 {
 553   size_t bufsize;
 554   struct converter *conv;
 555
 556   if (to == NULL)
 557     to = default_encoding;
 558
 559   if (from == NULL)
 560     from = default_encoding;
 561
 562   conv = create_iconv (to, from);
 563
 564   if (NULL == conv)
 565     {
 566       if (fallbackchar)
 567         {
 568           out->string = pool_malloc (pool, text.length + 1);
 569           out->length = text.length;
 570           memcpy (out->string, text.string, text.length);
 571           out->string[out->length] = '\0';
 572           return 0;
 573         }
 574       else
 575         return EPROTO;
 576     }
 577
 578   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 579     {
 580       char *output = pool_malloc (pool, bufsize);
 581       ssize_t retval;
 582
 583       retval = try_recode (conv, fallbackchar, text.string, text.length,
 584                            output, bufsize);
 585       if (retval >= 0)
 586         {
 587           *out = ss_buffer (output, retval);
 588           return 0;
 589         }
 590       pool_free (pool, output);
 591
 592       if (retval != -E2BIG)
 593         return -retval;
 594     }
 595
 596   NOT_REACHED ();
 597 }
 598
 599 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 600    dynamically allocated string in TO-encoding.  Any characters which cannot be
 601    converted will be represented by '?'.
 602
 603    The returned string will be null-terminated and allocated on POOL with
 604    pool_malloc().
 605
 606    This function's behaviour differs from that of g_convert_with_fallback
 607    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 608    the input string is not valid in the declared input encoding.  This function
 609    however perseveres even in the presence of badly encoded input. */
 610 struct substring
 611 recode_substring_pool (const char *to, const char *from,
 612                        struct substring text, struct pool *pool)
 613 {
 614   struct substring out;
 615
 616   recode_substring_pool__ (to, from, text, '?', pool, &out);
 617   return out;
 618 }
 619
 620 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 621    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 622    converted null-terminated string, allocated from POOL with pool_malloc(), is
 623    stored in *OUT.  On failure, returns a positive errno value.
 624
 625    The function fails with an error if any part of the input string is not
 626    valid in the declared input encoding. */
 627 int
 628 recode_pedantically (const char *to, const char *from,
 629                      struct substring text, struct pool *pool,
 630                      struct substring *out)
 631 {
 632   int error;
 633
 634   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 635   if (error)
 636     *out = ss_empty ();
 637   return error;
 638 }
 639 \f
 640 void
 641 i18n_init (void)
 642 {
 643   setlocale (LC_ALL, "");
 644   char *allocated;
 645   bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
 646   free (allocated);
 647   textdomain (PACKAGE);
 648
 649   assert (default_encoding == NULL);
 650   default_encoding = xstrdup (locale_charset ());
 651
 652   hmapx_init (&map);
 653 }
 654
 655 const char *
 656 get_default_encoding (void)
 657 {
 658   return default_encoding;
 659 }
 660
 661 void
 662 set_default_encoding (const char *enc)
 663 {
 664   free (default_encoding);
 665   default_encoding = xstrdup (enc);
 666 }
 667
 668 /* Return the ISO two letter code for the current LC_MESSAGES
 669    locale category.  */
 670 char *
 671 get_language (void)
 672 {
 673   const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
 674   if (0 == strcmp (localename, "C"))
 675     return NULL;
 676   char *ln = xstrdup (localename);
 677   char *end = strchr (ln, '_');
 678   if (end)
 679     *end = '\0';
 680   return ln;
 681 }
 682
 683
 684 /* Attempts to set the encoding from a locale name
 685    returns true if successful.
 686    This function does not (should not!) alter the current locale.
 687 */
 688 bool
 689 set_encoding_from_locale (const char *loc)
 690 {
 691   bool ok = true;
 692   char *c_encoding;
 693   char *loc_encoding;
 694   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 695
 696   setlocale (LC_CTYPE, "C");
 697   c_encoding = xstrdup (locale_charset ());
 698
 699   setlocale (LC_CTYPE, loc);
 700   loc_encoding = xstrdup (locale_charset ());
 701
 702
 703   if (0 == strcmp (loc_encoding, c_encoding))
 704     {
 705       ok = false;
 706     }
 707
 708   setlocale (LC_CTYPE, tmp);
 709
 710   free (tmp);
 711
 712   if (ok)
 713     {
 714       free (default_encoding);
 715       default_encoding = loc_encoding;
 716     }
 717   else
 718     free (loc_encoding);
 719
 720   free (c_encoding);
 721
 722   return ok;
 723 }
 724
 725 void
 726 i18n_done (void)
 727 {
 728   struct hmapx_node *node;
 729   struct converter *cvtr;
 730
 731   HMAPX_FOR_EACH (cvtr, node, &map)
 732     {
 733       if (cvtr == NULL)
 734         continue;
 735       free (cvtr->tocode);
 736       free (cvtr->fromcode);
 737       if (cvtr->conv != (iconv_t) -1)
 738         iconv_close (cvtr->conv);
 739       free (cvtr);
 740     }
 741
 742   hmapx_destroy (&map);
 743
 744   free (default_encoding);
 745   default_encoding = NULL;
 746 }
 747
 748
 749
 750 bool
 751 valid_encoding (const char *enc)
 752 {
 753   iconv_t conv = iconv_open (UTF8, enc);
 754
 755   if (conv == (iconv_t) -1)
 756     return false;
 757
 758   iconv_close (conv);
 759
 760   return true;
 761 }
 762
 763
 764 /* Return the system local's idea of the
 765    decimal separator character */
 766 char
 767 get_system_decimal (void)
 768 {
 769   char radix_char;
 770
 771 #if HAVE_NL_LANGINFO
 772   radix_char = nl_langinfo (RADIXCHAR)[0];
 773 #else
 774   {
 775     char buf[10];
 776     snprintf (buf, sizeof buf, "%f", 2.5);
 777     radix_char = buf[1];
 778   }
 779 #endif
 780
 781   return radix_char;
 782 }
 783
 784 const char *
 785 uc_name (ucs4_t uc, char buffer[16])
 786 {
 787   if (uc >= 0x20 && uc < 0x7f)
 788     snprintf (buffer, 16, "`%c'", uc);
 789   else
 790     snprintf (buffer, 16, "U+%04X", uc);
 791   return buffer;
 792 }
 793 \f
 794 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 795
 796 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 797    with lowercase and uppercase letters treated as equal, starting from
 798    BASIS. */
 799 unsigned int
 800 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 801 {
 802   uint8_t folded_buf[2048];
 803   size_t folded_len = sizeof folded_buf;
 804   uint8_t *folded_s;
 805   unsigned int hash;
 806
 807   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 808                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 809   if (folded_s != NULL)
 810     {
 811       hash = hash_bytes (folded_s, folded_len, basis);
 812       if (folded_s != folded_buf)
 813         free (folded_s);
 814     }
 815   else
 816     {
 817       if (errno == ENOMEM)
 818         xalloc_die ();
 819       hash = hash_bytes (s, n, basis);
 820     }
 821
 822   return hash;
 823 }
 824
 825 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 826    uppercase letters treated as equal, starting from BASIS. */
 827 unsigned int
 828 utf8_hash_case_string (const char *s, unsigned int basis)
 829 {
 830   return utf8_hash_case_bytes (s, strlen (s), basis);
 831 }
 832
 833 /* Compares UTF-8 strings A and B case-insensitively.
 834    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 835 int
 836 utf8_strcasecmp (const char *a, const char *b)
 837 {
 838   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 839 }
 840
 841 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 842    case-insensitively.
 843    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 844 int
 845 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 846 {
 847   int result;
 848
 849   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 850                   CHAR_CAST (const uint8_t *, b), bn,
 851                   NULL, UNINORM_NFKD, &result))
 852     {
 853       if (errno == ENOMEM)
 854         xalloc_die ();
 855
 856       result = memcmp (a, b, MIN (an, bn));
 857       if (result == 0)
 858         result = an < bn ? -1 : an > bn;
 859     }
 860
 861   return result;
 862 }
 863
 864 static bool
 865 is_all_digits (const uint8_t *s, size_t len)
 866 {
 867   for (size_t i = 0; i < len; i++)
 868     if (!c_isdigit (s[i]))
 869       return false;
 870   return true;
 871 }
 872
 873 /* Compares UTF-8 strings A and B case-insensitively.  If the strings end in a
 874    number, then they are compared numerically.  Returns a negative value if A <
 875    B, zero if A == B, positive if A > B. */
 876 int
 877 utf8_strverscasecmp (const char *a, const char *b)
 878 {
 879   /* Normalize A. */
 880   uint8_t a_stub[64];
 881   size_t a_len = sizeof a_stub;
 882   uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
 883                                  UNINORM_NFKD, a_stub, &a_len);
 884
 885   /* Normalize B. */
 886   uint8_t b_stub[64];
 887   size_t b_len = sizeof b_stub;
 888   uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
 889                                  UNINORM_NFKD, b_stub, &b_len);
 890
 891   int result;
 892   if (!a_norm || !b_norm)
 893     {
 894       result = strcmp (a, b);
 895       goto exit;
 896     }
 897
 898   size_t len = MIN (a_len, b_len);
 899   for (size_t i = 0; i < len; i++)
 900     if (a_norm[i] != b_norm[i])
 901       {
 902         /* If both strings end in digits, compare them numerically. */
 903         if (is_all_digits (&a_norm[i], a_len - i)
 904             && is_all_digits (&b_norm[i], b_len - i))
 905           {
 906             /* Start by stripping leading zeros, since those don't matter for
 907                numerical comparison. */
 908             size_t ap, bp;
 909             for (ap = i; ap < a_len; ap++)
 910               if (a_norm[ap] != '0')
 911                 break;
 912             for (bp = i; bp < b_len; bp++)
 913               if (b_norm[bp] != '0')
 914                 break;
 915
 916             /* The number with more digits, if there is one, is larger. */
 917             size_t a_digits = a_len - ap;
 918             size_t b_digits = b_len - bp;
 919             if (a_digits != b_digits)
 920               result = a_digits > b_digits ? 1 : -1;
 921             else
 922               result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
 923           }
 924         else
 925           result = a_norm[i] > b_norm[i] ? 1 : -1;
 926         goto exit;
 927       }
 928   result = a_len < b_len ? -1 : a_len > b_len;
 929
 930 exit:
 931   if (a_norm != a_stub)
 932     free (a_norm);
 933   if (b_norm != b_stub)
 934     free (b_norm);
 935   return result;
 936 }
 937
 938 static char *
 939 utf8_casemap (const char *s,
 940               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 941                              uint8_t *, size_t *))
 942 {
 943   char *result;
 944   size_t size;
 945
 946   result = CHAR_CAST (char *,
 947                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 948                          NULL, NULL, NULL, &size));
 949   if (result == NULL)
 950     {
 951       if (errno == ENOMEM)
 952         xalloc_die ();
 953
 954       result = xstrdup (s);
 955     }
 956   return result;
 957 }
 958
 959 char *
 960 utf8_to_upper (const char *s)
 961 {
 962   return utf8_casemap (s, u8_toupper);
 963 }
 964
 965 char *
 966 utf8_to_lower (const char *s)
 967 {
 968   return utf8_casemap (s, u8_tolower);
 969 }
 970
 971 char *
 972 utf8_to_title (const char *s)
 973 {
 974   return utf8_casemap (s, u8_totitle);
 975 }
 976 \f
 977 bool
 978 get_encoding_info (struct encoding_info *e, const char *name)
 979 {
 980   const struct substring in = SS_LITERAL_INITIALIZER (
 981                                                       "\t\n\v\f\r "
 982                                                       "!\"#$%&'()*+,-./0123456789:;<=>?@"
 983                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 984                                                       "abcdefghijklmnopqrstuvwxyz{|}~");
 985
 986   struct substring out, cr, lf, space;
 987   bool ok;
 988
 989   memset (e, 0, sizeof *e);
 990
 991   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 992   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 993   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 994   ok = (cr.length >= 1
 995         && cr.length <= MAX_UNIT
 996         && cr.length == lf.length
 997         && cr.length == space.length);
 998   if (!ok)
 999     {
1000       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1001       ss_dealloc (&cr);
1002       ss_dealloc (&lf);
1003       ss_dealloc (&space);
1004       ss_alloc_substring (&cr, ss_cstr ("\r"));
1005       ss_alloc_substring (&lf, ss_cstr ("\n"));
1006       ss_alloc_substring (&space, ss_cstr (" "));
1007     }
1008
1009   e->unit = cr.length;
1010   memcpy (e->cr, cr.string, e->unit);
1011   memcpy (e->lf, lf.string, e->unit);
1012   memcpy (e->space, space.string, e->unit);
1013
1014   ss_dealloc (&cr);
1015   ss_dealloc (&lf);
1016   ss_dealloc (&space);
1017
1018   out = recode_substring_pool ("UTF-8", name, in, NULL);
1019   e->is_ascii_compatible = ss_equals (in, out);
1020   ss_dealloc (&out);
1021
1022   if (!e->is_ascii_compatible && e->unit == 1)
1023     {
1024       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1025       e->is_ebcdic_compatible = (out.length == 1
1026                                  && (uint8_t) out.string[0] == 0xc1);
1027       ss_dealloc (&out);
1028     }
1029   else
1030     e->is_ebcdic_compatible = false;
1031
1032   return ok;
1033 }
1034
1035 bool
1036 is_encoding_ascii_compatible (const char *encoding)
1037 {
1038   struct encoding_info e;
1039
1040   get_encoding_info (&e, encoding);
1041   return e.is_ascii_compatible;
1042 }
1043
1044 bool
1045 is_encoding_ebcdic_compatible (const char *encoding)
1046 {
1047   struct encoding_info e;
1048
1049   get_encoding_info (&e, encoding);
1050   return e.is_ebcdic_compatible;
1051 }
1052
/* Returns true only if a converter can be created in both directions between
   ENCODING and UTF-8, false otherwise.  The second direction is not
   attempted when the first one fails. */
bool
is_encoding_supported (const char *encoding)
{
  if (!create_iconv ("UTF-8", encoding))
    return false;
  if (!create_iconv (encoding, "UTF-8"))
    return false;
  return true;
}
1061
/* Returns true if E names a UTF-8 encoding, that is, if E is exactly "UTF-8"
   or "UTF8" with the letters in any mix of upper and lower case.

   XXX This only inspects the spelling of E.  Possibly the encoding's
   properties should be tested via iconv instead. */
bool
is_encoding_utf8 (const char *e)
{
  static const char lower[] = "utf";
  static const char upper[] = "UTF";
  const char *digit;
  int i;

  /* Stop at the first mismatch so that nothing past a short string's
     terminator is ever read. */
  for (i = 0; i < 3; i++)
    if (e[i] != lower[i] && e[i] != upper[i])
      return false;

  /* An optional hyphen may separate the letters from the final "8". */
  digit = e[3] == '-' ? &e[4] : &e[3];
  return digit[0] == '8' && digit[1] == '\0';
}
1075 \f
1076 static struct encoding_category *categories;
1077 static int n_categories;
1078
1079 static void SENTINEL (0)
1080   add_category (size_t *allocated_categories, const char *category, ...)
1081 {
1082   struct encoding_category *c;
1083   const char *encodings[16];
1084   va_list args;
1085   int i, n;
1086
1087   /* Count encoding arguments. */
1088   va_start (args, category);
1089   n = 0;
1090   while ((encodings[n] = va_arg (args, const char *)) != NULL)
1091     {
1092       const char *encoding = encodings[n];
1093       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1094         n++;
1095     }
1096   assert (n < sizeof encodings / sizeof *encodings);
1097   va_end (args);
1098
1099   if (n == 0)
1100     return;
1101
1102   if (n_categories >= *allocated_categories)
1103     categories = x2nrealloc (categories,
1104                              allocated_categories, sizeof *categories);
1105
1106   c = &categories[n_categories++];
1107   c->category = category;
1108   c->encodings = xmalloc (n * sizeof *c->encodings);
1109   for (i = 0; i < n; i++)
1110     c->encodings[i] = encodings[i];
1111   c->n_encodings = n;
1112 }
1113
1114 static void
1115 init_encoding_categories (void)
1116 {
1117   static bool inited;
1118   size_t alloc;
1119
1120   if (inited)
1121     return;
1122   inited = true;
1123
1124   alloc = 0;
1125   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1126                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1127   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1128                 NULL_SENTINEL);
1129   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1130   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1131                 "Windows-1257", NULL_SENTINEL);
1132   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1133   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1134                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1135   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1136                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1137   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1138                 "EUC-TW", NULL_SENTINEL);
1139   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1140   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1141                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1142   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1143   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1144                 NULL_SENTINEL);
1145   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1146   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1147   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1148   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1149   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1150                 NULL_SENTINEL);
1151   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1152   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1153   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1154   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1155                 NULL_SENTINEL);
1156   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1157                 NULL_SENTINEL);
1158   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1159   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1160                 NULL_SENTINEL);
1161   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1162   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1163                 NULL_SENTINEL);
1164   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1165                 NULL_SENTINEL);
1166   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1167                 "Windows-1258", NULL_SENTINEL);
1168   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1169                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1170 }
1171
1172 /* Returns an array of "struct encoding_category" that contains only the
1173    categories and encodings that the system supports. */
1174 struct encoding_category *
1175 get_encoding_categories (void)
1176 {
1177   init_encoding_categories ();
1178   return categories;
1179 }
1180
1181 /* Returns the number of elements in the array returned by
1182    get_encoding_categories().  */
1183 size_t
1184 get_n_encoding_categories (void)
1185 {
1186   init_encoding_categories ();
1187   return n_categories;
1188 }