pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
   3    2016, 2021 Free Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  17
  18 #include <config.h>
  19
  20 #include "libpspp/i18n.h"
  21
  22 #include <assert.h>
  23 #include <errno.h>
  24 #include <iconv.h>
  25 #include <langinfo.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32
  33 #include "libpspp/assertion.h"
  34 #include "libpspp/compiler.h"
  35 #include "libpspp/hmapx.h"
  36 #include "libpspp/hash-functions.h"
  37 #include "libpspp/pool.h"
  38 #include "libpspp/str.h"
  39 #include "libpspp/version.h"
  40
  41 #include "gl/c-ctype.h"
  42 #include "gl/c-strcase.h"
  43 #include "gl/localcharset.h"
  44 #include <gl/localename.h>
  45 #include "gl/minmax.h"
  46 #include "gl/xalloc.h"
  47 #include "gl/relocatable.h"
  48 #include "gl/xstrndup.h"
  49
  50 #include "gettext.h"
  51 #define _(msgid) gettext (msgid)
  52
  53 struct converter
  54 {
       /* One cached iconv conversion, created and looked up by create_iconv()
          using the (tocode, fromcode) pair as the key. */
  55   char *tocode;              /* Target encoding name given to iconv_open(); owned copy. */
  56   char *fromcode;            /* Source encoding name given to iconv_open(); owned copy. */
  57   iconv_t conv;              /* Result of iconv_open (tocode, fromcode). */
  58   int null_char_width;       /* Bytes produced when an ASCII NUL is converted to TOCODE. */
  59 };
  60
  61 static char *default_encoding;  /* Encoding substituted for a null TO or FROM; set by i18n_init(). */
  62 static struct hmapx map;         /* Cache of converters filled by create_iconv(); null data marks a failed open. */
  63
  64 /* A wrapper around iconv_open */
  65 static struct converter *
  66 create_iconv (const char* tocode, const char* fromcode, bool warn)
  67 {
  68   size_t hash;
  69   struct hmapx_node *node;
  70   struct converter *converter;
  71   assert (fromcode);
  72
  73   hash = hash_string (tocode, hash_string (fromcode, 0));
  74   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  75     {
  76       if (!converter)
  77         return NULL;
  78
  79       if (!strcmp (tocode, converter->tocode)
  80           && !strcmp (fromcode, converter->fromcode))
  81         return converter;
  82     }
  83
  84   converter = xmalloc (sizeof *converter);
  85   converter->tocode = xstrdup (tocode);
  86   converter->fromcode = xstrdup (fromcode);
  87   converter->conv = iconv_open (tocode, fromcode);
  88   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
  89   /* I don't think it's safe to translate this string or to use messaging
  90      as the converters have not yet been set up */
  91   if (error && strcmp (tocode, fromcode))
  92     {
  93       if (warn)
  94         fprintf (stderr,
  95                  "Warning: "
  96                  "cannot create a converter for `%s' to `%s': %s\n",
  97                  fromcode, tocode, strerror (error));
  98
  99       free (converter->tocode);
 100       free (converter->fromcode);
 101       free (converter);
 102
 103       hmapx_insert (&map, NULL, hash);
 104       return NULL;
 105     }
 106
 107   /* Find out how many bytes there are in a null char in the target
 108      encoding */
 109   iconv_t bconv = iconv_open (tocode, "ASCII");
 110   if (bconv != (iconv_t) -1)
 111     {
 112       ICONV_CONST char inbuf[1] = "";
 113       ICONV_CONST char *inptr = inbuf;
 114       size_t inbytes = sizeof inbuf;
 115
 116       char outbuf[8];
 117       char *outptr = outbuf;
 118       size_t outbytes = sizeof outbuf;
 119       if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
 120         converter->null_char_width = outptr - outbuf;
 121       iconv_close (bconv);
 122     }
 123
 124   hmapx_insert (&map, converter, hash);
 125
 126   return converter;
 127 }
 128
 129
 130 /* Converts the single byte C from encoding FROM to TO, returning the first
 131    byte of the result.
 132
 133    This function probably shouldn't be used at all, but some code still does
 134    use it. */
 135 char
 136 recode_byte (const char *to, const char *from, char c)
 137 {
 138   char x;
 139   char *s = recode_string (to, from, &c, 1);
 140   x = s[0];
 141   free (s);
 142   return x;
 143 }
 144
 145 /* Similar to recode_string_pool, but allocates the returned value on the heap
 146    instead of in a pool.  It is the caller's responsibility to free the
 147    returned value. */
 148 char *
 149 recode_string (const char *to, const char *from,
 150                const char *text, int length)
 151 {
 152   return recode_string_pool (to, from, text, length, NULL);
 153 }
 154
 155 /* Returns the length, in bytes, of the string that a similar recode_string()
 156    call would return. */
 157 size_t
 158 recode_string_len (const char *to, const char *from,
 159                    const char *text, int length)
 160 {
 161   char *s = recode_string (to, from, text, length);
 162   size_t len = strlen (s);
 163   free (s);
 164   return len;
 165 }
 166
 167 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 168    at OP, and appends a null terminator to the output.
 169
 170    Returns the output length if successful, -1 if the output buffer is too
 171    small. */
 172 static ssize_t
 173 try_recode (struct converter *cvtr, char fallbackchar,
 174             const char *in, size_t inbytes,
 175             char *out_, size_t outbytes)
 176 {
 177   char *out = out_;
 178   int i, j;
 179
 180   int null_bytes = cvtr->null_char_width;
 181
 182   /* Put the converter into the initial shift state, in case there was any
 183      state information left over from its last usage. */
 184   iconv (cvtr->conv, NULL, 0, NULL, 0);
 185
 186   /* Do two rounds of iconv() calls:
 187
 188      - The first round does the bulk of the conversion using the
 189      caller-supplied input data..
 190
 191      - The second round flushes any leftover output.  This has a real effect
 192      with input encodings that use combining diacritics, e.g. without the
 193      second round the last character tends to gets dropped when converting
 194      from windows-1258 to other encodings.
 195   */
 196   for (i = 0; i < 2; i++)
 197     {
 198       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 199       size_t *inbytesp = i ? NULL : &inbytes;
 200
 201       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 202         switch (errno)
 203           {
 204           case EINVAL:
 205             if (outbytes < null_bytes + 1)
 206               return -E2BIG;
 207             if (!fallbackchar)
 208               return -EINVAL;
 209             *out++ = fallbackchar;
 210             for (j = 0 ; j < null_bytes ; ++j)
 211               *out++ = '\0';
 212             return out - 1 - out_;
 213
 214           case EILSEQ:
 215             if (outbytes == 0)
 216               return -E2BIG;
 217             if (!fallbackchar)
 218               return -EILSEQ;
 219             *out++ = fallbackchar;
 220             outbytes--;
 221             if (inp)
 222               {
 223                 in++;
 224                 inbytes--;
 225               }
 226             break;
 227
 228           case E2BIG:
 229             return -E2BIG;
 230
 231           default:
 232             /* should never happen */
 233             fprintf (stderr, "Character conversion error: %s\n",
 234                      strerror (errno));
 235             NOT_REACHED ();
 236             break;
 237           }
 238     }
 239
 240   if (outbytes <= null_bytes - 1)
 241     return -E2BIG;
 242
 243   for (i = 0 ; i < null_bytes ; ++i)
 244     *out++ = '\0';
 245
 246   return out - 1 - out_;
 247 }
 248
 249 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 250    dynamically allocated string in TO-encoding.  Any characters which cannot be
 251    converted will be represented by '?'.
 252
 253    LENGTH should be the length of the string or -1, if null terminated.
 254
 255    The returned string will be allocated on POOL.
 256
 257    This function's behaviour differs from that of g_convert_with_fallback
 258    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 259    the input string is not valid in the declared input encoding.  This function
 260    however perseveres even in the presence of badly encoded input. */
 261 char *
 262 recode_string_pool (const char *to, const char *from,
 263                     const char *text, int length, struct pool *pool)
 264 {
 265   struct substring out;
 266
 267   if (text == NULL)
 268     return NULL;
 269
 270   if (length == -1)
 271     length = strlen (text);
 272
 273   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 274   return out.string;
 275 }
 276
 277 /* Returns the name of the encoding that should be used for file names.
 278
 279    This is meant to be the same encoding used by g_filename_from_uri() and
 280    g_filename_to_uri() in GLib. */
 281 static const char *
 282 filename_encoding (void)
 283 {
 284 #if defined _WIN32 || defined __WIN32__
 285   return "UTF-8";
 286 #else
 287   return locale_charset ();
 288 #endif
 289 }
 290
 291 static char *
 292 xconcat2 (const char *a, size_t a_len,
 293           const char *b, size_t b_len)
 294 {
 295   char *s = xmalloc (a_len + b_len + 1);
 296   memcpy (s, a, a_len);
 297   memcpy (s + a_len, b, b_len);
 298   s[a_len + b_len] = '\0';
 299   return s;
 300 }
 301
 302 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 303    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 304    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 305    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 306    HEAD and tries again, repeating as necessary until the concatenated result
 307    fits or until HEAD_LEN reaches 0.
 308
 309    [*] Actually this function drops grapheme clusters instead of characters, so
 310    that, e.g. a Unicode character followed by a combining accent character
 311    is either completely included or completely excluded from HEAD_LEN.  See
 312    UAX #29 at http://unicode.org/reports/tr29/ for more information on
 313    grapheme clusters.
 314
 315    A null ENCODING is treated as UTF-8.
 316
 317    Sometimes this function has to actually construct the concatenated string to
 318    measure its length.  When this happens, it sets *RESULTP to that
 319    null-terminated string, allocated with malloc(), for the caller to use if it
 320    needs it.  Otherwise, it sets *RESULTP to NULL.
 321
 322    Simple examples for encoding="UTF-8", max_len=6:
 323
 324    head="abc",  tail="xyz"     => 3
 325    head="abcd", tail="xyz"     => 3 ("d" dropped).
 326    head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 327    head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 328
 329    Examples for encoding="ISO-8859-1", max_len=6:
 330
 331    head="éèä",  tail="xyz"     => 6
 332    (each letter in head is only 1 byte in ISO-8859-1 even though they
 333    each take 2 bytes in UTF-8 encoding)
 334 */
 335 static size_t
 336 utf8_encoding_concat__ (const char *head, size_t head_len,
 337                         const char *tail, size_t tail_len,
 338                         const char *encoding, size_t max_len,
 339                         char **resultp)
 340 {
 341   *resultp = NULL;
 342   if (head_len == 0)
 343     return 0;
 344   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 345     {
 346       if (head_len + tail_len <= max_len)
 347         return head_len;
 348       else if (tail_len >= max_len)
 349         return 0;
 350       else
 351         {
 352           size_t copy_len;
 353           ucs4_t prev;
 354           size_t ofs;
 355           int mblen;
 356
 357           copy_len = 0;
 358           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 359                                 head_len);
 360                ofs <= max_len - tail_len;
 361                ofs += mblen)
 362             {
 363               ucs4_t next;
 364
 365               mblen = u8_mbtouc (&next,
 366                                  CHAR_CAST (const uint8_t *, head + ofs),
 367                                  head_len - ofs);
 368               if (uc_is_grapheme_break (prev, next))
 369                 copy_len = ofs;
 370
 371               prev = next;
 372             }
 373
 374           return copy_len;
 375         }
 376     }
 377   else
 378     {
 379       char *result;
 380
 381       result = (tail_len > 0
 382                 ? xconcat2 (head, head_len, tail, tail_len)
 383                 : CONST_CAST (char *, head));
 384       if (recode_string_len (encoding, "UTF-8", result,
 385                              head_len + tail_len) <= max_len)
 386         {
 387           *resultp = result != head ? result : NULL;
 388           return head_len;
 389         }
 390       else
 391         {
 392           bool correct_result = false;
 393           size_t copy_len;
 394           ucs4_t prev;
 395           size_t ofs;
 396           int mblen;
 397
 398           copy_len = 0;
 399           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 400                                 head_len);
 401                ofs <= head_len;
 402                ofs += mblen)
 403             {
 404               ucs4_t next;
 405
 406               mblen = u8_mbtouc (&next,
 407                                  CHAR_CAST (const uint8_t *, head + ofs),
 408                                  head_len - ofs);
 409               if (uc_is_grapheme_break (prev, next))
 410                 {
 411                   if (result != head)
 412                     {
 413                       memcpy (result, head, ofs);
 414                       memcpy (result + ofs, tail, tail_len);
 415                       result[ofs + tail_len] = '\0';
 416                     }
 417
 418                   if (recode_string_len (encoding, "UTF-8", result,
 419                                          ofs + tail_len) <= max_len)
 420                     {
 421                       correct_result = true;
 422                       copy_len = ofs;
 423                     }
 424                   else
 425                     correct_result = false;
 426                 }
 427
 428               prev = next;
 429             }
 430
 431           if (result != head)
 432             {
 433               if (correct_result)
 434                 *resultp = result;
 435               else
 436                 free (result);
 437             }
 438
 439           return copy_len;
 440         }
 441     }
 442 }
 443
 444 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 445    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 446    string are all encoded in UTF-8.  As many characters[*] from the beginning
 447    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 448    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 449    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 450
 451    [*] Actually this function drops grapheme clusters instead of characters, so
 452    that, e.g. a Unicode character followed by a combining accent character
 453    is either completely included or completely excluded from the returned
 454    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 455    information on grapheme clusters.
 456
 457    A null ENCODING is treated as UTF-8.
 458
 459    Simple examples for encoding="UTF-8", max_len=6:
 460
 461    head="abc",  tail="xyz"     => "abcxyz"
 462    head="abcd", tail="xyz"     => "abcxyz"
 463    head="abc",  tail="uvwxyz"  => "uvwxyz"
 464    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 465
 466    Examples for encoding="ISO-8859-1", max_len=6:
 467
 468    head="éèä",  tail="xyz"    => "éèäxyz"
 469    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 470    each take 2 bytes in UTF-8 encoding)
 471 */
 472 char *
 473 utf8_encoding_concat (const char *head, const char *tail,
 474                       const char *encoding, size_t max_len)
 475 {
 476   size_t tail_len = strlen (tail);
 477   size_t prefix_len;
 478   char *result;
 479
 480   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 481                                        encoding, max_len, &result);
 482   return (result != NULL
 483           ? result
 484           : xconcat2 (head, prefix_len, tail, tail_len));
 485 }
 486
 487 /* Returns the length, in bytes, of the string that would be returned by
 488    utf8_encoding_concat() if passed the same arguments, but the implementation
 489    is often more efficient. */
 490 size_t
 491 utf8_encoding_concat_len (const char *head, const char *tail,
 492                           const char *encoding, size_t max_len)
 493 {
 494   size_t tail_len = strlen (tail);
 495   size_t prefix_len;
 496   char *result;
 497
 498   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 499                                        encoding, max_len, &result);
 500   free (result);
 501   return prefix_len + tail_len;
 502 }
 503
 504 /* Returns an allocated, null-terminated string, owned by the caller,
 505    containing as many characters[*] from the beginning of S that would fit
 506    within MAX_LEN bytes if the returned string were to be re-encoded in
 507    ENCODING.  Both S and the returned string are encoded in UTF-8.
 508
 509    [*] Actually this function drops grapheme clusters instead of characters, so
 510    that, e.g. a Unicode character followed by a combining accent character
 511    is either completely included or completely excluded from the returned
 512    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 513    information on grapheme clusters.
 514
 515    A null ENCODING is treated as UTF-8.
 516 */
 517 char *
 518 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 519 {
 520   return utf8_encoding_concat (s, "", encoding, max_len);
 521 }
 522
 523 /* Returns the length, in bytes, of the string that would be returned by
 524    utf8_encoding_trunc() if passed the same arguments, but the implementation
 525    is often more efficient. */
 526 size_t
 527 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 528 {
 529   return utf8_encoding_concat_len (s, "", encoding, max_len);
 530 }
 531
 532 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 533    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 534    current locale. */
 535 char *
 536 utf8_to_filename (const char *filename)
 537 {
 538   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 539 }
 540
 541 /* Returns FILENAME converted from the filename encoding to UTF-8.
 542    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 543    current locale. */
 544 char *
 545 filename_to_utf8 (const char *filename)
 546 {
 547   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 548 }
 549
 550 static int
 551 recode_substring_pool__ (const char *to, const char *from,
 552                          struct substring text, char fallbackchar,
 553                          struct pool *pool, struct substring *out)
 554 {
 555   size_t bufsize;
 556   struct converter *conv;
 557
 558   if (to == NULL)
 559     to = default_encoding;
 560
 561   if (from == NULL)
 562     from = default_encoding;
 563
 564   conv = create_iconv (to, from, true);
 565
 566   if (NULL == conv)
 567     {
 568       if (fallbackchar)
 569         {
 570           out->string = pool_malloc (pool, text.length + 1);
 571           out->length = text.length;
 572           memcpy (out->string, text.string, text.length);
 573           out->string[out->length] = '\0';
 574           return 0;
 575         }
 576       else
 577         return EPROTO;
 578     }
 579
 580   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 581     {
 582       char *output = pool_malloc (pool, bufsize);
 583       ssize_t retval;
 584
 585       retval = try_recode (conv, fallbackchar, text.string, text.length,
 586                            output, bufsize);
 587       if (retval >= 0)
 588         {
 589           *out = ss_buffer (output, retval);
 590           return 0;
 591         }
 592       pool_free (pool, output);
 593
 594       if (retval != -E2BIG)
 595         return -retval;
 596     }
 597
 598   NOT_REACHED ();
 599 }
 600
 601 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 602    dynamically allocated string in TO-encoding.  Any characters which cannot be
 603    converted will be represented by '?'.
 604
 605    The returned string will be null-terminated and allocated on POOL with
 606    pool_malloc().
 607
 608    This function's behaviour differs from that of g_convert_with_fallback
 609    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 610    the input string is not valid in the declared input encoding.  This function
 611    however perseveres even in the presence of badly encoded input. */
 612 struct substring
 613 recode_substring_pool (const char *to, const char *from,
 614                        struct substring text, struct pool *pool)
 615 {
 616   struct substring out;
 617
 618   recode_substring_pool__ (to, from, text, '?', pool, &out);
 619   return out;
 620 }
 621
 622 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 623    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 624    converted null-terminated string, allocated from POOL with pool_malloc(), is
 625    stored in *OUT.  On failure, returns a positive errno value.
 626
 627    The function fails with an error if any part of the input string is not
 628    valid in the declared input encoding. */
 629 int
 630 recode_pedantically (const char *to, const char *from,
 631                      struct substring text, struct pool *pool,
 632                      struct substring *out)
 633 {
 634   int error;
 635
 636   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 637   if (error)
 638     *out = ss_empty ();
 639   return error;
 640 }
 641 \f
 642 void
 643 i18n_init (void)
 644 {
 645   setlocale (LC_ALL, "");
 646   char *allocated;
 647   bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
 648   free (allocated);
 649   textdomain (PACKAGE);
 650
 651   assert (default_encoding == NULL);
 652   default_encoding = xstrdup (locale_charset ());
 653
 654   hmapx_init (&map);
 655 }
 656
 657 const char *
 658 get_default_encoding (void)
 659 {
 660   return default_encoding;
 661 }
 662
 663 void
 664 set_default_encoding (const char *enc)
 665 {
 666   free (default_encoding);
 667   default_encoding = xstrdup (enc);
 668 }
 669
 670 /* Return the ISO two letter code for the current LC_MESSAGES
 671    locale category.  */
 672 char *
 673 get_language (void)
 674 {
 675   const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
 676   if (0 == strcmp (localename, "C"))
 677     return NULL;
 678   char *ln = xstrdup (localename);
 679   char *end = strchr (ln, '_');
 680   if (end)
 681     *end = '\0';
 682   return ln;
 683 }
 684
 685
 686 /* Attempts to set the encoding from a locale name
 687    returns true if successful.
 688    This function does not (should not!) alter the current locale.
 689 */
 690 bool
 691 set_encoding_from_locale (const char *loc)
 692 {
 693   bool ok = true;
 694   char *c_encoding;
 695   char *loc_encoding;
 696   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 697
 698   setlocale (LC_CTYPE, "C");
 699   c_encoding = xstrdup (locale_charset ());
 700
 701   setlocale (LC_CTYPE, loc);
 702   loc_encoding = xstrdup (locale_charset ());
 703
 704
 705   if (0 == strcmp (loc_encoding, c_encoding))
 706     {
 707       ok = false;
 708     }
 709
 710   setlocale (LC_CTYPE, tmp);
 711
 712   free (tmp);
 713
 714   if (ok)
 715     {
 716       free (default_encoding);
 717       default_encoding = loc_encoding;
 718     }
 719   else
 720     free (loc_encoding);
 721
 722   free (c_encoding);
 723
 724   return ok;
 725 }
 726
 727 void
 728 i18n_done (void)
 729 {
 730   struct hmapx_node *node;
 731   struct converter *cvtr;
 732
 733   HMAPX_FOR_EACH (cvtr, node, &map)
 734     {
 735       if (cvtr == NULL)
 736         continue;
 737       free (cvtr->tocode);
 738       free (cvtr->fromcode);
 739       if (cvtr->conv != (iconv_t) -1)
 740         iconv_close (cvtr->conv);
 741       free (cvtr);
 742     }
 743
 744   hmapx_destroy (&map);
 745
 746   free (default_encoding);
 747   default_encoding = NULL;
 748 }
 749
 750
 751
 752 bool
 753 valid_encoding (const char *enc)
 754 {
 755   iconv_t conv = iconv_open (UTF8, enc);
 756
 757   if (conv == (iconv_t) -1)
 758     return false;
 759
 760   iconv_close (conv);
 761
 762   return true;
 763 }
 764
 765
 766 /* Return the system local's idea of the
 767    decimal separator character */
 768 char
 769 get_system_decimal (void)
 770 {
 771   char radix_char;
 772
 773 #if HAVE_NL_LANGINFO
 774   radix_char = nl_langinfo (RADIXCHAR)[0];
 775 #else
 776   {
 777     char buf[10];
 778     snprintf (buf, sizeof buf, "%f", 2.5);
 779     radix_char = buf[1];
 780   }
 781 #endif
 782
 783   return radix_char;
 784 }
 785
 786 const char *
 787 uc_name (ucs4_t uc, char buffer[16])
 788 {
 789   if (uc >= 0x20 && uc < 0x7f)
 790     snprintf (buffer, 16, "`%c'", uc);
 791   else
 792     snprintf (buffer, 16, "U+%04X", uc);
 793   return buffer;
 794 }
 795 \f
 796 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 797
 798 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 799    with lowercase and uppercase letters treated as equal, starting from
 800    BASIS. */
 801 unsigned int
 802 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 803 {
 804   uint8_t folded_buf[2048];
 805   size_t folded_len = sizeof folded_buf;
 806   uint8_t *folded_s;
 807   unsigned int hash;
 808
 809   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 810                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 811   if (folded_s != NULL)
 812     {
 813       hash = hash_bytes (folded_s, folded_len, basis);
 814       if (folded_s != folded_buf)
 815         free (folded_s);
 816     }
 817   else
 818     {
 819       if (errno == ENOMEM)
 820         xalloc_die ();
 821       hash = hash_bytes (s, n, basis);
 822     }
 823
 824   return hash;
 825 }
 826
 827 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 828    uppercase letters treated as equal, starting from BASIS. */
 829 unsigned int
 830 utf8_hash_case_string (const char *s, unsigned int basis)
 831 {
 832   return utf8_hash_case_substring (ss_cstr (s), basis);
 833 }
 834
 835 /* Returns a hash value for UTF-8 string S, with lowercase and uppercase
 836    letters treated as equal, starting from BASIS. */
 837 unsigned int
 838 utf8_hash_case_substring (struct substring s, unsigned int basis)
 839 {
 840   return utf8_hash_case_bytes (s.string, s.length, basis);
 841 }
 842
 843 /* Compares UTF-8 strings A and B case-insensitively.
 844    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 845 int
 846 utf8_strcasecmp (const char *a, const char *b)
 847 {
 848   return utf8_sscasecmp (ss_cstr (a), ss_cstr (b));
 849 }
 850
 851 int
 852 utf8_sscasecmp (struct substring a, struct substring b)
 853 {
 854   return utf8_strncasecmp (a.string, a.length, b.string, b.length);
 855 }
 856
 857 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 858    case-insensitively.
 859    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 860 int
 861 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 862 {
 863   int result;
 864
 865   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 866                   CHAR_CAST (const uint8_t *, b), bn,
 867                   NULL, UNINORM_NFKD, &result))
 868     {
 869       if (errno == ENOMEM)
 870         xalloc_die ();
 871
 872       result = memcmp (a, b, MIN (an, bn));
 873       if (result == 0)
 874         result = an < bn ? -1 : an > bn;
 875     }
 876
 877   return result;
 878 }
 879
 880 static bool
 881 is_all_digits (const uint8_t *s, size_t len)
 882 {
 883   for (size_t i = 0; i < len; i++)
 884     if (!c_isdigit (s[i]))
 885       return false;
 886   return true;
 887 }
 888
 889 /* Compares UTF-8 strings A and B case-insensitively.  If the strings end in a
 890    number, then they are compared numerically.  Returns a negative value if A <
 891    B, zero if A == B, positive if A > B. */
 892 int
 893 utf8_strverscasecmp (const char *a, const char *b)
 894 {
 895   /* Normalize A. */
 896   uint8_t a_stub[64];
 897   size_t a_len = sizeof a_stub;
 898   uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
 899                                  UNINORM_NFKD, a_stub, &a_len);
 900
 901   /* Normalize B. */
 902   uint8_t b_stub[64];
 903   size_t b_len = sizeof b_stub;
 904   uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
 905                                  UNINORM_NFKD, b_stub, &b_len);
 906
 907   int result;
 908   if (!a_norm || !b_norm)
 909     {
 910       result = strcmp (a, b);
 911       goto exit;
 912     }
 913
 914   size_t len = MIN (a_len, b_len);
 915   for (size_t i = 0; i < len; i++)
 916     if (a_norm[i] != b_norm[i])
 917       {
 918         /* If both strings end in digits, compare them numerically. */
 919         if (is_all_digits (&a_norm[i], a_len - i)
 920             && is_all_digits (&b_norm[i], b_len - i))
 921           {
 922             /* Start by stripping leading zeros, since those don't matter for
 923                numerical comparison. */
 924             size_t ap, bp;
 925             for (ap = i; ap < a_len; ap++)
 926               if (a_norm[ap] != '0')
 927                 break;
 928             for (bp = i; bp < b_len; bp++)
 929               if (b_norm[bp] != '0')
 930                 break;
 931
 932             /* The number with more digits, if there is one, is larger. */
 933             size_t a_digits = a_len - ap;
 934             size_t b_digits = b_len - bp;
 935             if (a_digits != b_digits)
 936               result = a_digits > b_digits ? 1 : -1;
 937             else
 938               result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
 939           }
 940         else
 941           result = a_norm[i] > b_norm[i] ? 1 : -1;
 942         goto exit;
 943       }
 944   result = a_len < b_len ? -1 : a_len > b_len;
 945
 946 exit:
 947   if (a_norm != a_stub)
 948     free (a_norm);
 949   if (b_norm != b_stub)
 950     free (b_norm);
 951   return result;
 952 }
 953
 954 static char *
 955 utf8_casemap (const char *s,
 956               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 957                              uint8_t *, size_t *))
 958 {
 959   char *result;
 960   size_t size;
 961
 962   result = CHAR_CAST (char *,
 963                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 964                          NULL, NULL, NULL, &size));
 965   if (result == NULL)
 966     {
 967       if (errno == ENOMEM)
 968         xalloc_die ();
 969
 970       result = xstrdup (s);
 971     }
 972   return result;
 973 }
 974
 975 char *
 976 utf8_to_upper (const char *s)
 977 {
 978   return utf8_casemap (s, u8_toupper);
 979 }
 980
 981 char *
 982 utf8_to_lower (const char *s)
 983 {
 984   return utf8_casemap (s, u8_tolower);
 985 }
 986
 987 char *
 988 utf8_to_title (const char *s)
 989 {
 990   return utf8_casemap (s, u8_totitle);
 991 }
 992 \f
 993 bool
 994 get_encoding_info (struct encoding_info *e, const char *name)
 995 {
 996   const struct substring in = SS_LITERAL_INITIALIZER (
 997                                                       "\t\n\v\f\r "
 998                                                       "!\"#$%&'()*+,-./0123456789:;<=>?@"
 999                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
1000                                                       "abcdefghijklmnopqrstuvwxyz{|}~");
1001
1002   struct substring out, cr, lf, space;
1003   bool ok;
1004
1005   memset (e, 0, sizeof *e);
1006
1007   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
1008   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
1009   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
1010   ok = (cr.length >= 1
1011         && cr.length <= MAX_UNIT
1012         && cr.length == lf.length
1013         && cr.length == space.length);
1014   if (!ok)
1015     {
1016       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1017       ss_dealloc (&cr);
1018       ss_dealloc (&lf);
1019       ss_dealloc (&space);
1020       ss_alloc_substring (&cr, ss_cstr ("\r"));
1021       ss_alloc_substring (&lf, ss_cstr ("\n"));
1022       ss_alloc_substring (&space, ss_cstr (" "));
1023     }
1024
1025   e->unit = cr.length;
1026   memcpy (e->cr, cr.string, e->unit);
1027   memcpy (e->lf, lf.string, e->unit);
1028   memcpy (e->space, space.string, e->unit);
1029
1030   ss_dealloc (&cr);
1031   ss_dealloc (&lf);
1032   ss_dealloc (&space);
1033
1034   out = recode_substring_pool ("UTF-8", name, in, NULL);
1035   e->is_ascii_compatible = ss_equals (in, out);
1036   ss_dealloc (&out);
1037
1038   if (!e->is_ascii_compatible && e->unit == 1)
1039     {
1040       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1041       e->is_ebcdic_compatible = (out.length == 1
1042                                  && (uint8_t) out.string[0] == 0xc1);
1043       ss_dealloc (&out);
1044     }
1045   else
1046     e->is_ebcdic_compatible = false;
1047
1048   return ok;
1049 }
1050
1051 bool
1052 is_encoding_ascii_compatible (const char *encoding)
1053 {
1054   struct encoding_info e;
1055
1056   get_encoding_info (&e, encoding);
1057   return e.is_ascii_compatible;
1058 }
1059
1060 bool
1061 is_encoding_ebcdic_compatible (const char *encoding)
1062 {
1063   struct encoding_info e;
1064
1065   get_encoding_info (&e, encoding);
1066   return e.is_ebcdic_compatible;
1067 }
1068
/* Returns true only if a converter can be created in both directions between
   ENCODING and UTF-8, false otherwise.  Converter creation is requested with
   warnings turned off.  The second direction is tried only if the first one
   succeeds. */
bool
is_encoding_supported (const char *encoding)
{
  if (!create_iconv ("UTF-8", encoding, false))
    return false;

  return create_iconv (encoding, "UTF-8", false) ? true : false;
}
1077
/* Returns true if E names a UTF-8 encoding, that is, if E is "utf8" or
   "utf-8" with the letters in any mixture of upper and lower case.  Returns
   false for every other string.

   XXX Possibly we should test not E as a string but its properties via
   iconv. */
bool
is_encoding_utf8 (const char *e)
{
  static const char lower[] = "utf";
  static const char upper[] = "UTF";

  /* Match the three-letter prefix.  A null terminator in E matches neither
     table, so the loop never reads past the end of a short string. */
  for (size_t i = 0; i < sizeof lower - 1; i++)
    if (e[i] != lower[i] && e[i] != upper[i])
      return false;

  /* What remains must be "8", optionally preceded by a single hyphen. */
  const char *rest = e + (sizeof lower - 1);
  if (*rest == '-')
    rest++;
  return rest[0] == '8' && rest[1] == '\0';
}
1091 \f
1092 static struct encoding_category *categories;
1093 static int n_categories;
1094
1095 static void SENTINEL (0)
1096   add_category (size_t *allocated_categories, const char *category, ...)
1097 {
1098   struct encoding_category *c;
1099   const char *encodings[16];
1100   va_list args;
1101   int i, n;
1102
1103   /* Count encoding arguments. */
1104   va_start (args, category);
1105   n = 0;
1106   while ((encodings[n] = va_arg (args, const char *)) != NULL)
1107     {
1108       const char *encoding = encodings[n];
1109       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1110         n++;
1111     }
1112   assert (n < sizeof encodings / sizeof *encodings);
1113   va_end (args);
1114
1115   if (n == 0)
1116     return;
1117
1118   if (n_categories >= *allocated_categories)
1119     categories = x2nrealloc (categories,
1120                              allocated_categories, sizeof *categories);
1121
1122   c = &categories[n_categories++];
1123   c->category = category;
1124   c->encodings = xmalloc (n * sizeof *c->encodings);
1125   for (i = 0; i < n; i++)
1126     c->encodings[i] = encodings[i];
1127   c->n_encodings = n;
1128 }
1129
1130 static void
1131 init_encoding_categories (void)
1132 {
1133   static bool inited;
1134   size_t alloc;
1135
1136   if (inited)
1137     return;
1138   inited = true;
1139
1140   alloc = 0;
1141   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1142                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1143   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1144                 NULL_SENTINEL);
1145   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1146   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1147                 "Windows-1257", NULL_SENTINEL);
1148   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1149   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1150                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1151   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1152                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1153   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1154                 "EUC-TW", NULL_SENTINEL);
1155   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1156   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1157                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1158   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1159   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1160                 NULL_SENTINEL);
1161   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1162   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1163   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1164   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1165   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1166                 NULL_SENTINEL);
1167   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1168   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1169   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1170   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1171                 NULL_SENTINEL);
1172   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1173                 NULL_SENTINEL);
1174   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1175   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1176                 NULL_SENTINEL);
1177   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1178   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1179                 NULL_SENTINEL);
1180   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1181                 NULL_SENTINEL);
1182   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1183                 "Windows-1258", NULL_SENTINEL);
1184   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1185                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1186 }
1187
1188 /* Returns an array of "struct encoding_category" that contains only the
1189    categories and encodings that the system supports. */
1190 struct encoding_category *
1191 get_encoding_categories (void)
1192 {
1193   init_encoding_categories ();
1194   return categories;
1195 }
1196
1197 /* Returns the number of elements in the array returned by
1198    get_encoding_categories().  */
1199 size_t
1200 get_n_encoding_categories (void)
1201 {
1202   init_encoding_categories ();
1203   return n_categories;
1204 }