pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <locale.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <unicase.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/compiler.h"
  34 #include "libpspp/hmapx.h"
  35 #include "libpspp/hash-functions.h"
  36 #include "libpspp/pool.h"
  37 #include "libpspp/str.h"
  38 #include "libpspp/version.h"
  39
  40 #include "gl/c-strcase.h"
  41 #include "gl/localcharset.h"
  42 #include "gl/minmax.h"
  43 #include "gl/xalloc.h"
  44 #include "gl/relocatable.h"
  45 #include "gl/xstrndup.h"
  46
  47 #include "gettext.h"
  48 #define _(msgid) gettext (msgid)
  49
  50 struct converter
  51 {
  52   char *tocode;
  53   char *fromcode;
  54   iconv_t conv;
  55   int null_char_width;
  56 };
  57
  58 static char *default_encoding;
  59 static struct hmapx map;
  60
  61 /* A wrapper around iconv_open */
  62 static struct converter *
  63 create_iconv (const char* tocode, const char* fromcode)
  64 {
  65   size_t hash;
  66   struct hmapx_node *node;
  67   struct converter *converter;
  68   assert (fromcode);
  69
  70   hash = hash_string (tocode, hash_string (fromcode, 0));
  71   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  72     {
  73       if (!converter)
  74         return NULL;
  75
  76       if (!strcmp (tocode, converter->tocode)
  77           && !strcmp (fromcode, converter->fromcode))
  78         return converter;
  79     }
  80
  81   converter = xmalloc (sizeof *converter);
  82   converter->tocode = xstrdup (tocode);
  83   converter->fromcode = xstrdup (fromcode);
  84   converter->conv = iconv_open (tocode, fromcode);
  85   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
  86   /* I don't think it's safe to translate this string or to use messaging
  87      as the converters have not yet been set up */
  88   if (error && strcmp (tocode, fromcode))
  89     {
  90       fprintf (stderr,
  91                "Warning: "
  92                "cannot create a converter for `%s' to `%s': %s\n",
  93                fromcode, tocode, strerror (error));
  94
  95       free (converter->tocode);
  96       free (converter->fromcode);
  97       free (converter);
  98
  99       hmapx_insert (&map, NULL, hash);
 100       return NULL;
 101     }
 102
 103   /* Find out how many bytes there are in a null char in the target
 104      encoding */
 105   iconv_t bconv = iconv_open (tocode, "ASCII");
 106   if (bconv != (iconv_t) -1)
 107     {
 108       ICONV_CONST  char *nullstr = strdup ("");
 109       ICONV_CONST  char *outbuf = strdup ("XXXXXXXX");
 110       ICONV_CONST  char *snullstr = nullstr;
 111       ICONV_CONST  char *soutbuf = outbuf;
 112
 113       size_t inbytes = 1;
 114       const size_t bytes = 8;
 115       size_t outbytes = bytes;
 116       if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
 117         converter->null_char_width = bytes - outbytes;
 118       free (snullstr);
 119       free (soutbuf);
 120       iconv_close (bconv);
 121     }
 122
 123   hmapx_insert (&map, converter, hash);
 124
 125   return converter;
 126 }
 127
 128
 129 /* Converts the single byte C from encoding FROM to TO, returning the first
 130    byte of the result.
 131
 132    This function probably shouldn't be used at all, but some code still does
 133    use it. */
 134 char
 135 recode_byte (const char *to, const char *from, char c)
 136 {
 137   char x;
 138   char *s = recode_string (to, from, &c, 1);
 139   x = s[0];
 140   free (s);
 141   return x;
 142 }
 143
 144 /* Similar to recode_string_pool, but allocates the returned value on the heap
 145    instead of in a pool.  It is the caller's responsibility to free the
 146    returned value. */
 147 char *
 148 recode_string (const char *to, const char *from,
 149                const char *text, int length)
 150 {
 151   return recode_string_pool (to, from, text, length, NULL);
 152 }
 153
 154 /* Returns the length, in bytes, of the string that a similar recode_string()
 155    call would return. */
 156 size_t
 157 recode_string_len (const char *to, const char *from,
 158                    const char *text, int length)
 159 {
 160   char *s = recode_string (to, from, text, length);
 161   size_t len = strlen (s);
 162   free (s);
 163   return len;
 164 }
 165
 166 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 167    at OP, and appends a null terminator to the output.
 168
 169    Returns the output length if successful, -1 if the output buffer is too
 170    small. */
 171 static ssize_t
 172 try_recode (struct converter *cvtr, char fallbackchar,
 173             const char *in, size_t inbytes,
 174             char *out_, size_t outbytes)
 175 {
 176   char *out = out_;
 177   int i, j;
 178
 179   int null_bytes = cvtr->null_char_width;
 180
 181   /* Put the converter into the initial shift state, in case there was any
 182      state information left over from its last usage. */
 183   iconv (cvtr->conv, NULL, 0, NULL, 0);
 184
 185   /* Do two rounds of iconv() calls:
 186
 187      - The first round does the bulk of the conversion using the
 188      caller-supplied input data..
 189
 190      - The second round flushes any leftover output.  This has a real effect
 191      with input encodings that use combining diacritics, e.g. without the
 192      second round the last character tends to gets dropped when converting
 193      from windows-1258 to other encodings.
 194   */
 195   for (i = 0; i < 2; i++)
 196     {
 197       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 198       size_t *inbytesp = i ? NULL : &inbytes;
 199
 200       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 201         switch (errno)
 202           {
 203           case EINVAL:
 204             if (outbytes < null_bytes + 1)
 205               return -E2BIG;
 206             if (!fallbackchar)
 207               return -EINVAL;
 208             *out++ = fallbackchar;
 209             for (j = 0 ; j < null_bytes ; ++j)
 210               *out++ = '\0';
 211             return out - 1 - out_;
 212
 213           case EILSEQ:
 214             if (outbytes == 0)
 215               return -E2BIG;
 216             if (!fallbackchar)
 217               return -EILSEQ;
 218             *out++ = fallbackchar;
 219             outbytes--;
 220             if (inp)
 221               {
 222                 in++;
 223                 inbytes--;
 224               }
 225             break;
 226
 227           case E2BIG:
 228             return -E2BIG;
 229
 230           default:
 231             /* should never happen */
 232             fprintf (stderr, "Character conversion error: %s\n",
 233                      strerror (errno));
 234             NOT_REACHED ();
 235             break;
 236           }
 237     }
 238
 239   if (outbytes <= null_bytes - 1)
 240     return -E2BIG;
 241
 242   for (i = 0 ; i < null_bytes ; ++i)
 243     *out++ = '\0';
 244
 245   return out - 1 - out_;
 246 }
 247
 248 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 249    dynamically allocated string in TO-encoding.  Any characters which cannot be
 250    converted will be represented by '?'.
 251
 252    LENGTH should be the length of the string or -1, if null terminated.
 253
 254    The returned string will be allocated on POOL.
 255
 256    This function's behaviour differs from that of g_convert_with_fallback
 257    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 258    the input string is not valid in the declared input encoding.  This function
 259    however perseveres even in the presence of badly encoded input. */
 260 char *
 261 recode_string_pool (const char *to, const char *from,
 262                     const char *text, int length, struct pool *pool)
 263 {
 264   struct substring out;
 265
 266   if ( text == NULL )
 267     return NULL;
 268
 269   if ( length == -1 )
 270     length = strlen (text);
 271
 272   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 273   return out.string;
 274 }
 275
 276 /* Returns the name of the encoding that should be used for file names.
 277
 278    This is meant to be the same encoding used by g_filename_from_uri() and
 279    g_filename_to_uri() in GLib. */
 280 static const char *
 281 filename_encoding (void)
 282 {
 283 #if defined _WIN32 || defined __WIN32__
 284   return "UTF-8";
 285 #else
 286   return locale_charset ();
 287 #endif
 288 }
 289
 290 static char *
 291 xconcat2 (const char *a, size_t a_len,
 292           const char *b, size_t b_len)
 293 {
 294   char *s = xmalloc (a_len + b_len + 1);
 295   memcpy (s, a, a_len);
 296   memcpy (s + a_len, b, b_len);
 297   s[a_len + b_len] = '\0';
 298   return s;
 299 }
 300
 301 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 302    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 303    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 304    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 305    HEAD and tries again, repeating as necessary until the concatenated result
 306    fits or until HEAD_LEN reaches 0.
 307
 308    [*] Actually this function drops grapheme clusters instead of characters, so
 309    that, e.g. a Unicode character followed by a combining accent character
 310    is either completely included or completely excluded from HEAD_LEN.  See
 311    UAX #29 at http://unicode.org/reports/tr29/ for more information on
 312    grapheme clusters.
 313
 314    A null ENCODING is treated as UTF-8.
 315
 316    Sometimes this function has to actually construct the concatenated string to
 317    measure its length.  When this happens, it sets *RESULTP to that
 318    null-terminated string, allocated with malloc(), for the caller to use if it
 319    needs it.  Otherwise, it sets *RESULTP to NULL.
 320
 321    Simple examples for encoding="UTF-8", max_len=6:
 322
 323    head="abc",  tail="xyz"     => 3
 324    head="abcd", tail="xyz"     => 3 ("d" dropped).
 325    head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 326    head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 327
 328    Examples for encoding="ISO-8859-1", max_len=6:
 329
 330    head="éèä",  tail="xyz"     => 6
 331    (each letter in head is only 1 byte in ISO-8859-1 even though they
 332    each take 2 bytes in UTF-8 encoding)
 333 */
 334 static size_t
 335 utf8_encoding_concat__ (const char *head, size_t head_len,
 336                         const char *tail, size_t tail_len,
 337                         const char *encoding, size_t max_len,
 338                         char **resultp)
 339 {
 340   *resultp = NULL;
 341   if (head_len == 0)
 342     return 0;
 343   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 344     {
 345       if (head_len + tail_len <= max_len)
 346         return head_len;
 347       else if (tail_len >= max_len)
 348         return 0;
 349       else
 350         {
 351           size_t copy_len;
 352           ucs4_t prev;
 353           size_t ofs;
 354           int mblen;
 355
 356           copy_len = 0;
 357           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 358                                 head_len);
 359                ofs <= max_len - tail_len;
 360                ofs += mblen)
 361             {
 362               ucs4_t next;
 363
 364               mblen = u8_mbtouc (&next,
 365                                  CHAR_CAST (const uint8_t *, head + ofs),
 366                                  head_len - ofs);
 367               if (uc_is_grapheme_break (prev, next))
 368                 copy_len = ofs;
 369
 370               prev = next;
 371             }
 372
 373           return copy_len;
 374         }
 375     }
 376   else
 377     {
 378       char *result;
 379
 380       result = (tail_len > 0
 381                 ? xconcat2 (head, head_len, tail, tail_len)
 382                 : CONST_CAST (char *, head));
 383       if (recode_string_len (encoding, "UTF-8", result,
 384                              head_len + tail_len) <= max_len)
 385         {
 386           *resultp = result != head ? result : NULL;
 387           return head_len;
 388         }
 389       else
 390         {
 391           bool correct_result = false;
 392           size_t copy_len;
 393           ucs4_t prev;
 394           size_t ofs;
 395           int mblen;
 396
 397           copy_len = 0;
 398           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 399                                 head_len);
 400                ofs <= head_len;
 401                ofs += mblen)
 402             {
 403               ucs4_t next;
 404
 405               mblen = u8_mbtouc (&next,
 406                                  CHAR_CAST (const uint8_t *, head + ofs),
 407                                  head_len - ofs);
 408               if (uc_is_grapheme_break (prev, next))
 409                 {
 410                   if (result != head)
 411                     {
 412                       memcpy (result, head, ofs);
 413                       memcpy (result + ofs, tail, tail_len);
 414                       result[ofs + tail_len] = '\0';
 415                     }
 416
 417                   if (recode_string_len (encoding, "UTF-8", result,
 418                                          ofs + tail_len) <= max_len)
 419                     {
 420                       correct_result = true;
 421                       copy_len = ofs;
 422                     }
 423                   else
 424                     correct_result = false;
 425                 }
 426
 427               prev = next;
 428             }
 429
 430           if (result != head)
 431             {
 432               if (correct_result)
 433                 *resultp = result;
 434               else
 435                 free (result);
 436             }
 437
 438           return copy_len;
 439         }
 440     }
 441 }
 442
 443 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 444    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 445    string are all encoded in UTF-8.  As many characters[*] from the beginning
 446    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 447    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 448    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 449
 450    [*] Actually this function drops grapheme clusters instead of characters, so
 451    that, e.g. a Unicode character followed by a combining accent character
 452    is either completely included or completely excluded from the returned
 453    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 454    information on grapheme clusters.
 455
 456    A null ENCODING is treated as UTF-8.
 457
 458    Simple examples for encoding="UTF-8", max_len=6:
 459
 460    head="abc",  tail="xyz"     => "abcxyz"
 461    head="abcd", tail="xyz"     => "abcxyz"
 462    head="abc",  tail="uvwxyz"  => "uvwxyz"
 463    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 464
 465    Examples for encoding="ISO-8859-1", max_len=6:
 466
 467    head="éèä",  tail="xyz"    => "éèäxyz"
 468    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 469    each take 2 bytes in UTF-8 encoding)
 470 */
 471 char *
 472 utf8_encoding_concat (const char *head, const char *tail,
 473                       const char *encoding, size_t max_len)
 474 {
 475   size_t tail_len = strlen (tail);
 476   size_t prefix_len;
 477   char *result;
 478
 479   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 480                                        encoding, max_len, &result);
 481   return (result != NULL
 482           ? result
 483           : xconcat2 (head, prefix_len, tail, tail_len));
 484 }
 485
 486 /* Returns the length, in bytes, of the string that would be returned by
 487    utf8_encoding_concat() if passed the same arguments, but the implementation
 488    is often more efficient. */
 489 size_t
 490 utf8_encoding_concat_len (const char *head, const char *tail,
 491                           const char *encoding, size_t max_len)
 492 {
 493   size_t tail_len = strlen (tail);
 494   size_t prefix_len;
 495   char *result;
 496
 497   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 498                                        encoding, max_len, &result);
 499   free (result);
 500   return prefix_len + tail_len;
 501 }
 502
 503 /* Returns an allocated, null-terminated string, owned by the caller,
 504    containing as many characters[*] from the beginning of S that would fit
 505    within MAX_LEN bytes if the returned string were to be re-encoded in
 506    ENCODING.  Both S and the returned string are encoded in UTF-8.
 507
 508    [*] Actually this function drops grapheme clusters instead of characters, so
 509    that, e.g. a Unicode character followed by a combining accent character
 510    is either completely included or completely excluded from the returned
 511    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 512    information on grapheme clusters.
 513
 514    A null ENCODING is treated as UTF-8.
 515 */
 516 char *
 517 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 518 {
 519   return utf8_encoding_concat (s, "", encoding, max_len);
 520 }
 521
 522 /* Returns the length, in bytes, of the string that would be returned by
 523    utf8_encoding_trunc() if passed the same arguments, but the implementation
 524    is often more efficient. */
 525 size_t
 526 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 527 {
 528   return utf8_encoding_concat_len (s, "", encoding, max_len);
 529 }
 530
 531 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 532    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 533    current locale. */
 534 char *
 535 utf8_to_filename (const char *filename)
 536 {
 537   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 538 }
 539
 540 /* Returns FILENAME converted from the filename encoding to UTF-8.
 541    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 542    current locale. */
 543 char *
 544 filename_to_utf8 (const char *filename)
 545 {
 546   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 547 }
 548
 549 static int
 550 recode_substring_pool__ (const char *to, const char *from,
 551                          struct substring text, char fallbackchar,
 552                          struct pool *pool, struct substring *out)
 553 {
 554   size_t bufsize;
 555   struct converter *conv;
 556
 557   if (to == NULL)
 558     to = default_encoding;
 559
 560   if (from == NULL)
 561     from = default_encoding;
 562
 563   conv = create_iconv (to, from);
 564
 565   if ( NULL == conv )
 566     {
 567       if (fallbackchar)
 568         {
 569           out->string = pool_malloc (pool, text.length + 1);
 570           out->length = text.length;
 571           memcpy (out->string, text.string, text.length);
 572           out->string[out->length] = '\0';
 573           return 0;
 574         }
 575       else
 576         return EPROTO;
 577     }
 578
 579   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 580     {
 581       char *output = pool_malloc (pool, bufsize);
 582       ssize_t retval;
 583
 584       retval = try_recode (conv, fallbackchar, text.string, text.length,
 585                            output, bufsize);
 586       if (retval >= 0)
 587         {
 588           *out = ss_buffer (output, retval);
 589           return 0;
 590         }
 591       pool_free (pool, output);
 592
 593       if (retval != -E2BIG)
 594         return -retval;
 595     }
 596
 597   NOT_REACHED ();
 598 }
 599
 600 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 601    dynamically allocated string in TO-encoding.  Any characters which cannot be
 602    converted will be represented by '?'.
 603
 604    The returned string will be null-terminated and allocated on POOL with
 605    pool_malloc().
 606
 607    This function's behaviour differs from that of g_convert_with_fallback
 608    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 609    the input string is not valid in the declared input encoding.  This function
 610    however perseveres even in the presence of badly encoded input. */
 611 struct substring
 612 recode_substring_pool (const char *to, const char *from,
 613                        struct substring text, struct pool *pool)
 614 {
 615   struct substring out;
 616
 617   recode_substring_pool__ (to, from, text, '?', pool, &out);
 618   return out;
 619 }
 620
 621 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 622    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 623    converted null-terminated string, allocated from POOL with pool_malloc(), is
 624    stored in *OUT.  On failure, returns a positive errno value.
 625
 626    The function fails with an error if any part of the input string is not
 627    valid in the declared input encoding. */
 628 int
 629 recode_pedantically (const char *to, const char *from,
 630                      struct substring text, struct pool *pool,
 631                      struct substring *out)
 632 {
 633   int error;
 634
 635   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 636   if (error)
 637     *out = ss_empty ();
 638   return error;
 639 }
 640 \f
 641 void
 642 i18n_init (void)
 643 {
 644   setlocale (LC_ALL, "");
 645   bindtextdomain (PACKAGE, relocate(locale_dir));
 646   textdomain (PACKAGE);
 647
 648   assert (default_encoding == NULL);
 649   default_encoding = xstrdup (locale_charset ());
 650
 651   hmapx_init (&map);
 652 }
 653
 654 const char *
 655 get_default_encoding (void)
 656 {
 657   return default_encoding;
 658 }
 659
 660 void
 661 set_default_encoding (const char *enc)
 662 {
 663   free (default_encoding);
 664   default_encoding = xstrdup (enc);
 665 }
 666
 667
 668 /* Attempts to set the encoding from a locale name
 669    returns true if successfull.
 670    This function does not (should not!) alter the current locale.
 671 */
 672 bool
 673 set_encoding_from_locale (const char *loc)
 674 {
 675   bool ok = true;
 676   char *c_encoding;
 677   char *loc_encoding;
 678   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 679
 680   setlocale (LC_CTYPE, "C");
 681   c_encoding = xstrdup (locale_charset ());
 682
 683   setlocale (LC_CTYPE, loc);
 684   loc_encoding = xstrdup (locale_charset ());
 685
 686
 687   if ( 0 == strcmp (loc_encoding, c_encoding))
 688     {
 689       ok = false;
 690     }
 691
 692   setlocale (LC_CTYPE, tmp);
 693
 694   free (tmp);
 695
 696   if (ok)
 697     {
 698       free (default_encoding);
 699       default_encoding = loc_encoding;
 700     }
 701   else
 702     free (loc_encoding);
 703
 704   free (c_encoding);
 705
 706   return ok;
 707 }
 708
 709 void
 710 i18n_done (void)
 711 {
 712   struct hmapx_node *node;
 713   struct converter *cvtr;
 714
 715   HMAPX_FOR_EACH (cvtr, node, &map)
 716     {
 717       if (cvtr == NULL)
 718         continue;
 719       free (cvtr->tocode);
 720       free (cvtr->fromcode);
 721       if (cvtr->conv != (iconv_t) -1)
 722         iconv_close (cvtr->conv);
 723       free (cvtr);
 724     }
 725
 726   hmapx_destroy (&map);
 727
 728   free (default_encoding);
 729   default_encoding = NULL;
 730 }
 731
 732
 733
 734 bool
 735 valid_encoding (const char *enc)
 736 {
 737   iconv_t conv = iconv_open (UTF8, enc);
 738
 739   if ( conv == (iconv_t) -1)
 740     return false;
 741
 742   iconv_close (conv);
 743
 744   return true;
 745 }
 746
 747
 748 /* Return the system local's idea of the
 749    decimal seperator character */
 750 char
 751 get_system_decimal (void)
 752 {
 753   char radix_char;
 754
 755 #if HAVE_NL_LANGINFO
 756   radix_char = nl_langinfo (RADIXCHAR)[0];
 757 #else
 758   {
 759     char buf[10];
 760     snprintf (buf, sizeof buf, "%f", 2.5);
 761     radix_char = buf[1];
 762   }
 763 #endif
 764
 765   return radix_char;
 766 }
 767
 768 const char *
 769 uc_name (ucs4_t uc, char buffer[16])
 770 {
 771   if (uc >= 0x20 && uc < 0x7f)
 772     snprintf (buffer, 16, "`%c'", uc);
 773   else
 774     snprintf (buffer, 16, "U+%04X", uc);
 775   return buffer;
 776 }
 777 \f
 778 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 779
 780 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 781    with lowercase and uppercase letters treated as equal, starting from
 782    BASIS. */
 783 unsigned int
 784 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 785 {
 786   uint8_t folded_buf[2048];
 787   size_t folded_len = sizeof folded_buf;
 788   uint8_t *folded_s;
 789   unsigned int hash;
 790
 791   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 792                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 793   if (folded_s != NULL)
 794     {
 795       hash = hash_bytes (folded_s, folded_len, basis);
 796       if (folded_s != folded_buf)
 797         free (folded_s);
 798     }
 799   else
 800     {
 801       if (errno == ENOMEM)
 802         xalloc_die ();
 803       hash = hash_bytes (s, n, basis);
 804     }
 805
 806   return hash;
 807 }
 808
 809 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 810    uppercase letters treated as equal, starting from BASIS. */
 811 unsigned int
 812 utf8_hash_case_string (const char *s, unsigned int basis)
 813 {
 814   return utf8_hash_case_bytes (s, strlen (s), basis);
 815 }
 816
 817 /* Compares UTF-8 strings A and B case-insensitively.
 818    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 819 int
 820 utf8_strcasecmp (const char *a, const char *b)
 821 {
 822   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 823 }
 824
 825 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 826    case-insensitively.
 827    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 828 int
 829 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 830 {
 831   int result;
 832
 833   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 834                   CHAR_CAST (const uint8_t *, b), bn,
 835                   NULL, UNINORM_NFKD, &result))
 836     {
 837       if (errno == ENOMEM)
 838         xalloc_die ();
 839
 840       result = memcmp (a, b, MIN (an, bn));
 841       if (result == 0)
 842         result = an < bn ? -1 : an > bn;
 843     }
 844
 845   return result;
 846 }
 847
 848 static char *
 849 utf8_casemap (const char *s,
 850               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 851                              uint8_t *, size_t *))
 852 {
 853   char *result;
 854   size_t size;
 855
 856   result = CHAR_CAST (char *,
 857                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 858                          NULL, NULL, NULL, &size));
 859   if (result == NULL)
 860     {
 861       if (errno == ENOMEM)
 862         xalloc_die ();
 863
 864       result = xstrdup (s);
 865     }
 866   return result;
 867 }
 868
 869 char *
 870 utf8_to_upper (const char *s)
 871 {
 872   return utf8_casemap (s, u8_toupper);
 873 }
 874
 875 char *
 876 utf8_to_lower (const char *s)
 877 {
 878   return utf8_casemap (s, u8_tolower);
 879 }
 880 \f
 881 bool
 882 get_encoding_info (struct encoding_info *e, const char *name)
 883 {
 884   const struct substring in = SS_LITERAL_INITIALIZER (
 885                                                       "\t\n\v\f\r "
 886                                                       "!\"#$%&'()*+,-./0123456789:;<=>?@"
 887                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 888                                                       "abcdefghijklmnopqrstuvwxyz{|}~");
 889
 890   struct substring out, cr, lf, space;
 891   bool ok;
 892
 893   memset (e, 0, sizeof *e);
 894
 895   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 896   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 897   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
 898   ok = (cr.length >= 1
 899         && cr.length <= MAX_UNIT
 900         && cr.length == lf.length
 901         && cr.length == space.length);
 902   if (!ok)
 903     {
 904       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 905       ss_dealloc (&cr);
 906       ss_dealloc (&lf);
 907       ss_dealloc (&space);
 908       ss_alloc_substring (&cr, ss_cstr ("\r"));
 909       ss_alloc_substring (&lf, ss_cstr ("\n"));
 910       ss_alloc_substring (&space, ss_cstr (" "));
 911     }
 912
 913   e->unit = cr.length;
 914   memcpy (e->cr, cr.string, e->unit);
 915   memcpy (e->lf, lf.string, e->unit);
 916   memcpy (e->space, space.string, e->unit);
 917
 918   ss_dealloc (&cr);
 919   ss_dealloc (&lf);
 920   ss_dealloc (&space);
 921
 922   out = recode_substring_pool ("UTF-8", name, in, NULL);
 923   e->is_ascii_compatible = ss_equals (in, out);
 924   ss_dealloc (&out);
 925
 926   if (!e->is_ascii_compatible && e->unit == 1)
 927     {
 928       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
 929       e->is_ebcdic_compatible = (out.length == 1
 930                                  && (uint8_t) out.string[0] == 0xc1);
 931       ss_dealloc (&out);
 932     }
 933   else
 934     e->is_ebcdic_compatible = false;
 935
 936   return ok;
 937 }
 938
 939 bool
 940 is_encoding_ascii_compatible (const char *encoding)
 941 {
 942   struct encoding_info e;
 943
 944   get_encoding_info (&e, encoding);
 945   return e.is_ascii_compatible;
 946 }
 947
 948 bool
 949 is_encoding_ebcdic_compatible (const char *encoding)
 950 {
 951   struct encoding_info e;
 952
 953   get_encoding_info (&e, encoding);
 954   return e.is_ebcdic_compatible;
 955 }
 956
 957 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 958    otherwise false. */
 959 bool
 960 is_encoding_supported (const char *encoding)
 961 {
 962   return (create_iconv ("UTF-8", encoding)
 963           && create_iconv (encoding, "UTF-8"));
 964 }
 965
 966 /* Returns true if E is the name of a UTF-8 encoding.
 967
 968    XXX Possibly we should test not E as a string but its properties via
 969    iconv. */
 970 bool
 971 is_encoding_utf8 (const char *e)
 972 {
 973   return ((e[0] == 'u' || e[0] == 'U')
 974           && (e[1] == 't' || e[1] == 'T')
 975           && (e[2] == 'f' || e[2] == 'F')
 976           && ((e[3] == '8' && e[4] == '\0')
 977               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 978 }
 979 \f
 980 static struct encoding_category *categories;
 981 static int n_categories;
 982
 983 static void SENTINEL (0)
 984   add_category (size_t *allocated_categories, const char *category, ...)
 985 {
 986   struct encoding_category *c;
 987   const char *encodings[16];
 988   va_list args;
 989   int i, n;
 990
 991   /* Count encoding arguments. */
 992   va_start (args, category);
 993   n = 0;
 994   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 995     {
 996       const char *encoding = encodings[n];
 997       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 998         n++;
 999     }
1000   assert (n < sizeof encodings / sizeof *encodings);
1001   va_end (args);
1002
1003   if (n == 0)
1004     return;
1005
1006   if (n_categories >= *allocated_categories)
1007     categories = x2nrealloc (categories,
1008                              allocated_categories, sizeof *categories);
1009
1010   c = &categories[n_categories++];
1011   c->category = category;
1012   c->encodings = xmalloc (n * sizeof *c->encodings);
1013   for (i = 0; i < n; i++)
1014     c->encodings[i] = encodings[i];
1015   c->n_encodings = n;
1016 }
1017
1018 static void
1019 init_encoding_categories (void)
1020 {
1021   static bool inited;
1022   size_t alloc;
1023
1024   if (inited)
1025     return;
1026   inited = true;
1027
1028   alloc = 0;
1029   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1030                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1031   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1032                 NULL_SENTINEL);
1033   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1034   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1035                 "Windows-1257", NULL_SENTINEL);
1036   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1037   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1038                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1039   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1040                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1041   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1042                 "EUC-TW", NULL_SENTINEL);
1043   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1044   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1045                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1046   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1047   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1048                 NULL_SENTINEL);
1049   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1050   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1051   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1052   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1053   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1054                 NULL_SENTINEL);
1055   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1056   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1057   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1058   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1059                 NULL_SENTINEL);
1060   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1061                 NULL_SENTINEL);
1062   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1063   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1064                 NULL_SENTINEL);
1065   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1066   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1067                 NULL_SENTINEL);
1068   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1069                 NULL_SENTINEL);
1070   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1071                 "Windows-1258", NULL_SENTINEL);
1072   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1073                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1074 }
1075
1076 /* Returns an array of "struct encoding_category" that contains only the
1077    categories and encodings that the system supports. */
1078 struct encoding_category *
1079 get_encoding_categories (void)
1080 {
1081   init_encoding_categories ();
1082   return categories;
1083 }
1084
1085 /* Returns the number of elements in the array returned by
1086    get_encoding_categories().  */
1087 size_t
1088 get_n_encoding_categories (void)
1089 {
1090   init_encoding_categories ();
1091   return n_categories;
1092 }