pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
   3    2016, 2021 Free Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  17
  18 #include <config.h>
  19
  20 #include "libpspp/i18n.h"
  21
  22 #include <assert.h>
  23 #include <errno.h>
  24 #include <iconv.h>
  25 #include <langinfo.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32 #include <uniwidth.h>
  33
  34 #include "libpspp/assertion.h"
  35 #include "libpspp/compiler.h"
  36 #include "libpspp/hmapx.h"
  37 #include "libpspp/hash-functions.h"
  38 #include "libpspp/misc.h"
  39 #include "libpspp/pool.h"
  40 #include "libpspp/str.h"
  41 #include "libpspp/version.h"
  42
  43 #include "gl/c-ctype.h"
  44 #include "gl/c-strcase.h"
  45 #include "gl/localcharset.h"
  46 #include <gl/localename.h>
  47 #include "gl/minmax.h"
  48 #include "gl/xalloc.h"
  49 #include "gl/relocatable.h"
  50 #include "gl/xstrndup.h"
  51
  52 #include "gettext.h"
  53 #define _(msgid) gettext (msgid)
  54
/* A cached iconv conversion descriptor for one (TOCODE, FROMCODE) pair, as
   created by create_iconv(). */
struct converter
{
  char *tocode;          /* Name of the target encoding (owned copy). */
  char *fromcode;        /* Name of the source encoding (owned copy). */
  iconv_t conv;          /* Descriptor returned by iconv_open(). */
  int null_char_width;   /* Bytes occupied by a null character in TOCODE. */
};

/* Encoding substituted when a caller passes a null encoding name. */
static char *default_encoding;

/* Cache of converters, keyed on a hash of both encoding names.  A null data
   pointer records a pair for which no converter could be created. */
static struct hmapx map;
  65
  66 /* A wrapper around iconv_open */
  67 static struct converter *
  68 create_iconv (const char* tocode, const char* fromcode, bool warn)
  69 {
  70   size_t hash;
  71   struct hmapx_node *node;
  72   struct converter *converter;
  73   assert (fromcode);
  74
  75   hash = hash_string (tocode, hash_string (fromcode, 0));
  76   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  77     {
  78       if (!converter)
  79         return NULL;
  80
  81       if (!strcmp (tocode, converter->tocode)
  82           && !strcmp (fromcode, converter->fromcode))
  83         return converter;
  84     }
  85
  86   converter = xmalloc (sizeof *converter);
  87   converter->tocode = xstrdup (tocode);
  88   converter->fromcode = xstrdup (fromcode);
  89   converter->conv = iconv_open (tocode, fromcode);
  90   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
  91   /* I don't think it's safe to translate this string or to use messaging
  92      as the converters have not yet been set up */
  93   if (error && strcmp (tocode, fromcode))
  94     {
  95       if (warn)
  96         fprintf (stderr,
  97                  "Warning: "
  98                  "cannot create a converter for `%s' to `%s': %s\n",
  99                  fromcode, tocode, strerror (error));
 100
 101       free (converter->tocode);
 102       free (converter->fromcode);
 103       free (converter);
 104
 105       hmapx_insert (&map, NULL, hash);
 106       return NULL;
 107     }
 108
 109   /* Find out how many bytes there are in a null char in the target
 110      encoding */
 111   iconv_t bconv = iconv_open (tocode, "ASCII");
 112   if (bconv != (iconv_t) -1)
 113     {
 114       ICONV_CONST char inbuf[1] = "";
 115       ICONV_CONST char *inptr = inbuf;
 116       size_t inbytes = sizeof inbuf;
 117
 118       char outbuf[8];
 119       char *outptr = outbuf;
 120       size_t outbytes = sizeof outbuf;
 121       if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
 122         converter->null_char_width = outptr - outbuf;
 123       iconv_close (bconv);
 124     }
 125
 126   hmapx_insert (&map, converter, hash);
 127
 128   return converter;
 129 }
 130
 131
 132 /* Converts the single byte C from encoding FROM to TO, returning the first
 133    byte of the result.
 134
 135    This function probably shouldn't be used at all, but some code still does
 136    use it. */
 137 char
 138 recode_byte (const char *to, const char *from, char c)
 139 {
 140   char x;
 141   char *s = recode_string (to, from, &c, 1);
 142   x = s[0];
 143   free (s);
 144   return x;
 145 }
 146
 147 /* Similar to recode_string_pool, but allocates the returned value on the heap
 148    instead of in a pool.  It is the caller's responsibility to free the
 149    returned value. */
 150 char *
 151 recode_string (const char *to, const char *from,
 152                const char *text, int length)
 153 {
 154   return recode_string_pool (to, from, text, length, NULL);
 155 }
 156
 157 /* Returns the length, in bytes, of the string that a similar recode_string()
 158    call would return. */
 159 size_t
 160 recode_string_len (const char *to, const char *from,
 161                    const char *text, int length)
 162 {
 163   char *s = recode_string (to, from, text, length);
 164   size_t len = strlen (s);
 165   free (s);
 166   return len;
 167 }
 168
 169 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 170    at OP, and appends a null terminator to the output.
 171
 172    Returns the output length if successful, -1 if the output buffer is too
 173    small. */
 174 static ssize_t
 175 try_recode (struct converter *cvtr, char fallbackchar,
 176             const char *in, size_t inbytes,
 177             char *out_, size_t outbytes)
 178 {
 179   char *out = out_;
 180   int i, j;
 181
 182   int null_bytes = cvtr->null_char_width;
 183
 184   /* Put the converter into the initial shift state, in case there was any
 185      state information left over from its last usage. */
 186   iconv (cvtr->conv, NULL, 0, NULL, 0);
 187
 188   /* Do two rounds of iconv() calls:
 189
 190      - The first round does the bulk of the conversion using the
 191      caller-supplied input data..
 192
 193      - The second round flushes any leftover output.  This has a real effect
 194      with input encodings that use combining diacritics, e.g. without the
 195      second round the last character tends to gets dropped when converting
 196      from windows-1258 to other encodings.
 197   */
 198   for (i = 0; i < 2; i++)
 199     {
 200       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 201       size_t *inbytesp = i ? NULL : &inbytes;
 202
 203       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 204         switch (errno)
 205           {
 206           case EINVAL:
 207             if (outbytes < null_bytes + 1)
 208               return -E2BIG;
 209             if (!fallbackchar)
 210               return -EINVAL;
 211             *out++ = fallbackchar;
 212             for (j = 0 ; j < null_bytes ; ++j)
 213               *out++ = '\0';
 214             return out - 1 - out_;
 215
 216           case EILSEQ:
 217             if (outbytes == 0)
 218               return -E2BIG;
 219             if (!fallbackchar)
 220               return -EILSEQ;
 221             *out++ = fallbackchar;
 222             outbytes--;
 223             if (inp)
 224               {
 225                 in++;
 226                 inbytes--;
 227               }
 228             break;
 229
 230           case E2BIG:
 231             return -E2BIG;
 232
 233           default:
 234             /* should never happen */
 235             fprintf (stderr, "Character conversion error: %s\n",
 236                      strerror (errno));
 237             NOT_REACHED ();
 238             break;
 239           }
 240     }
 241
 242   if (outbytes <= null_bytes - 1)
 243     return -E2BIG;
 244
 245   for (i = 0 ; i < null_bytes ; ++i)
 246     *out++ = '\0';
 247
 248   return out - 1 - out_;
 249 }
 250
 251 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 252    dynamically allocated string in TO-encoding.  Any characters which cannot be
 253    converted will be represented by '?'.
 254
 255    LENGTH should be the length of the string or -1, if null terminated.
 256
 257    The returned string will be allocated on POOL.
 258
 259    This function's behaviour differs from that of g_convert_with_fallback
 260    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 261    the input string is not valid in the declared input encoding.  This function
 262    however perseveres even in the presence of badly encoded input. */
 263 char *
 264 recode_string_pool (const char *to, const char *from,
 265                     const char *text, int length, struct pool *pool)
 266 {
 267   struct substring out;
 268
 269   if (text == NULL)
 270     return NULL;
 271
 272   if (length == -1)
 273     length = strlen (text);
 274
 275   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 276   return out.string;
 277 }
 278
 279 /* Returns the name of the encoding that should be used for file names.
 280
 281    This is meant to be the same encoding used by g_filename_from_uri() and
 282    g_filename_to_uri() in GLib. */
 283 static const char *
 284 filename_encoding (void)
 285 {
 286 #if defined _WIN32 || defined __WIN32__
 287   return "UTF-8";
 288 #else
 289   return locale_charset ();
 290 #endif
 291 }
 292
 293 static char *
 294 xconcat2 (const char *a, size_t a_len,
 295           const char *b, size_t b_len)
 296 {
 297   char *s = xmalloc (a_len + b_len + 1);
 298   memcpy (s, a, a_len);
 299   memcpy (s + a_len, b, b_len);
 300   s[a_len + b_len] = '\0';
 301   return s;
 302 }
 303
 304 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 305    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 306    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 307    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 308    HEAD and tries again, repeating as necessary until the concatenated result
 309    fits or until HEAD_LEN reaches 0.
 310
 311    [*] Actually this function drops grapheme clusters instead of characters, so
 312    that, e.g. a Unicode character followed by a combining accent character
 313    is either completely included or completely excluded from HEAD_LEN.  See
 314    UAX #29 at http://unicode.org/reports/tr29/ for more information on
 315    grapheme clusters.
 316
 317    A null ENCODING is treated as UTF-8.
 318
 319    Sometimes this function has to actually construct the concatenated string to
 320    measure its length.  When this happens, it sets *RESULTP to that
 321    null-terminated string, allocated with malloc(), for the caller to use if it
 322    needs it.  Otherwise, it sets *RESULTP to NULL.
 323
 324    Simple examples for encoding="UTF-8", max_len=6:
 325
 326    head="abc",  tail="xyz"     => 3
 327    head="abcd", tail="xyz"     => 3 ("d" dropped).
 328    head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 329    head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 330
 331    Examples for encoding="ISO-8859-1", max_len=6:
 332
 333    head="éèä",  tail="xyz"     => 6
 334    (each letter in head is only 1 byte in ISO-8859-1 even though they
 335    each take 2 bytes in UTF-8 encoding)
 336 */
 337 static size_t
 338 utf8_encoding_concat__ (const char *head, size_t head_len,
 339                         const char *tail, size_t tail_len,
 340                         const char *encoding, size_t max_len,
 341                         char **resultp)
 342 {
 343   *resultp = NULL;
 344   if (head_len == 0)
 345     return 0;
 346   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 347     {
 348       if (head_len + tail_len <= max_len)
 349         return head_len;
 350       else if (tail_len >= max_len)
 351         return 0;
 352       else
 353         {
 354           size_t copy_len;
 355           ucs4_t prev;
 356           size_t ofs;
 357           int mblen;
 358
 359           copy_len = 0;
 360           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 361                                 head_len);
 362                ofs <= max_len - tail_len;
 363                ofs += mblen)
 364             {
 365               ucs4_t next;
 366
 367               mblen = u8_mbtouc (&next,
 368                                  CHAR_CAST (const uint8_t *, head + ofs),
 369                                  head_len - ofs);
 370               if (uc_is_grapheme_break (prev, next))
 371                 copy_len = ofs;
 372
 373               prev = next;
 374             }
 375
 376           return copy_len;
 377         }
 378     }
 379   else
 380     {
 381       char *result;
 382
 383       result = (tail_len > 0
 384                 ? xconcat2 (head, head_len, tail, tail_len)
 385                 : CONST_CAST (char *, head));
 386       if (recode_string_len (encoding, "UTF-8", result,
 387                              head_len + tail_len) <= max_len)
 388         {
 389           *resultp = result != head ? result : NULL;
 390           return head_len;
 391         }
 392       else
 393         {
 394           bool correct_result = false;
 395           size_t copy_len;
 396           ucs4_t prev;
 397           size_t ofs;
 398           int mblen;
 399
 400           copy_len = 0;
 401           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 402                                 head_len);
 403                ofs <= head_len;
 404                ofs += mblen)
 405             {
 406               ucs4_t next;
 407
 408               mblen = u8_mbtouc (&next,
 409                                  CHAR_CAST (const uint8_t *, head + ofs),
 410                                  head_len - ofs);
 411               if (uc_is_grapheme_break (prev, next))
 412                 {
 413                   if (result != head)
 414                     {
 415                       memcpy (result, head, ofs);
 416                       memcpy (result + ofs, tail, tail_len);
 417                       result[ofs + tail_len] = '\0';
 418                     }
 419
 420                   if (recode_string_len (encoding, "UTF-8", result,
 421                                          ofs + tail_len) <= max_len)
 422                     {
 423                       correct_result = true;
 424                       copy_len = ofs;
 425                     }
 426                   else
 427                     correct_result = false;
 428                 }
 429
 430               prev = next;
 431             }
 432
 433           if (result != head)
 434             {
 435               if (correct_result)
 436                 *resultp = result;
 437               else
 438                 free (result);
 439             }
 440
 441           return copy_len;
 442         }
 443     }
 444 }
 445
 446 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 447    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 448    string are all encoded in UTF-8.  As many characters[*] from the beginning
 449    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 450    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 451    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 452
 453    [*] Actually this function drops grapheme clusters instead of characters, so
 454    that, e.g. a Unicode character followed by a combining accent character
 455    is either completely included or completely excluded from the returned
 456    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 457    information on grapheme clusters.
 458
 459    A null ENCODING is treated as UTF-8.
 460
 461    Simple examples for encoding="UTF-8", max_len=6:
 462
 463    head="abc",  tail="xyz"     => "abcxyz"
 464    head="abcd", tail="xyz"     => "abcxyz"
 465    head="abc",  tail="uvwxyz"  => "uvwxyz"
 466    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 467
 468    Examples for encoding="ISO-8859-1", max_len=6:
 469
 470    head="éèä",  tail="xyz"    => "éèäxyz"
 471    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 472    each take 2 bytes in UTF-8 encoding)
 473 */
 474 char *
 475 utf8_encoding_concat (const char *head, const char *tail,
 476                       const char *encoding, size_t max_len)
 477 {
 478   size_t tail_len = strlen (tail);
 479   size_t prefix_len;
 480   char *result;
 481
 482   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 483                                        encoding, max_len, &result);
 484   return (result != NULL
 485           ? result
 486           : xconcat2 (head, prefix_len, tail, tail_len));
 487 }
 488
 489 /* Returns the length, in bytes, of the string that would be returned by
 490    utf8_encoding_concat() if passed the same arguments, but the implementation
 491    is often more efficient. */
 492 size_t
 493 utf8_encoding_concat_len (const char *head, const char *tail,
 494                           const char *encoding, size_t max_len)
 495 {
 496   size_t tail_len = strlen (tail);
 497   size_t prefix_len;
 498   char *result;
 499
 500   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 501                                        encoding, max_len, &result);
 502   free (result);
 503   return prefix_len + tail_len;
 504 }
 505
 506 /* Returns the number of display columns that would be occupied by the LENGTH
 507    bytes of UTF-8 starting at S. */
 508 size_t
 509 utf8_count_columns (const char *s_, size_t length)
 510 {
 511   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 512
 513   size_t columns = 0;
 514   for (int ofs = 0; ofs < length; )
 515     {
 516       ucs4_t uc;
 517       ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
 518       if (uc != '\t')
 519         {
 520           int width = uc_width (uc, "UTF-8");
 521           if (width > 0)
 522             columns += width;
 523         }
 524       else
 525         columns = ROUND_UP (columns + 1, 8);
 526     }
 527   return columns;
 528 }
 529
 530 /* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
 531    display columns into the string. */
 532 size_t
 533 utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
 534 {
 535   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 536
 537   size_t columns = 0;
 538   int ofs;
 539   for (ofs = 0; ofs < length && columns < n_columns; )
 540     {
 541       ucs4_t uc;
 542       ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
 543       if (uc != '\t')
 544         {
 545           int width = uc_width (uc, "UTF-8");
 546           if (width > 0)
 547             columns += width;
 548         }
 549       else
 550         columns = ROUND_UP (columns + 1, 8);
 551     }
 552   return ofs;
 553 }
 554
 555 /* Returns an allocated, null-terminated string, owned by the caller,
 556    containing as many characters[*] from the beginning of S that would fit
 557    within MAX_LEN bytes if the returned string were to be re-encoded in
 558    ENCODING.  Both S and the returned string are encoded in UTF-8.
 559
 560    [*] Actually this function drops grapheme clusters instead of characters, so
 561    that, e.g. a Unicode character followed by a combining accent character
 562    is either completely included or completely excluded from the returned
 563    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 564    information on grapheme clusters.
 565
 566    A null ENCODING is treated as UTF-8.
 567 */
 568 char *
 569 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 570 {
 571   return utf8_encoding_concat (s, "", encoding, max_len);
 572 }
 573
 574 /* Returns the length, in bytes, of the string that would be returned by
 575    utf8_encoding_trunc() if passed the same arguments, but the implementation
 576    is often more efficient. */
 577 size_t
 578 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 579 {
 580   return utf8_encoding_concat_len (s, "", encoding, max_len);
 581 }
 582
 583 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 584    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 585    current locale. */
 586 char *
 587 utf8_to_filename (const char *filename)
 588 {
 589   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 590 }
 591
 592 /* Returns FILENAME converted from the filename encoding to UTF-8.
 593    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 594    current locale. */
 595 char *
 596 filename_to_utf8 (const char *filename)
 597 {
 598   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 599 }
 600
 601 static int
 602 recode_substring_pool__ (const char *to, const char *from,
 603                          struct substring text, char fallbackchar,
 604                          struct pool *pool, struct substring *out)
 605 {
 606   size_t bufsize;
 607   struct converter *conv;
 608
 609   if (to == NULL)
 610     to = default_encoding;
 611
 612   if (from == NULL)
 613     from = default_encoding;
 614
 615   conv = create_iconv (to, from, true);
 616
 617   if (NULL == conv)
 618     {
 619       if (fallbackchar)
 620         {
 621           out->string = pool_malloc (pool, text.length + 1);
 622           out->length = text.length;
 623           memcpy (out->string, text.string, text.length);
 624           out->string[out->length] = '\0';
 625           return 0;
 626         }
 627       else
 628         return EPROTO;
 629     }
 630
 631   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 632     {
 633       char *output = pool_malloc (pool, bufsize);
 634       ssize_t retval;
 635
 636       retval = try_recode (conv, fallbackchar, text.string, text.length,
 637                            output, bufsize);
 638       if (retval >= 0)
 639         {
 640           *out = ss_buffer (output, retval);
 641           return 0;
 642         }
 643       pool_free (pool, output);
 644
 645       if (retval != -E2BIG)
 646         return -retval;
 647     }
 648
 649   NOT_REACHED ();
 650 }
 651
 652 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 653    dynamically allocated string in TO-encoding.  Any characters which cannot be
 654    converted will be represented by '?'.
 655
 656    The returned string will be null-terminated and allocated on POOL with
 657    pool_malloc().
 658
 659    This function's behaviour differs from that of g_convert_with_fallback
 660    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 661    the input string is not valid in the declared input encoding.  This function
 662    however perseveres even in the presence of badly encoded input. */
 663 struct substring
 664 recode_substring_pool (const char *to, const char *from,
 665                        struct substring text, struct pool *pool)
 666 {
 667   struct substring out;
 668
 669   recode_substring_pool__ (to, from, text, '?', pool, &out);
 670   return out;
 671 }
 672
 673 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 674    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 675    converted null-terminated string, allocated from POOL with pool_malloc(), is
 676    stored in *OUT.  On failure, returns a positive errno value.
 677
 678    The function fails with an error if any part of the input string is not
 679    valid in the declared input encoding. */
 680 int
 681 recode_pedantically (const char *to, const char *from,
 682                      struct substring text, struct pool *pool,
 683                      struct substring *out)
 684 {
 685   int error;
 686
 687   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 688   if (error)
 689     *out = ss_empty ();
 690   return error;
 691 }
 692 \f
 693 void
 694 i18n_init (void)
 695 {
 696   setlocale (LC_ALL, "");
 697   char *allocated;
 698   bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
 699   free (allocated);
 700   textdomain (PACKAGE);
 701
 702   assert (default_encoding == NULL);
 703   default_encoding = xstrdup (locale_charset ());
 704
 705   hmapx_init (&map);
 706 }
 707
 708 const char *
 709 get_default_encoding (void)
 710 {
 711   return default_encoding;
 712 }
 713
 714 void
 715 set_default_encoding (const char *enc)
 716 {
 717   free (default_encoding);
 718   default_encoding = xstrdup (enc);
 719 }
 720
 721 /* Return the ISO two letter code for the current LC_MESSAGES
 722    locale category.  */
 723 char *
 724 get_language (void)
 725 {
 726   const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
 727   if (0 == strcmp (localename, "C"))
 728     return NULL;
 729   char *ln = xstrdup (localename);
 730   char *end = strchr (ln, '_');
 731   if (end)
 732     *end = '\0';
 733   return ln;
 734 }
 735
 736
 737 /* Attempts to set the encoding from a locale name
 738    returns true if successful.
 739    This function does not (should not!) alter the current locale.
 740 */
 741 bool
 742 set_encoding_from_locale (const char *loc)
 743 {
 744   bool ok = true;
 745   char *c_encoding;
 746   char *loc_encoding;
 747   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 748
 749   setlocale (LC_CTYPE, "C");
 750   c_encoding = xstrdup (locale_charset ());
 751
 752   setlocale (LC_CTYPE, loc);
 753   loc_encoding = xstrdup (locale_charset ());
 754
 755
 756   if (0 == strcmp (loc_encoding, c_encoding))
 757     {
 758       ok = false;
 759     }
 760
 761   setlocale (LC_CTYPE, tmp);
 762
 763   free (tmp);
 764
 765   if (ok)
 766     {
 767       free (default_encoding);
 768       default_encoding = loc_encoding;
 769     }
 770   else
 771     free (loc_encoding);
 772
 773   free (c_encoding);
 774
 775   return ok;
 776 }
 777
 778 void
 779 i18n_done (void)
 780 {
 781   struct hmapx_node *node;
 782   struct converter *cvtr;
 783
 784   HMAPX_FOR_EACH (cvtr, node, &map)
 785     {
 786       if (cvtr == NULL)
 787         continue;
 788       free (cvtr->tocode);
 789       free (cvtr->fromcode);
 790       if (cvtr->conv != (iconv_t) -1)
 791         iconv_close (cvtr->conv);
 792       free (cvtr);
 793     }
 794
 795   hmapx_destroy (&map);
 796
 797   free (default_encoding);
 798   default_encoding = NULL;
 799 }
 800
 801
 802
 803 bool
 804 valid_encoding (const char *enc)
 805 {
 806   iconv_t conv = iconv_open (UTF8, enc);
 807
 808   if (conv == (iconv_t) -1)
 809     return false;
 810
 811   iconv_close (conv);
 812
 813   return true;
 814 }
 815
 816
 817 /* Return the system local's idea of the
 818    decimal separator character */
 819 char
 820 get_system_decimal (void)
 821 {
 822   char radix_char;
 823
 824 #if HAVE_NL_LANGINFO
 825   radix_char = nl_langinfo (RADIXCHAR)[0];
 826 #else
 827   {
 828     char buf[10];
 829     snprintf (buf, sizeof buf, "%f", 2.5);
 830     radix_char = buf[1];
 831   }
 832 #endif
 833
 834   return radix_char;
 835 }
 836
 837 const char *
 838 uc_name (ucs4_t uc, char buffer[16])
 839 {
 840   if (uc >= 0x20 && uc < 0x7f)
 841     snprintf (buffer, 16, "`%c'", uc);
 842   else
 843     snprintf (buffer, 16, "U+%04X", uc);
 844   return buffer;
 845 }
 846 \f
 847 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 848
 849 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 850    with lowercase and uppercase letters treated as equal, starting from
 851    BASIS. */
 852 unsigned int
 853 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 854 {
 855   uint8_t folded_buf[2048];
 856   size_t folded_len = sizeof folded_buf;
 857   uint8_t *folded_s;
 858   unsigned int hash;
 859
 860   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 861                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 862   if (folded_s != NULL)
 863     {
 864       hash = hash_bytes (folded_s, folded_len, basis);
 865       if (folded_s != folded_buf)
 866         free (folded_s);
 867     }
 868   else
 869     {
 870       if (errno == ENOMEM)
 871         xalloc_die ();
 872       hash = hash_bytes (s, n, basis);
 873     }
 874
 875   return hash;
 876 }
 877
 878 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 879    uppercase letters treated as equal, starting from BASIS. */
 880 unsigned int
 881 utf8_hash_case_string (const char *s, unsigned int basis)
 882 {
 883   return utf8_hash_case_bytes (s, strlen (s), basis);
 884 }
 885
 886 /* Compares UTF-8 strings A and B case-insensitively.
 887    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 888 int
 889 utf8_strcasecmp (const char *a, const char *b)
 890 {
 891   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 892 }
 893
 894 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 895    case-insensitively.
 896    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 897 int
 898 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 899 {
 900   int result;
 901
 902   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 903                   CHAR_CAST (const uint8_t *, b), bn,
 904                   NULL, UNINORM_NFKD, &result))
 905     {
 906       if (errno == ENOMEM)
 907         xalloc_die ();
 908
 909       result = memcmp (a, b, MIN (an, bn));
 910       if (result == 0)
 911         result = an < bn ? -1 : an > bn;
 912     }
 913
 914   return result;
 915 }
 916
 917 static bool
 918 is_all_digits (const uint8_t *s, size_t len)
 919 {
 920   for (size_t i = 0; i < len; i++)
 921     if (!c_isdigit (s[i]))
 922       return false;
 923   return true;
 924 }
 925
 926 /* Compares UTF-8 strings A and B case-insensitively.  If the strings end in a
 927    number, then they are compared numerically.  Returns a negative value if A <
 928    B, zero if A == B, positive if A > B. */
 929 int
 930 utf8_strverscasecmp (const char *a, const char *b)
 931 {
 932   /* Normalize A. */
 933   uint8_t a_stub[64];
 934   size_t a_len = sizeof a_stub;
 935   uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
 936                                  UNINORM_NFKD, a_stub, &a_len);
 937
 938   /* Normalize B. */
 939   uint8_t b_stub[64];
 940   size_t b_len = sizeof b_stub;
 941   uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
 942                                  UNINORM_NFKD, b_stub, &b_len);
 943
 944   int result;
 945   if (!a_norm || !b_norm)
 946     {
 947       result = strcmp (a, b);
 948       goto exit;
 949     }
 950
 951   size_t len = MIN (a_len, b_len);
 952   for (size_t i = 0; i < len; i++)
 953     if (a_norm[i] != b_norm[i])
 954       {
 955         /* If both strings end in digits, compare them numerically. */
 956         if (is_all_digits (&a_norm[i], a_len - i)
 957             && is_all_digits (&b_norm[i], b_len - i))
 958           {
 959             /* Start by stripping leading zeros, since those don't matter for
 960                numerical comparison. */
 961             size_t ap, bp;
 962             for (ap = i; ap < a_len; ap++)
 963               if (a_norm[ap] != '0')
 964                 break;
 965             for (bp = i; bp < b_len; bp++)
 966               if (b_norm[bp] != '0')
 967                 break;
 968
 969             /* The number with more digits, if there is one, is larger. */
 970             size_t a_digits = a_len - ap;
 971             size_t b_digits = b_len - bp;
 972             if (a_digits != b_digits)
 973               result = a_digits > b_digits ? 1 : -1;
 974             else
 975               result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
 976           }
 977         else
 978           result = a_norm[i] > b_norm[i] ? 1 : -1;
 979         goto exit;
 980       }
 981   result = a_len < b_len ? -1 : a_len > b_len;
 982
 983 exit:
 984   if (a_norm != a_stub)
 985     free (a_norm);
 986   if (b_norm != b_stub)
 987     free (b_norm);
 988   return result;
 989 }
 990
 991 static char *
 992 utf8_casemap (const char *s,
 993               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 994                              uint8_t *, size_t *))
 995 {
 996   char *result;
 997   size_t size;
 998
 999   result = CHAR_CAST (char *,
1000                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
1001                          NULL, NULL, NULL, &size));
1002   if (result == NULL)
1003     {
1004       if (errno == ENOMEM)
1005         xalloc_die ();
1006
1007       result = xstrdup (s);
1008     }
1009   return result;
1010 }
1011
1012 char *
1013 utf8_to_upper (const char *s)
1014 {
1015   return utf8_casemap (s, u8_toupper);
1016 }
1017
1018 char *
1019 utf8_to_lower (const char *s)
1020 {
1021   return utf8_casemap (s, u8_tolower);
1022 }
1023
1024 char *
1025 utf8_to_title (const char *s)
1026 {
1027   return utf8_casemap (s, u8_totitle);
1028 }
1029 \f
1030 bool
1031 get_encoding_info (struct encoding_info *e, const char *name)
1032 {
1033   const struct substring in = SS_LITERAL_INITIALIZER (
1034                                                       "\t\n\v\f\r "
1035                                                       "!\"#$%&'()*+,-./0123456789:;<=>?@"
1036                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
1037                                                       "abcdefghijklmnopqrstuvwxyz{|}~");
1038
1039   struct substring out, cr, lf, space;
1040   bool ok;
1041
1042   memset (e, 0, sizeof *e);
1043
1044   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
1045   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
1046   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
1047   ok = (cr.length >= 1
1048         && cr.length <= MAX_UNIT
1049         && cr.length == lf.length
1050         && cr.length == space.length);
1051   if (!ok)
1052     {
1053       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1054       ss_dealloc (&cr);
1055       ss_dealloc (&lf);
1056       ss_dealloc (&space);
1057       ss_alloc_substring (&cr, ss_cstr ("\r"));
1058       ss_alloc_substring (&lf, ss_cstr ("\n"));
1059       ss_alloc_substring (&space, ss_cstr (" "));
1060     }
1061
1062   e->unit = cr.length;
1063   memcpy (e->cr, cr.string, e->unit);
1064   memcpy (e->lf, lf.string, e->unit);
1065   memcpy (e->space, space.string, e->unit);
1066
1067   ss_dealloc (&cr);
1068   ss_dealloc (&lf);
1069   ss_dealloc (&space);
1070
1071   out = recode_substring_pool ("UTF-8", name, in, NULL);
1072   e->is_ascii_compatible = ss_equals (in, out);
1073   ss_dealloc (&out);
1074
1075   if (!e->is_ascii_compatible && e->unit == 1)
1076     {
1077       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1078       e->is_ebcdic_compatible = (out.length == 1
1079                                  && (uint8_t) out.string[0] == 0xc1);
1080       ss_dealloc (&out);
1081     }
1082   else
1083     e->is_ebcdic_compatible = false;
1084
1085   return ok;
1086 }
1087
1088 bool
1089 is_encoding_ascii_compatible (const char *encoding)
1090 {
1091   struct encoding_info e;
1092
1093   get_encoding_info (&e, encoding);
1094   return e.is_ascii_compatible;
1095 }
1096
1097 bool
1098 is_encoding_ebcdic_compatible (const char *encoding)
1099 {
1100   struct encoding_info e;
1101
1102   get_encoding_info (&e, encoding);
1103   return e.is_ebcdic_compatible;
1104 }
1105
/* Reports whether ENCODING can be converted in both directions with respect
   to UTF-8: returns true only if create_iconv() succeeds for both orderings
   of ENCODING and "UTF-8", otherwise false.  The second ordering is not
   attempted if the first fails. */
bool
is_encoding_supported (const char *encoding)
{
  if (!create_iconv ("UTF-8", encoding, false))
    return false;
  if (!create_iconv (encoding, "UTF-8", false))
    return false;
  return true;
}
1114
/* Returns true if E names a UTF-8 encoding, that is, if E is exactly "UTF8"
   or "UTF-8" with the letters in any mix of upper and lower case.

   XXX Possibly we should test not E as a string but its properties via
   iconv. */
bool
is_encoding_utf8 (const char *e)
{
  /* Match the "utf" prefix one letter at a time, case-insensitively.  Each
     check stops at the first mismatch, so no byte past E's terminator is
     read. */
  if (e[0] != 'u' && e[0] != 'U')
    return false;
  if (e[1] != 't' && e[1] != 'T')
    return false;
  if (e[2] != 'f' && e[2] != 'F')
    return false;

  /* Allow a single optional hyphen before the final "8". */
  const char *rest = e + 3;
  if (*rest == '-')
    rest++;

  return rest[0] == '8' && rest[1] == '\0';
}
1128 \f
/* Array of encoding categories, appended to by add_category(), and the
   number of elements in it that are in use. */
static struct encoding_category *categories = NULL;
static int n_categories = 0;
1131
1132 static void SENTINEL (0)
1133   add_category (size_t *allocated_categories, const char *category, ...)
1134 {
1135   struct encoding_category *c;
1136   const char *encodings[16];
1137   va_list args;
1138   int i, n;
1139
1140   /* Count encoding arguments. */
1141   va_start (args, category);
1142   n = 0;
1143   while ((encodings[n] = va_arg (args, const char *)) != NULL)
1144     {
1145       const char *encoding = encodings[n];
1146       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1147         n++;
1148     }
1149   assert (n < sizeof encodings / sizeof *encodings);
1150   va_end (args);
1151
1152   if (n == 0)
1153     return;
1154
1155   if (n_categories >= *allocated_categories)
1156     categories = x2nrealloc (categories,
1157                              allocated_categories, sizeof *categories);
1158
1159   c = &categories[n_categories++];
1160   c->category = category;
1161   c->encodings = xmalloc (n * sizeof *c->encodings);
1162   for (i = 0; i < n; i++)
1163     c->encodings[i] = encodings[i];
1164   c->n_encodings = n;
1165 }
1166
1167 static void
1168 init_encoding_categories (void)
1169 {
1170   static bool inited;
1171   size_t alloc;
1172
1173   if (inited)
1174     return;
1175   inited = true;
1176
1177   alloc = 0;
1178   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1179                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1180   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1181                 NULL_SENTINEL);
1182   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1183   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1184                 "Windows-1257", NULL_SENTINEL);
1185   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1186   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1187                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1188   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1189                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1190   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1191                 "EUC-TW", NULL_SENTINEL);
1192   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1193   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1194                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1195   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1196   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1197                 NULL_SENTINEL);
1198   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1199   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1200   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1201   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1202   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1203                 NULL_SENTINEL);
1204   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1205   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1206   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1207   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1208                 NULL_SENTINEL);
1209   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1210                 NULL_SENTINEL);
1211   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1212   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1213                 NULL_SENTINEL);
1214   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1215   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1216                 NULL_SENTINEL);
1217   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1218                 NULL_SENTINEL);
1219   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1220                 "Windows-1258", NULL_SENTINEL);
1221   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1222                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1223 }
1224
1225 /* Returns an array of "struct encoding_category" that contains only the
1226    categories and encodings that the system supports. */
1227 struct encoding_category *
1228 get_encoding_categories (void)
1229 {
1230   init_encoding_categories ();
1231   return categories;
1232 }
1233
1234 /* Returns the number of elements in the array returned by
1235    get_encoding_categories().  */
1236 size_t
1237 get_n_encoding_categories (void)
1238 {
1239   init_encoding_categories ();
1240   return n_categories;
1241 }