pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
   3    2016, 2021 Free Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  17
  18 #include <config.h>
  19
  20 #include "libpspp/i18n.h"
  21
  22 #include <assert.h>
  23 #include <errno.h>
  24 #include <iconv.h>
  25 #include <langinfo.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unicase.h>
  31 #include <unigbrk.h>
  32 #include <uniwidth.h>
  33
  34 #include "libpspp/assertion.h"
  35 #include "libpspp/compiler.h"
  36 #include "libpspp/hmapx.h"
  37 #include "libpspp/hash-functions.h"
  38 #include "libpspp/misc.h"
  39 #include "libpspp/pool.h"
  40 #include "libpspp/str.h"
  41 #include "libpspp/version.h"
  42
  43 #include "gl/c-ctype.h"
  44 #include "gl/c-strcase.h"
  45 #include "gl/localcharset.h"
  46 #include <gl/localename.h>
  47 #include "gl/minmax.h"
  48 #include "gl/xalloc.h"
  49 #include "gl/relocatable.h"
  50 #include "gl/xstrndup.h"
  51
  52 #include "gettext.h"
  53 #define _(msgid) gettext (msgid)
  54
/* A cached iconv conversion descriptor, together with the names of the
   encodings that it was opened for. */
struct converter
{
  char *tocode;         /* Target encoding name (owned copy). */
  char *fromcode;       /* Source encoding name (owned copy). */
  iconv_t conv;         /* Descriptor obtained from iconv_open(). */
  int null_char_width;  /* Bytes occupied by a null character in TOCODE. */
};
  62
/* Encoding substituted when a caller passes a null encoding name.  Set up by
   i18n_init() and released by i18n_done(). */
static char *default_encoding;

/* Cache of converters, hashed on the (tocode, fromcode) pair.  A null entry
   records a pair for which no converter could be created. */
static struct hmapx map;
  65
  66 /* A wrapper around iconv_open */
  67 static struct converter *
  68 create_iconv (const char* tocode, const char* fromcode, bool warn)
  69 {
  70   size_t hash;
  71   struct hmapx_node *node;
  72   struct converter *converter;
  73   assert (fromcode);
  74
  75   hash = hash_string (tocode, hash_string (fromcode, 0));
  76   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  77     {
  78       if (!converter)
  79         return NULL;
  80
  81       if (!strcmp (tocode, converter->tocode)
  82           && !strcmp (fromcode, converter->fromcode))
  83         return converter;
  84     }
  85
  86   converter = xmalloc (sizeof *converter);
  87   converter->tocode = xstrdup (tocode);
  88   converter->fromcode = xstrdup (fromcode);
  89   converter->conv = iconv_open (tocode, fromcode);
  90   int error = converter->conv == (iconv_t) ~0 ? errno : 0;
  91   /* I don't think it's safe to translate this string or to use messaging
  92      as the converters have not yet been set up */
  93   if (error && strcmp (tocode, fromcode))
  94     {
  95       if (warn)
  96         fprintf (stderr,
  97                  "Warning: "
  98                  "cannot create a converter for `%s' to `%s': %s\n",
  99                  fromcode, tocode, strerror (error));
 100
 101       free (converter->tocode);
 102       free (converter->fromcode);
 103       free (converter);
 104
 105       hmapx_insert (&map, NULL, hash);
 106       return NULL;
 107     }
 108
 109   /* Find out how many bytes there are in a null char in the target
 110      encoding */
 111   iconv_t bconv = iconv_open (tocode, "ASCII");
 112   if (bconv != (iconv_t) -1)
 113     {
 114       ICONV_CONST char inbuf[1] = "";
 115       ICONV_CONST char *inptr = inbuf;
 116       size_t inbytes = sizeof inbuf;
 117
 118       char outbuf[8];
 119       char *outptr = outbuf;
 120       size_t outbytes = sizeof outbuf;
 121       if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
 122         converter->null_char_width = outptr - outbuf;
 123       iconv_close (bconv);
 124     }
 125
 126   hmapx_insert (&map, converter, hash);
 127
 128   return converter;
 129 }
 130
 131
 132 /* Converts the single byte C from encoding FROM to TO, returning the first
 133    byte of the result.
 134
 135    This function probably shouldn't be used at all, but some code still does
 136    use it. */
 137 char
 138 recode_byte (const char *to, const char *from, char c)
 139 {
 140   char x;
 141   char *s = recode_string (to, from, &c, 1);
 142   x = s[0];
 143   free (s);
 144   return x;
 145 }
 146
 147 /* Similar to recode_string_pool, but allocates the returned value on the heap
 148    instead of in a pool.  It is the caller's responsibility to free the
 149    returned value. */
 150 char *
 151 recode_string (const char *to, const char *from,
 152                const char *text, int length)
 153 {
 154   return recode_string_pool (to, from, text, length, NULL);
 155 }
 156
 157 /* Returns the length, in bytes, of the string that a similar recode_string()
 158    call would return. */
 159 size_t
 160 recode_string_len (const char *to, const char *from,
 161                    const char *text, int length)
 162 {
 163   char *s = recode_string (to, from, text, length);
 164   size_t len = strlen (s);
 165   free (s);
 166   return len;
 167 }
 168
 169 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 170    at OP, and appends a null terminator to the output.
 171
 172    Returns the output length if successful, -1 if the output buffer is too
 173    small. */
 174 static ssize_t
 175 try_recode (struct converter *cvtr, char fallbackchar,
 176             const char *in, size_t inbytes,
 177             char *out_, size_t outbytes)
 178 {
 179   char *out = out_;
 180   int i, j;
 181
 182   int null_bytes = cvtr->null_char_width;
 183
 184   /* Put the converter into the initial shift state, in case there was any
 185      state information left over from its last usage. */
 186   iconv (cvtr->conv, NULL, 0, NULL, 0);
 187
 188   /* Do two rounds of iconv() calls:
 189
 190      - The first round does the bulk of the conversion using the
 191      caller-supplied input data..
 192
 193      - The second round flushes any leftover output.  This has a real effect
 194      with input encodings that use combining diacritics, e.g. without the
 195      second round the last character tends to gets dropped when converting
 196      from windows-1258 to other encodings.
 197   */
 198   for (i = 0; i < 2; i++)
 199     {
 200       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 201       size_t *inbytesp = i ? NULL : &inbytes;
 202
 203       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 204         switch (errno)
 205           {
 206           case EINVAL:
 207             if (outbytes < null_bytes + 1)
 208               return -E2BIG;
 209             if (!fallbackchar)
 210               return -EINVAL;
 211             *out++ = fallbackchar;
 212             for (j = 0 ; j < null_bytes ; ++j)
 213               *out++ = '\0';
 214             return out - 1 - out_;
 215
 216           case EILSEQ:
 217             if (outbytes == 0)
 218               return -E2BIG;
 219             if (!fallbackchar)
 220               return -EILSEQ;
 221             *out++ = fallbackchar;
 222             outbytes--;
 223             if (inp)
 224               {
 225                 in++;
 226                 inbytes--;
 227               }
 228             break;
 229
 230           case E2BIG:
 231             return -E2BIG;
 232
 233           default:
 234             /* should never happen */
 235             fprintf (stderr, "Character conversion error: %s\n",
 236                      strerror (errno));
 237             NOT_REACHED ();
 238             break;
 239           }
 240     }
 241
 242   if (outbytes <= null_bytes - 1)
 243     return -E2BIG;
 244
 245   for (i = 0 ; i < null_bytes ; ++i)
 246     *out++ = '\0';
 247
 248   return out - 1 - out_;
 249 }
 250
 251 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 252    dynamically allocated string in TO-encoding.  Any characters which cannot be
 253    converted will be represented by '?'.
 254
 255    LENGTH should be the length of the string or -1, if null terminated.
 256
 257    The returned string will be allocated on POOL.
 258
 259    This function's behaviour differs from that of g_convert_with_fallback
 260    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 261    the input string is not valid in the declared input encoding.  This function
 262    however perseveres even in the presence of badly encoded input. */
 263 char *
 264 recode_string_pool (const char *to, const char *from,
 265                     const char *text, int length, struct pool *pool)
 266 {
 267   struct substring out;
 268
 269   if (text == NULL)
 270     return NULL;
 271
 272   if (length == -1)
 273     length = strlen (text);
 274
 275   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 276   return out.string;
 277 }
 278
 279 /* Returns the name of the encoding that should be used for file names.
 280
 281    This is meant to be the same encoding used by g_filename_from_uri() and
 282    g_filename_to_uri() in GLib. */
 283 static const char *
 284 filename_encoding (void)
 285 {
 286 #if defined _WIN32 || defined __WIN32__
 287   return "UTF-8";
 288 #else
 289   return locale_charset ();
 290 #endif
 291 }
 292
 293 static char *
 294 xconcat2 (const char *a, size_t a_len,
 295           const char *b, size_t b_len)
 296 {
 297   char *s = xmalloc (a_len + b_len + 1);
 298   memcpy (s, a, a_len);
 299   memcpy (s + a_len, b, b_len);
 300   s[a_len + b_len] = '\0';
 301   return s;
 302 }
 303
 304 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 305    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 306    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 307    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 308    HEAD and tries again, repeating as necessary until the concatenated result
 309    fits or until HEAD_LEN reaches 0.
 310
 311    [*] Actually this function drops grapheme clusters instead of characters, so
 312    that, e.g. a Unicode character followed by a combining accent character
 313    is either completely included or completely excluded from HEAD_LEN.  See
 314    UAX #29 at http://unicode.org/reports/tr29/ for more information on
 315    grapheme clusters.
 316
 317    A null ENCODING is treated as UTF-8.
 318
 319    Sometimes this function has to actually construct the concatenated string to
 320    measure its length.  When this happens, it sets *RESULTP to that
 321    null-terminated string, allocated with malloc(), for the caller to use if it
 322    needs it.  Otherwise, it sets *RESULTP to NULL.
 323
 324    Simple examples for encoding="UTF-8", max_len=6:
 325
 326    head="abc",  tail="xyz"     => 3
 327    head="abcd", tail="xyz"     => 3 ("d" dropped).
 328    head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 329    head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 330
 331    Examples for encoding="ISO-8859-1", max_len=6:
 332
 333    head="éèä",  tail="xyz"     => 6
 334    (each letter in head is only 1 byte in ISO-8859-1 even though they
 335    each take 2 bytes in UTF-8 encoding)
 336 */
 337 static size_t
 338 utf8_encoding_concat__ (const char *head, size_t head_len,
 339                         const char *tail, size_t tail_len,
 340                         const char *encoding, size_t max_len,
 341                         char **resultp)
 342 {
 343   *resultp = NULL;
 344   if (head_len == 0)
 345     return 0;
 346   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 347     {
 348       if (head_len + tail_len <= max_len)
 349         return head_len;
 350       else if (tail_len >= max_len)
 351         return 0;
 352       else
 353         {
 354           size_t copy_len;
 355           ucs4_t prev;
 356           size_t ofs;
 357           int mblen;
 358
 359           copy_len = 0;
 360           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 361                                 head_len);
 362                ofs <= max_len - tail_len;
 363                ofs += mblen)
 364             {
 365               ucs4_t next;
 366
 367               mblen = u8_mbtouc (&next,
 368                                  CHAR_CAST (const uint8_t *, head + ofs),
 369                                  head_len - ofs);
 370               if (uc_is_grapheme_break (prev, next))
 371                 copy_len = ofs;
 372
 373               prev = next;
 374             }
 375
 376           return copy_len;
 377         }
 378     }
 379   else
 380     {
 381       char *result;
 382
 383       result = (tail_len > 0
 384                 ? xconcat2 (head, head_len, tail, tail_len)
 385                 : CONST_CAST (char *, head));
 386       if (recode_string_len (encoding, "UTF-8", result,
 387                              head_len + tail_len) <= max_len)
 388         {
 389           *resultp = result != head ? result : NULL;
 390           return head_len;
 391         }
 392       else
 393         {
 394           bool correct_result = false;
 395           size_t copy_len;
 396           ucs4_t prev;
 397           size_t ofs;
 398           int mblen;
 399
 400           copy_len = 0;
 401           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 402                                 head_len);
 403                ofs <= head_len;
 404                ofs += mblen)
 405             {
 406               ucs4_t next;
 407
 408               mblen = u8_mbtouc (&next,
 409                                  CHAR_CAST (const uint8_t *, head + ofs),
 410                                  head_len - ofs);
 411               if (uc_is_grapheme_break (prev, next))
 412                 {
 413                   if (result != head)
 414                     {
 415                       memcpy (result, head, ofs);
 416                       memcpy (result + ofs, tail, tail_len);
 417                       result[ofs + tail_len] = '\0';
 418                     }
 419
 420                   if (recode_string_len (encoding, "UTF-8", result,
 421                                          ofs + tail_len) <= max_len)
 422                     {
 423                       correct_result = true;
 424                       copy_len = ofs;
 425                     }
 426                   else
 427                     correct_result = false;
 428                 }
 429
 430               prev = next;
 431             }
 432
 433           if (result != head)
 434             {
 435               if (correct_result)
 436                 *resultp = result;
 437               else
 438                 free (result);
 439             }
 440
 441           return copy_len;
 442         }
 443     }
 444 }
 445
 446 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 447    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 448    string are all encoded in UTF-8.  As many characters[*] from the beginning
 449    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 450    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 451    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 452
 453    [*] Actually this function drops grapheme clusters instead of characters, so
 454    that, e.g. a Unicode character followed by a combining accent character
 455    is either completely included or completely excluded from the returned
 456    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 457    information on grapheme clusters.
 458
 459    A null ENCODING is treated as UTF-8.
 460
 461    Simple examples for encoding="UTF-8", max_len=6:
 462
 463    head="abc",  tail="xyz"     => "abcxyz"
 464    head="abcd", tail="xyz"     => "abcxyz"
 465    head="abc",  tail="uvwxyz"  => "uvwxyz"
 466    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 467
 468    Examples for encoding="ISO-8859-1", max_len=6:
 469
 470    head="éèä",  tail="xyz"    => "éèäxyz"
 471    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 472    each take 2 bytes in UTF-8 encoding)
 473 */
 474 char *
 475 utf8_encoding_concat (const char *head, const char *tail,
 476                       const char *encoding, size_t max_len)
 477 {
 478   size_t tail_len = strlen (tail);
 479   size_t prefix_len;
 480   char *result;
 481
 482   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 483                                        encoding, max_len, &result);
 484   return (result != NULL
 485           ? result
 486           : xconcat2 (head, prefix_len, tail, tail_len));
 487 }
 488
 489 /* Returns the length, in bytes, of the string that would be returned by
 490    utf8_encoding_concat() if passed the same arguments, but the implementation
 491    is often more efficient. */
 492 size_t
 493 utf8_encoding_concat_len (const char *head, const char *tail,
 494                           const char *encoding, size_t max_len)
 495 {
 496   size_t tail_len = strlen (tail);
 497   size_t prefix_len;
 498   char *result;
 499
 500   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 501                                        encoding, max_len, &result);
 502   free (result);
 503   return prefix_len + tail_len;
 504 }
 505
 506 /* Returns the number of display columns that would be occupied by the LENGTH
 507    bytes of UTF-8 starting at S. */
 508 size_t
 509 utf8_count_columns (const char *s_, size_t length)
 510 {
 511   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 512
 513   size_t columns = 0;
 514   for (int ofs = 0; ofs < length; )
 515     {
 516       ucs4_t uc;
 517       ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
 518       if (uc != '\t')
 519         {
 520           int width = uc_width (uc, "UTF-8");
 521           if (width > 0)
 522             columns += width;
 523         }
 524       else
 525         columns = ROUND_UP (columns + 1, 8);
 526     }
 527   return columns;
 528 }
 529
 530 /* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
 531    display columns into the string. */
 532 size_t
 533 utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
 534 {
 535   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 536
 537   size_t columns = 0;
 538   int ofs;
 539   for (ofs = 0; ofs < length && columns < n_columns; )
 540     {
 541       ucs4_t uc;
 542       ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
 543       if (uc != '\t')
 544         {
 545           int width = uc_width (uc, "UTF-8");
 546           if (width > 0)
 547             columns += width;
 548         }
 549       else
 550         columns = ROUND_UP (columns + 1, 8);
 551     }
 552   return ofs;
 553 }
 554
 555 /* Returns an allocated, null-terminated string, owned by the caller,
 556    containing as many characters[*] from the beginning of S that would fit
 557    within MAX_LEN bytes if the returned string were to be re-encoded in
 558    ENCODING.  Both S and the returned string are encoded in UTF-8.
 559
 560    [*] Actually this function drops grapheme clusters instead of characters, so
 561    that, e.g. a Unicode character followed by a combining accent character
 562    is either completely included or completely excluded from the returned
 563    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 564    information on grapheme clusters.
 565
 566    A null ENCODING is treated as UTF-8.
 567 */
 568 char *
 569 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 570 {
 571   return utf8_encoding_concat (s, "", encoding, max_len);
 572 }
 573
 574 /* Returns the length, in bytes, of the string that would be returned by
 575    utf8_encoding_trunc() if passed the same arguments, but the implementation
 576    is often more efficient. */
 577 size_t
 578 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 579 {
 580   return utf8_encoding_concat_len (s, "", encoding, max_len);
 581 }
 582
 583 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 584    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 585    current locale. */
 586 char *
 587 utf8_to_filename (const char *filename)
 588 {
 589   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 590 }
 591
 592 /* Returns FILENAME converted from the filename encoding to UTF-8.
 593    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 594    current locale. */
 595 char *
 596 filename_to_utf8 (const char *filename)
 597 {
 598   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 599 }
 600
 601 static int
 602 recode_substring_pool__ (const char *to, const char *from,
 603                          struct substring text, char fallbackchar,
 604                          struct pool *pool, struct substring *out)
 605 {
 606   size_t bufsize;
 607   struct converter *conv;
 608
 609   if (to == NULL)
 610     to = default_encoding;
 611
 612   if (from == NULL)
 613     from = default_encoding;
 614
 615   conv = create_iconv (to, from, true);
 616
 617   if (NULL == conv)
 618     {
 619       if (fallbackchar)
 620         {
 621           out->string = pool_malloc (pool, text.length + 1);
 622           out->length = text.length;
 623           memcpy (out->string, text.string, text.length);
 624           out->string[out->length] = '\0';
 625           return 0;
 626         }
 627       else
 628         return EPROTO;
 629     }
 630
 631   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 632     {
 633       char *output = pool_malloc (pool, bufsize);
 634       ssize_t retval;
 635
 636       retval = try_recode (conv, fallbackchar, text.string, text.length,
 637                            output, bufsize);
 638       if (retval >= 0)
 639         {
 640           *out = ss_buffer (output, retval);
 641           return 0;
 642         }
 643       pool_free (pool, output);
 644
 645       if (retval != -E2BIG)
 646         return -retval;
 647     }
 648
 649   NOT_REACHED ();
 650 }
 651
 652 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 653    dynamically allocated string in TO-encoding.  Any characters which cannot be
 654    converted will be represented by '?'.
 655
 656    The returned string will be null-terminated and allocated on POOL with
 657    pool_malloc().
 658
 659    This function's behaviour differs from that of g_convert_with_fallback
 660    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 661    the input string is not valid in the declared input encoding.  This function
 662    however perseveres even in the presence of badly encoded input. */
 663 struct substring
 664 recode_substring_pool (const char *to, const char *from,
 665                        struct substring text, struct pool *pool)
 666 {
 667   struct substring out;
 668
 669   recode_substring_pool__ (to, from, text, '?', pool, &out);
 670   return out;
 671 }
 672
 673 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 674    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 675    converted null-terminated string, allocated from POOL with pool_malloc(), is
 676    stored in *OUT.  On failure, returns a positive errno value.
 677
 678    The function fails with an error if any part of the input string is not
 679    valid in the declared input encoding. */
 680 int
 681 recode_pedantically (const char *to, const char *from,
 682                      struct substring text, struct pool *pool,
 683                      struct substring *out)
 684 {
 685   int error;
 686
 687   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 688   if (error)
 689     *out = ss_empty ();
 690   return error;
 691 }
 692 \f
 693 void
 694 i18n_init (void)
 695 {
 696   setlocale (LC_ALL, "");
 697   char *allocated;
 698   bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
 699   free (allocated);
 700   textdomain (PACKAGE);
 701
 702   assert (default_encoding == NULL);
 703   default_encoding = xstrdup (locale_charset ());
 704
 705   hmapx_init (&map);
 706 }
 707
 708 const char *
 709 get_default_encoding (void)
 710 {
 711   return default_encoding;
 712 }
 713
 714 void
 715 set_default_encoding (const char *enc)
 716 {
 717   free (default_encoding);
 718   default_encoding = xstrdup (enc);
 719 }
 720
 721 /* Return the ISO two letter code for the current LC_MESSAGES
 722    locale category.  */
 723 char *
 724 get_language (void)
 725 {
 726   const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
 727   if (0 == strcmp (localename, "C"))
 728     return NULL;
 729   char *ln = xstrdup (localename);
 730   char *end = strchr (ln, '_');
 731   if (end)
 732     *end = '\0';
 733   return ln;
 734 }
 735
 736
 737 /* Attempts to set the encoding from a locale name
 738    returns true if successful.
 739    This function does not (should not!) alter the current locale.
 740 */
 741 bool
 742 set_encoding_from_locale (const char *loc)
 743 {
 744   bool ok = true;
 745   char *c_encoding;
 746   char *loc_encoding;
 747   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 748
 749   setlocale (LC_CTYPE, "C");
 750   c_encoding = xstrdup (locale_charset ());
 751
 752   setlocale (LC_CTYPE, loc);
 753   loc_encoding = xstrdup (locale_charset ());
 754
 755
 756   if (0 == strcmp (loc_encoding, c_encoding))
 757     {
 758       ok = false;
 759     }
 760
 761   setlocale (LC_CTYPE, tmp);
 762
 763   free (tmp);
 764
 765   if (ok)
 766     {
 767       free (default_encoding);
 768       default_encoding = loc_encoding;
 769     }
 770   else
 771     free (loc_encoding);
 772
 773   free (c_encoding);
 774
 775   return ok;
 776 }
 777
 778 void
 779 i18n_done (void)
 780 {
 781   struct hmapx_node *node;
 782   struct converter *cvtr;
 783
 784   HMAPX_FOR_EACH (cvtr, node, &map)
 785     {
 786       if (cvtr == NULL)
 787         continue;
 788       free (cvtr->tocode);
 789       free (cvtr->fromcode);
 790       if (cvtr->conv != (iconv_t) -1)
 791         iconv_close (cvtr->conv);
 792       free (cvtr);
 793     }
 794
 795   hmapx_destroy (&map);
 796
 797   free (default_encoding);
 798   default_encoding = NULL;
 799 }
 800
 801
 802
 803 bool
 804 valid_encoding (const char *enc)
 805 {
 806   iconv_t conv = iconv_open (UTF8, enc);
 807
 808   if (conv == (iconv_t) -1)
 809     return false;
 810
 811   iconv_close (conv);
 812
 813   return true;
 814 }
 815
 816
 817 /* Return the system local's idea of the
 818    decimal separator character */
 819 char
 820 get_system_decimal (void)
 821 {
 822   char radix_char;
 823
 824 #if HAVE_NL_LANGINFO
 825   radix_char = nl_langinfo (RADIXCHAR)[0];
 826 #else
 827   {
 828     char buf[10];
 829     snprintf (buf, sizeof buf, "%f", 2.5);
 830     radix_char = buf[1];
 831   }
 832 #endif
 833
 834   return radix_char;
 835 }
 836
 837 const char *
 838 uc_name (ucs4_t uc, char buffer[16])
 839 {
 840   if (uc >= 0x20 && uc < 0x7f)
 841     snprintf (buffer, 16, "`%c'", uc);
 842   else
 843     snprintf (buffer, 16, "U+%04X", uc);
 844   return buffer;
 845 }
 846 \f
 847 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 848
 849 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 850    with lowercase and uppercase letters treated as equal, starting from
 851    BASIS. */
 852 unsigned int
 853 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 854 {
 855   uint8_t folded_buf[2048];
 856   size_t folded_len = sizeof folded_buf;
 857   uint8_t *folded_s;
 858   unsigned int hash;
 859
 860   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 861                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 862   if (folded_s != NULL)
 863     {
 864       hash = hash_bytes (folded_s, folded_len, basis);
 865       if (folded_s != folded_buf)
 866         free (folded_s);
 867     }
 868   else
 869     {
 870       if (errno == ENOMEM)
 871         xalloc_die ();
 872       hash = hash_bytes (s, n, basis);
 873     }
 874
 875   return hash;
 876 }
 877
 878 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 879    uppercase letters treated as equal, starting from BASIS. */
 880 unsigned int
 881 utf8_hash_case_string (const char *s, unsigned int basis)
 882 {
 883   return utf8_hash_case_substring (ss_cstr (s), basis);
 884 }
 885
 886 /* Returns a hash value for UTF-8 string S, with lowercase and uppercase
 887    letters treated as equal, starting from BASIS. */
 888 unsigned int
 889 utf8_hash_case_substring (struct substring s, unsigned int basis)
 890 {
 891   return utf8_hash_case_bytes (s.string, s.length, basis);
 892 }
 893
 894 /* Compares UTF-8 strings A and B case-insensitively.
 895    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 896 int
 897 utf8_strcasecmp (const char *a, const char *b)
 898 {
 899   return utf8_sscasecmp (ss_cstr (a), ss_cstr (b));
 900 }
 901
 902 int
 903 utf8_sscasecmp (struct substring a, struct substring b)
 904 {
 905   return utf8_strncasecmp (a.string, a.length, b.string, b.length);
 906 }
 907
 908 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 909    case-insensitively.
 910    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 911 int
 912 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 913 {
 914   int result;
 915
 916   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 917                   CHAR_CAST (const uint8_t *, b), bn,
 918                   NULL, UNINORM_NFKD, &result))
 919     {
 920       if (errno == ENOMEM)
 921         xalloc_die ();
 922
 923       result = memcmp (a, b, MIN (an, bn));
 924       if (result == 0)
 925         result = an < bn ? -1 : an > bn;
 926     }
 927
 928   return result;
 929 }
 930
 931 static bool
 932 is_all_digits (const uint8_t *s, size_t len)
 933 {
 934   for (size_t i = 0; i < len; i++)
 935     if (!c_isdigit (s[i]))
 936       return false;
 937   return true;
 938 }
 939
 940 /* Compares UTF-8 strings A and B case-insensitively.  If the strings end in a
 941    number, then they are compared numerically.  Returns a negative value if A <
 942    B, zero if A == B, positive if A > B. */
 943 int
 944 utf8_strverscasecmp (const char *a, const char *b)
 945 {
 946   /* Normalize A. */
 947   uint8_t a_stub[64];
 948   size_t a_len = sizeof a_stub;
 949   uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
 950                                  UNINORM_NFKD, a_stub, &a_len);
 951
 952   /* Normalize B. */
 953   uint8_t b_stub[64];
 954   size_t b_len = sizeof b_stub;
 955   uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
 956                                  UNINORM_NFKD, b_stub, &b_len);
 957
 958   int result;
 959   if (!a_norm || !b_norm)
 960     {
 961       result = strcmp (a, b);
 962       goto exit;
 963     }
 964
 965   size_t len = MIN (a_len, b_len);
 966   for (size_t i = 0; i < len; i++)
 967     if (a_norm[i] != b_norm[i])
 968       {
 969         /* If both strings end in digits, compare them numerically. */
 970         if (is_all_digits (&a_norm[i], a_len - i)
 971             && is_all_digits (&b_norm[i], b_len - i))
 972           {
 973             /* Start by stripping leading zeros, since those don't matter for
 974                numerical comparison. */
 975             size_t ap, bp;
 976             for (ap = i; ap < a_len; ap++)
 977               if (a_norm[ap] != '0')
 978                 break;
 979             for (bp = i; bp < b_len; bp++)
 980               if (b_norm[bp] != '0')
 981                 break;
 982
 983             /* The number with more digits, if there is one, is larger. */
 984             size_t a_digits = a_len - ap;
 985             size_t b_digits = b_len - bp;
 986             if (a_digits != b_digits)
 987               result = a_digits > b_digits ? 1 : -1;
 988             else
 989               result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
 990           }
 991         else
 992           result = a_norm[i] > b_norm[i] ? 1 : -1;
 993         goto exit;
 994       }
 995   result = a_len < b_len ? -1 : a_len > b_len;
 996
 997 exit:
 998   if (a_norm != a_stub)
 999     free (a_norm);
1000   if (b_norm != b_stub)
1001     free (b_norm);
1002   return result;
1003 }
1004
1005 static char *
1006 utf8_casemap (const char *s,
1007               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
1008                              uint8_t *, size_t *))
1009 {
1010   char *result;
1011   size_t size;
1012
1013   result = CHAR_CAST (char *,
1014                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
1015                          NULL, NULL, NULL, &size));
1016   if (result == NULL)
1017     {
1018       if (errno == ENOMEM)
1019         xalloc_die ();
1020
1021       result = xstrdup (s);
1022     }
1023   return result;
1024 }
1025
1026 char *
1027 utf8_to_upper (const char *s)
1028 {
1029   return utf8_casemap (s, u8_toupper);
1030 }
1031
1032 char *
1033 utf8_to_lower (const char *s)
1034 {
1035   return utf8_casemap (s, u8_tolower);
1036 }
1037
1038 char *
1039 utf8_to_title (const char *s)
1040 {
1041   return utf8_casemap (s, u8_totitle);
1042 }
1043 \f
1044 bool
1045 get_encoding_info (struct encoding_info *e, const char *name)
1046 {
1047   const struct substring in = SS_LITERAL_INITIALIZER (
1048                                                       "\t\n\v\f\r "
1049                                                       "!\"#$%&'()*+,-./0123456789:;<=>?@"
1050                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
1051                                                       "abcdefghijklmnopqrstuvwxyz{|}~");
1052
1053   struct substring out, cr, lf, space;
1054   bool ok;
1055
1056   memset (e, 0, sizeof *e);
1057
1058   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
1059   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
1060   space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
1061   ok = (cr.length >= 1
1062         && cr.length <= MAX_UNIT
1063         && cr.length == lf.length
1064         && cr.length == space.length);
1065   if (!ok)
1066     {
1067       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1068       ss_dealloc (&cr);
1069       ss_dealloc (&lf);
1070       ss_dealloc (&space);
1071       ss_alloc_substring (&cr, ss_cstr ("\r"));
1072       ss_alloc_substring (&lf, ss_cstr ("\n"));
1073       ss_alloc_substring (&space, ss_cstr (" "));
1074     }
1075
1076   e->unit = cr.length;
1077   memcpy (e->cr, cr.string, e->unit);
1078   memcpy (e->lf, lf.string, e->unit);
1079   memcpy (e->space, space.string, e->unit);
1080
1081   ss_dealloc (&cr);
1082   ss_dealloc (&lf);
1083   ss_dealloc (&space);
1084
1085   out = recode_substring_pool ("UTF-8", name, in, NULL);
1086   e->is_ascii_compatible = ss_equals (in, out);
1087   ss_dealloc (&out);
1088
1089   if (!e->is_ascii_compatible && e->unit == 1)
1090     {
1091       out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1092       e->is_ebcdic_compatible = (out.length == 1
1093                                  && (uint8_t) out.string[0] == 0xc1);
1094       ss_dealloc (&out);
1095     }
1096   else
1097     e->is_ebcdic_compatible = false;
1098
1099   return ok;
1100 }
1101
1102 bool
1103 is_encoding_ascii_compatible (const char *encoding)
1104 {
1105   struct encoding_info e;
1106
1107   get_encoding_info (&e, encoding);
1108   return e.is_ascii_compatible;
1109 }
1110
1111 bool
1112 is_encoding_ebcdic_compatible (const char *encoding)
1113 {
1114   struct encoding_info e;
1115
1116   get_encoding_info (&e, encoding);
1117   return e.is_ebcdic_compatible;
1118 }
1119
/* Returns true if iconv can convert ENCODING to and from UTF-8,
   otherwise false.

   Both directions are probed with create_iconv(); the second probe is
   skipped if the first one fails. */
bool
is_encoding_supported (const char *encoding)
{
  if (!create_iconv ("UTF-8", encoding, false))
    return false;
  if (!create_iconv (encoding, "UTF-8", false))
    return false;
  return true;
}
1128
/* Returns true if E is the name of a UTF-8 encoding, that is, if E is
   "UTF8" or "UTF-8" with the letters in any mix of upper and lower case.

   XXX Possibly we should test not E as a string but its properties via
   iconv. */
bool
is_encoding_utf8 (const char *e)
{
  static const char lower[] = "utf";
  static const char upper[] = "UTF";

  /* Match the three-letter prefix in either case.  A null terminator
     matches neither spelling, so this never reads past the end of E. */
  for (size_t i = 0; lower[i] != '\0'; i++)
    if (e[i] != lower[i] && e[i] != upper[i])
      return false;

  /* An optional hyphen, then exactly "8". */
  const char *tail = e + 3;
  if (*tail == '-')
    tail++;
  return tail[0] == '8' && tail[1] == '\0';
}
1142 \f
1143 static struct encoding_category *categories;
1144 static int n_categories;
1145
1146 static void SENTINEL (0)
1147   add_category (size_t *allocated_categories, const char *category, ...)
1148 {
1149   struct encoding_category *c;
1150   const char *encodings[16];
1151   va_list args;
1152   int i, n;
1153
1154   /* Count encoding arguments. */
1155   va_start (args, category);
1156   n = 0;
1157   while ((encodings[n] = va_arg (args, const char *)) != NULL)
1158     {
1159       const char *encoding = encodings[n];
1160       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1161         n++;
1162     }
1163   assert (n < sizeof encodings / sizeof *encodings);
1164   va_end (args);
1165
1166   if (n == 0)
1167     return;
1168
1169   if (n_categories >= *allocated_categories)
1170     categories = x2nrealloc (categories,
1171                              allocated_categories, sizeof *categories);
1172
1173   c = &categories[n_categories++];
1174   c->category = category;
1175   c->encodings = xmalloc (n * sizeof *c->encodings);
1176   for (i = 0; i < n; i++)
1177     c->encodings[i] = encodings[i];
1178   c->n_encodings = n;
1179 }
1180
1181 static void
1182 init_encoding_categories (void)
1183 {
1184   static bool inited;
1185   size_t alloc;
1186
1187   if (inited)
1188     return;
1189   inited = true;
1190
1191   alloc = 0;
1192   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1193                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1194   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1195                 NULL_SENTINEL);
1196   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1197   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1198                 "Windows-1257", NULL_SENTINEL);
1199   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1200   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1201                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1202   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1203                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1204   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1205                 "EUC-TW", NULL_SENTINEL);
1206   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1207   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1208                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1209   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1210   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1211                 NULL_SENTINEL);
1212   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1213   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1214   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1215   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1216   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1217                 NULL_SENTINEL);
1218   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1219   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1220   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1221   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1222                 NULL_SENTINEL);
1223   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1224                 NULL_SENTINEL);
1225   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1226   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1227                 NULL_SENTINEL);
1228   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1229   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1230                 NULL_SENTINEL);
1231   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1232                 NULL_SENTINEL);
1233   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1234                 "Windows-1258", NULL_SENTINEL);
1235   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1236                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1237 }
1238
1239 /* Returns an array of "struct encoding_category" that contains only the
1240    categories and encodings that the system supports. */
1241 struct encoding_category *
1242 get_encoding_categories (void)
1243 {
1244   init_encoding_categories ();
1245   return categories;
1246 }
1247
1248 /* Returns the number of elements in the array returned by
1249    get_encoding_categories().  */
1250 size_t
1251 get_n_encoding_categories (void)
1252 {
1253   init_encoding_categories ();
1254   return n_categories;
1255 }