pintos-os.org Git - pspp/blob - src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <locale.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <unicase.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/compiler.h"
  34 #include "libpspp/hmapx.h"
  35 #include "libpspp/hash-functions.h"
  36 #include "libpspp/pool.h"
  37 #include "libpspp/str.h"
  38 #include "libpspp/version.h"
  39
  40 #include "gl/c-strcase.h"
  41 #include "gl/localcharset.h"
  42 #include "gl/minmax.h"
  43 #include "gl/xalloc.h"
  44 #include "gl/relocatable.h"
  45 #include "gl/xstrndup.h"
  46
  47 #include "gettext.h"
  48 #define _(msgid) gettext (msgid)
  49
/* A cached iconv conversion descriptor for one (TOCODE, FROMCODE) pair. */
struct converter
{
  char *tocode;         /* Name of the target encoding (owned copy). */
  char *fromcode;       /* Name of the source encoding (owned copy). */
  iconv_t conv;         /* Result of iconv_open(), possibly (iconv_t) -1. */
  int null_char_width;  /* Bytes in a null character encoded in TOCODE. */
};
  57
/* Encoding substituted when a caller passes a null encoding name. */
static char *default_encoding;
/* Cache of converters, keyed by a hash of the two encoding names.  A null
   entry records a pair for which no converter could be created. */
static struct hmapx map;
  60
  61 /* A wrapper around iconv_open */
  62 static struct converter *
  63 create_iconv (const char* tocode, const char* fromcode)
  64 {
  65   size_t hash;
  66   struct hmapx_node *node;
  67   struct converter *converter;
  68   assert (fromcode);
  69
  70   hash = hash_string (tocode, hash_string (fromcode, 0));
  71   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  72     {
  73       if (!converter)
  74         return NULL;
  75
  76       if (!strcmp (tocode, converter->tocode)
  77           && !strcmp (fromcode, converter->fromcode))
  78         return converter;
  79     }
  80
  81   converter = xmalloc (sizeof *converter);
  82   converter->tocode = xstrdup (tocode);
  83   converter->fromcode = xstrdup (fromcode);
  84   converter->conv = iconv_open (tocode, fromcode);
  85   int error = converter->conv == (iconv_t) -1 ? errno : 0;
  86   /* I don't think it's safe to translate this string or to use messaging
  87      as the converters have not yet been set up */
  88   if (error && strcmp (tocode, fromcode))
  89     {
  90       fprintf (stderr,
  91                "Warning: "
  92                "cannot create a converter for `%s' to `%s': %s\n",
  93                fromcode, tocode, strerror (error));
  94
  95       hmapx_insert (&map, NULL, hash);
  96       return NULL;
  97     }
  98
  99   /* Find out how many bytes there are in a null char in the target
 100      encoding */
 101   iconv_t bconv = iconv_open (tocode, "ASCII");
 102   if (bconv != (iconv_t) -1)
 103     {
 104       ICONV_CONST  char *nullstr = strdup ("");
 105       ICONV_CONST  char *outbuf = strdup ("XXXXXXXX");
 106       ICONV_CONST  char *snullstr = nullstr;
 107       ICONV_CONST  char *soutbuf = outbuf;
 108
 109       size_t inbytes = 1;
 110       const size_t bytes = 8;
 111       size_t outbytes = bytes;
 112       if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
 113         converter->null_char_width = bytes - outbytes;
 114       free (snullstr);
 115       free (soutbuf);
 116       iconv_close (bconv);
 117     }
 118
 119   hmapx_insert (&map, converter, hash);
 120
 121   return converter;
 122 }
 123
 124
 125 /* Converts the single byte C from encoding FROM to TO, returning the first
 126    byte of the result.
 127
 128    This function probably shouldn't be used at all, but some code still does
 129    use it. */
 130 char
 131 recode_byte (const char *to, const char *from, char c)
 132 {
 133   char x;
 134   char *s = recode_string (to, from, &c, 1);
 135   x = s[0];
 136   free (s);
 137   return x;
 138 }
 139
 140 /* Similar to recode_string_pool, but allocates the returned value on the heap
 141    instead of in a pool.  It is the caller's responsibility to free the
 142    returned value. */
 143 char *
 144 recode_string (const char *to, const char *from,
 145                const char *text, int length)
 146 {
 147   return recode_string_pool (to, from, text, length, NULL);
 148 }
 149
 150 /* Returns the length, in bytes, of the string that a similar recode_string()
 151    call would return. */
 152 size_t
 153 recode_string_len (const char *to, const char *from,
 154                    const char *text, int length)
 155 {
 156   char *s = recode_string (to, from, text, length);
 157   size_t len = strlen (s);
 158   free (s);
 159   return len;
 160 }
 161
 162 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 163    at OP, and appends a null terminator to the output.
 164
 165    Returns the output length if successful, -1 if the output buffer is too
 166    small. */
 167 static ssize_t
 168 try_recode (struct converter *cvtr, char fallbackchar,
 169             const char *in, size_t inbytes,
 170             char *out_, size_t outbytes)
 171 {
 172   char *out = out_;
 173   int i, j;
 174
 175   int null_bytes = cvtr->null_char_width;
 176
 177   /* Put the converter into the initial shift state, in case there was any
 178      state information left over from its last usage. */
 179   iconv (cvtr->conv, NULL, 0, NULL, 0);
 180
 181   /* Do two rounds of iconv() calls:
 182
 183      - The first round does the bulk of the conversion using the
 184      caller-supplied input data..
 185
 186      - The second round flushes any leftover output.  This has a real effect
 187      with input encodings that use combining diacritics, e.g. without the
 188      second round the last character tends to gets dropped when converting
 189      from windows-1258 to other encodings.
 190   */
 191   for (i = 0; i < 2; i++)
 192     {
 193       ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
 194       size_t *inbytesp = i ? NULL : &inbytes;
 195
 196       while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
 197         switch (errno)
 198           {
 199           case EINVAL:
 200             if (outbytes < null_bytes + 1)
 201               return -E2BIG;
 202             if (!fallbackchar)
 203               return -EINVAL;
 204             *out++ = fallbackchar;
 205             for (j = 0 ; j < null_bytes ; ++j)
 206               *out++ = '\0';
 207             return out - 1 - out_;
 208
 209           case EILSEQ:
 210             if (outbytes == 0)
 211               return -E2BIG;
 212             if (!fallbackchar)
 213               return -EILSEQ;
 214             *out++ = fallbackchar;
 215             outbytes--;
 216             if (inp)
 217               {
 218                 in++;
 219                 inbytes--;
 220               }
 221             break;
 222
 223           case E2BIG:
 224             return -E2BIG;
 225
 226           default:
 227             /* should never happen */
 228             fprintf (stderr, "Character conversion error: %s\n",
 229                      strerror (errno));
 230             NOT_REACHED ();
 231             break;
 232           }
 233     }
 234
 235   if (outbytes <= null_bytes - 1)
 236     return -E2BIG;
 237
 238   for (i = 0 ; i < null_bytes ; ++i)
 239     *out++ = '\0';
 240
 241   return out - 1 - out_;
 242 }
 243
 244 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 245    dynamically allocated string in TO-encoding.  Any characters which cannot be
 246    converted will be represented by '?'.
 247
 248    LENGTH should be the length of the string or -1, if null terminated.
 249
 250    The returned string will be allocated on POOL.
 251
 252    This function's behaviour differs from that of g_convert_with_fallback
 253    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 254    the input string is not valid in the declared input encoding.  This function
 255    however perseveres even in the presence of badly encoded input. */
 256 char *
 257 recode_string_pool (const char *to, const char *from,
 258                     const char *text, int length, struct pool *pool)
 259 {
 260   struct substring out;
 261
 262   if ( text == NULL )
 263     return NULL;
 264
 265   if ( length == -1 )
 266     length = strlen (text);
 267
 268   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 269   return out.string;
 270 }
 271
 272 /* Returns the name of the encoding that should be used for file names.
 273
 274    This is meant to be the same encoding used by g_filename_from_uri() and
 275    g_filename_to_uri() in GLib. */
 276 static const char *
 277 filename_encoding (void)
 278 {
 279 #if defined _WIN32 || defined __WIN32__
 280   return "UTF-8";
 281 #else
 282   return locale_charset ();
 283 #endif
 284 }
 285
 286 static char *
 287 xconcat2 (const char *a, size_t a_len,
 288           const char *b, size_t b_len)
 289 {
 290   char *s = xmalloc (a_len + b_len + 1);
 291   memcpy (s, a, a_len);
 292   memcpy (s + a_len, b, b_len);
 293   s[a_len + b_len] = '\0';
 294   return s;
 295 }
 296
 297 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 298    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 299    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 300    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 301    HEAD and tries again, repeating as necessary until the concatenated result
 302    fits or until HEAD_LEN reaches 0.
 303
 304    [*] Actually this function drops grapheme clusters instead of characters, so
 305    that, e.g. a Unicode character followed by a combining accent character
 306    is either completely included or completely excluded from HEAD_LEN.  See
 307    UAX #29 at http://unicode.org/reports/tr29/ for more information on
 308    grapheme clusters.
 309
 310    A null ENCODING is treated as UTF-8.
 311
 312    Sometimes this function has to actually construct the concatenated string to
 313    measure its length.  When this happens, it sets *RESULTP to that
 314    null-terminated string, allocated with malloc(), for the caller to use if it
 315    needs it.  Otherwise, it sets *RESULTP to NULL.
 316
 317    Simple examples for encoding="UTF-8", max_len=6:
 318
 319    head="abc",  tail="xyz"     => 3
 320    head="abcd", tail="xyz"     => 3 ("d" dropped).
 321    head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 322    head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 323
 324    Examples for encoding="ISO-8859-1", max_len=6:
 325
 326    head="éèä",  tail="xyz"     => 6
 327    (each letter in head is only 1 byte in ISO-8859-1 even though they
 328    each take 2 bytes in UTF-8 encoding)
 329 */
static size_t
utf8_encoding_concat__ (const char *head, size_t head_len,
                        const char *tail, size_t tail_len,
                        const char *encoding, size_t max_len,
                        char **resultp)
{
  *resultp = NULL;
  if (head_len == 0)
    return 0;
  else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
    {
      /* The target is UTF-8 too, so byte lengths can be compared directly
         without recoding anything. */
      if (head_len + tail_len <= max_len)
        return head_len;
      else if (tail_len >= max_len)
        return 0;
      else
        {
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Walk HEAD one character at a time, remembering the last
             grapheme cluster boundary that still leaves room for TAIL. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= max_len - tail_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                copy_len = ofs;

              prev = next;
            }

          return copy_len;
        }
    }
  else
    {
      char *result;

      /* With an empty TAIL, RESULT aliases HEAD, so it must never be written
         to, freed, or handed to the caller.  The 'result != head' tests
         below guard this. */
      result = (tail_len > 0
                ? xconcat2 (head, head_len, tail, tail_len)
                : CONST_CAST (char *, head));
      if (recode_string_len (encoding, "UTF-8", result,
                             head_len + tail_len) <= max_len)
        {
          *resultp = result != head ? result : NULL;
          return head_len;
        }
      else
        {
          /* True only if RESULT currently holds the concatenation that
             corresponds to COPY_LEN. */
          bool correct_result = false;
          size_t copy_len;
          ucs4_t prev;
          size_t ofs;
          int mblen;

          /* Try every grapheme cluster boundary in HEAD, shortest prefix
             first, recoding each candidate to measure its length. */
          copy_len = 0;
          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
                                head_len);
               ofs <= head_len;
               ofs += mblen)
            {
              ucs4_t next;

              mblen = u8_mbtouc (&next,
                                 CHAR_CAST (const uint8_t *, head + ofs),
                                 head_len - ofs);
              if (uc_is_grapheme_break (prev, next))
                {
                  if (result != head)
                    {
                      /* Rebuild the candidate in place: the first OFS bytes
                         of HEAD followed by all of TAIL. */
                      memcpy (result, head, ofs);
                      memcpy (result + ofs, tail, tail_len);
                      result[ofs + tail_len] = '\0';
                    }

                  if (recode_string_len (encoding, "UTF-8", result,
                                         ofs + tail_len) <= max_len)
                    {
                      correct_result = true;
                      copy_len = ofs;
                    }
                  else
                    correct_result = false;
                }

              prev = next;
            }

          /* Hand RESULT to the caller only if it matches COPY_LEN;
             otherwise free it and leave *RESULTP null. */
          if (result != head)
            {
              if (correct_result)
                *resultp = result;
              else
                free (result);
            }

          return copy_len;
        }
    }
}
 438
 439 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 440    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 441    string are all encoded in UTF-8.  As many characters[*] from the beginning
 442    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 443    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 444    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 445
 446    [*] Actually this function drops grapheme clusters instead of characters, so
 447    that, e.g. a Unicode character followed by a combining accent character
 448    is either completely included or completely excluded from the returned
 449    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 450    information on grapheme clusters.
 451
 452    A null ENCODING is treated as UTF-8.
 453
 454    Simple examples for encoding="UTF-8", max_len=6:
 455
 456    head="abc",  tail="xyz"     => "abcxyz"
 457    head="abcd", tail="xyz"     => "abcxyz"
 458    head="abc",  tail="uvwxyz"  => "uvwxyz"
 459    head="abc",  tail="tuvwxyz" => "tuvwxyz"
 460
 461    Examples for encoding="ISO-8859-1", max_len=6:
 462
 463    head="éèä",  tail="xyz"    => "éèäxyz"
 464    (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 465    each take 2 bytes in UTF-8 encoding)
 466 */
 467 char *
 468 utf8_encoding_concat (const char *head, const char *tail,
 469                       const char *encoding, size_t max_len)
 470 {
 471   size_t tail_len = strlen (tail);
 472   size_t prefix_len;
 473   char *result;
 474
 475   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 476                                        encoding, max_len, &result);
 477   return (result != NULL
 478           ? result
 479           : xconcat2 (head, prefix_len, tail, tail_len));
 480 }
 481
 482 /* Returns the length, in bytes, of the string that would be returned by
 483    utf8_encoding_concat() if passed the same arguments, but the implementation
 484    is often more efficient. */
 485 size_t
 486 utf8_encoding_concat_len (const char *head, const char *tail,
 487                           const char *encoding, size_t max_len)
 488 {
 489   size_t tail_len = strlen (tail);
 490   size_t prefix_len;
 491   char *result;
 492
 493   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 494                                        encoding, max_len, &result);
 495   free (result);
 496   return prefix_len + tail_len;
 497 }
 498
 499 /* Returns an allocated, null-terminated string, owned by the caller,
 500    containing as many characters[*] from the beginning of S that would fit
 501    within MAX_LEN bytes if the returned string were to be re-encoded in
 502    ENCODING.  Both S and the returned string are encoded in UTF-8.
 503
 504    [*] Actually this function drops grapheme clusters instead of characters, so
 505    that, e.g. a Unicode character followed by a combining accent character
 506    is either completely included or completely excluded from the returned
 507    string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 508    information on grapheme clusters.
 509
 510    A null ENCODING is treated as UTF-8.
 511 */
 512 char *
 513 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 514 {
 515   return utf8_encoding_concat (s, "", encoding, max_len);
 516 }
 517
 518 /* Returns the length, in bytes, of the string that would be returned by
 519    utf8_encoding_trunc() if passed the same arguments, but the implementation
 520    is often more efficient. */
 521 size_t
 522 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 523 {
 524   return utf8_encoding_concat_len (s, "", encoding, max_len);
 525 }
 526
 527 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 528    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 529    current locale. */
 530 char *
 531 utf8_to_filename (const char *filename)
 532 {
 533   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 534 }
 535
 536 /* Returns FILENAME converted from the filename encoding to UTF-8.
 537    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 538    current locale. */
 539 char *
 540 filename_to_utf8 (const char *filename)
 541 {
 542   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 543 }
 544
 545 static int
 546 recode_substring_pool__ (const char *to, const char *from,
 547                          struct substring text, char fallbackchar,
 548                          struct pool *pool, struct substring *out)
 549 {
 550   size_t bufsize;
 551   struct converter *conv;
 552
 553   if (to == NULL)
 554     to = default_encoding;
 555
 556   if (from == NULL)
 557     from = default_encoding;
 558
 559   conv = create_iconv (to, from);
 560
 561   if ( NULL == conv )
 562     {
 563       if (fallbackchar)
 564         {
 565           out->string = pool_malloc (pool, text.length + 1);
 566           out->length = text.length;
 567           memcpy (out->string, text.string, text.length);
 568           out->string[out->length] = '\0';
 569           return 0;
 570         }
 571       else
 572         return EPROTO;
 573     }
 574
 575   for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
 576     {
 577       char *output = pool_malloc (pool, bufsize);
 578       ssize_t retval;
 579
 580       retval = try_recode (conv, fallbackchar, text.string, text.length,
 581                            output, bufsize);
 582       if (retval >= 0)
 583         {
 584           *out = ss_buffer (output, retval);
 585           return 0;
 586         }
 587       pool_free (pool, output);
 588
 589       if (retval != -E2BIG)
 590         return -retval;
 591     }
 592
 593   NOT_REACHED ();
 594 }
 595
 596 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 597    dynamically allocated string in TO-encoding.  Any characters which cannot be
 598    converted will be represented by '?'.
 599
 600    The returned string will be null-terminated and allocated on POOL with
 601    pool_malloc().
 602
 603    This function's behaviour differs from that of g_convert_with_fallback
 604    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 605    the input string is not valid in the declared input encoding.  This function
 606    however perseveres even in the presence of badly encoded input. */
 607 struct substring
 608 recode_substring_pool (const char *to, const char *from,
 609                        struct substring text, struct pool *pool)
 610 {
 611   struct substring out;
 612
 613   recode_substring_pool__ (to, from, text, '?', pool, &out);
 614   return out;
 615 }
 616
 617 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 618    dynamically allocated string in TO-encoding.  On success, returns 0, and the
 619    converted null-terminated string, allocated from POOL with pool_malloc(), is
 620    stored in *OUT.  On failure, returns a positive errno value.
 621
 622    The function fails with an error if any part of the input string is not
 623    valid in the declared input encoding. */
 624 int
 625 recode_pedantically (const char *to, const char *from,
 626                      struct substring text, struct pool *pool,
 627                      struct substring *out)
 628 {
 629   int error;
 630
 631   error = recode_substring_pool__ (to, from, text, 0, pool, out);
 632   if (error)
 633     *out = ss_empty ();
 634   return error;
 635 }
 636 \f
 637 void
 638 i18n_init (void)
 639 {
 640   setlocale (LC_ALL, "");
 641   bindtextdomain (PACKAGE, relocate(locale_dir));
 642   textdomain (PACKAGE);
 643
 644   assert (default_encoding == NULL);
 645   default_encoding = xstrdup (locale_charset ());
 646
 647   hmapx_init (&map);
 648 }
 649
 650 const char *
 651 get_default_encoding (void)
 652 {
 653   return default_encoding;
 654 }
 655
 656 void
 657 set_default_encoding (const char *enc)
 658 {
 659   free (default_encoding);
 660   default_encoding = xstrdup (enc);
 661 }
 662
 663
 664 /* Attempts to set the encoding from a locale name
 665    returns true if successfull.
 666    This function does not (should not!) alter the current locale.
 667 */
 668 bool
 669 set_encoding_from_locale (const char *loc)
 670 {
 671   bool ok = true;
 672   char *c_encoding;
 673   char *loc_encoding;
 674   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 675
 676   setlocale (LC_CTYPE, "C");
 677   c_encoding = xstrdup (locale_charset ());
 678
 679   setlocale (LC_CTYPE, loc);
 680   loc_encoding = xstrdup (locale_charset ());
 681
 682
 683   if ( 0 == strcmp (loc_encoding, c_encoding))
 684     {
 685       ok = false;
 686     }
 687
 688   setlocale (LC_CTYPE, tmp);
 689
 690   free (tmp);
 691
 692   if (ok)
 693     {
 694       free (default_encoding);
 695       default_encoding = loc_encoding;
 696     }
 697   else
 698     free (loc_encoding);
 699
 700   free (c_encoding);
 701
 702   return ok;
 703 }
 704
 705 void
 706 i18n_done (void)
 707 {
 708   struct hmapx_node *node;
 709   struct converter *cvtr;
 710
 711   HMAPX_FOR_EACH (cvtr, node, &map)
 712     {
 713       if (cvtr == NULL)
 714         continue;
 715       free (cvtr->tocode);
 716       free (cvtr->fromcode);
 717       if (cvtr->conv != (iconv_t) -1)
 718         iconv_close (cvtr->conv);
 719       free (cvtr);
 720     }
 721
 722   hmapx_destroy (&map);
 723
 724   free (default_encoding);
 725   default_encoding = NULL;
 726 }
 727
 728
 729
 730 bool
 731 valid_encoding (const char *enc)
 732 {
 733   iconv_t conv = iconv_open (UTF8, enc);
 734
 735   if ( conv == (iconv_t) -1)
 736     return false;
 737
 738   iconv_close (conv);
 739
 740   return true;
 741 }
 742
 743
 744 /* Return the system local's idea of the
 745    decimal seperator character */
 746 char
 747 get_system_decimal (void)
 748 {
 749   char radix_char;
 750
 751 #if HAVE_NL_LANGINFO
 752   radix_char = nl_langinfo (RADIXCHAR)[0];
 753 #else
 754   {
 755     char buf[10];
 756     snprintf (buf, sizeof buf, "%f", 2.5);
 757     radix_char = buf[1];
 758   }
 759 #endif
 760
 761   return radix_char;
 762 }
 763
 764 const char *
 765 uc_name (ucs4_t uc, char buffer[16])
 766 {
 767   if (uc >= 0x20 && uc < 0x7f)
 768     snprintf (buffer, 16, "`%c'", uc);
 769   else
 770     snprintf (buffer, 16, "U+%04X", uc);
 771   return buffer;
 772 }
 773 \f
 774 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
 775
 776 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
 777    with lowercase and uppercase letters treated as equal, starting from
 778    BASIS. */
 779 unsigned int
 780 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
 781 {
 782   uint8_t folded_buf[2048];
 783   size_t folded_len = sizeof folded_buf;
 784   uint8_t *folded_s;
 785   unsigned int hash;
 786
 787   folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
 788                           NULL, UNINORM_NFKD, folded_buf, &folded_len);
 789   if (folded_s != NULL)
 790     {
 791       hash = hash_bytes (folded_s, folded_len, basis);
 792       if (folded_s != folded_buf)
 793         free (folded_s);
 794     }
 795   else
 796     {
 797       if (errno == ENOMEM)
 798         xalloc_die ();
 799       hash = hash_bytes (s, n, basis);
 800     }
 801
 802   return hash;
 803 }
 804
 805 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
 806    uppercase letters treated as equal, starting from BASIS. */
 807 unsigned int
 808 utf8_hash_case_string (const char *s, unsigned int basis)
 809 {
 810   return utf8_hash_case_bytes (s, strlen (s), basis);
 811 }
 812
 813 /* Compares UTF-8 strings A and B case-insensitively.
 814    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 815 int
 816 utf8_strcasecmp (const char *a, const char *b)
 817 {
 818   return utf8_strncasecmp (a, strlen (a), b, strlen (b));
 819 }
 820
 821 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
 822    case-insensitively.
 823    Returns a negative value if A < B, zero if A == B, positive if A > B. */
 824 int
 825 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
 826 {
 827   int result;
 828
 829   if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
 830                   CHAR_CAST (const uint8_t *, b), bn,
 831                   NULL, UNINORM_NFKD, &result))
 832     {
 833       if (errno == ENOMEM)
 834         xalloc_die ();
 835
 836       result = memcmp (a, b, MIN (an, bn));
 837       if (result == 0)
 838         result = an < bn ? -1 : an > bn;
 839     }
 840
 841   return result;
 842 }
 843
 844 static char *
 845 utf8_casemap (const char *s,
 846               uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
 847                              uint8_t *, size_t *))
 848 {
 849   char *result;
 850   size_t size;
 851
 852   result = CHAR_CAST (char *,
 853                       f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
 854                          NULL, NULL, NULL, &size));
 855   if (result == NULL)
 856     {
 857       if (errno == ENOMEM)
 858         xalloc_die ();
 859
 860       result = xstrdup (s);
 861     }
 862   return result;
 863 }
 864
 865 char *
 866 utf8_to_upper (const char *s)
 867 {
 868   return utf8_casemap (s, u8_toupper);
 869 }
 870
 871 char *
 872 utf8_to_lower (const char *s)
 873 {
 874   return utf8_casemap (s, u8_tolower);
 875 }
 876 \f
/* Fills *E with properties of the encoding called NAME: the byte sequences
   for carriage return, line feed and space, their common width (the "unit"),
   and whether the encoding is ASCII- or EBCDIC-compatible.

   Returns true if CR, LF and space all recode from UTF-8 to the same width,
   between 1 and MAX_UNIT bytes.  Otherwise prints a warning, fills in the
   single-byte ASCII forms of the three characters, and returns false. */
bool
get_encoding_info (struct encoding_info *e, const char *name)
{
  /* ASCII whitespace and printable characters, used to test whether NAME
     decodes them unchanged. */
  const struct substring in = SS_LITERAL_INITIALIZER (
                                                      "\t\n\v\f\r "
                                                      "!\"#$%&'()*+,-./0123456789:;<=>?@"
                                                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
                                                      "abcdefghijklmnopqrstuvwxyz{|}~");

  struct substring out, cr, lf, space;
  bool ok;

  memset (e, 0, sizeof *e);

  cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
  lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
  space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
  ok = (cr.length >= 1
        && cr.length <= MAX_UNIT
        && cr.length == lf.length
        && cr.length == space.length);
  if (!ok)
    {
      /* Fall back to the ASCII forms so that *E is still usable. */
      fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
      ss_dealloc (&cr);
      ss_dealloc (&lf);
      ss_dealloc (&space);
      ss_alloc_substring (&cr, ss_cstr ("\r"));
      ss_alloc_substring (&lf, ss_cstr ("\n"));
      ss_alloc_substring (&space, ss_cstr (" "));
    }

  e->unit = cr.length;
  memcpy (e->cr, cr.string, e->unit);
  memcpy (e->lf, lf.string, e->unit);
  memcpy (e->space, space.string, e->unit);

  ss_dealloc (&cr);
  ss_dealloc (&lf);
  ss_dealloc (&space);

  /* ASCII-compatible if decoding the ASCII sample from NAME to UTF-8 leaves
     it unchanged. */
  out = recode_substring_pool ("UTF-8", name, in, NULL);
  e->is_ascii_compatible = ss_equals (in, out);
  ss_dealloc (&out);

  if (!e->is_ascii_compatible && e->unit == 1)
    {
      /* EBCDIC-compatible if decoding "A" from NAME to UTF-8 yields the
         single byte 0xc1. */
      out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
      e->is_ebcdic_compatible = (out.length == 1
                                 && (uint8_t) out.string[0] == 0xc1);
      ss_dealloc (&out);
    }
  else
    e->is_ebcdic_compatible = false;

  return ok;
}
 934
 935 bool
 936 is_encoding_ascii_compatible (const char *encoding)
 937 {
 938   struct encoding_info e;
 939
 940   get_encoding_info (&e, encoding);
 941   return e.is_ascii_compatible;
 942 }
 943
 944 bool
 945 is_encoding_ebcdic_compatible (const char *encoding)
 946 {
 947   struct encoding_info e;
 948
 949   get_encoding_info (&e, encoding);
 950   return e.is_ebcdic_compatible;
 951 }
 952
 953 /* Returns true if iconv can convert ENCODING to and from UTF-8,
 954    otherwise false. */
 955 bool
 956 is_encoding_supported (const char *encoding)
 957 {
 958   return (create_iconv ("UTF-8", encoding)
 959           && create_iconv (encoding, "UTF-8"));
 960 }
 961
 962 /* Returns true if E is the name of a UTF-8 encoding.
 963
 964    XXX Possibly we should test not E as a string but its properties via
 965    iconv. */
 966 bool
 967 is_encoding_utf8 (const char *e)
 968 {
 969   return ((e[0] == 'u' || e[0] == 'U')
 970           && (e[1] == 't' || e[1] == 'T')
 971           && (e[2] == 'f' || e[2] == 'F')
 972           && ((e[3] == '8' && e[4] == '\0')
 973               || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
 974 }
 975 \f
 976 static struct encoding_category *categories;
 977 static int n_categories;
 978
 979 static void SENTINEL (0)
 980   add_category (size_t *allocated_categories, const char *category, ...)
 981 {
 982   struct encoding_category *c;
 983   const char *encodings[16];
 984   va_list args;
 985   int i, n;
 986
 987   /* Count encoding arguments. */
 988   va_start (args, category);
 989   n = 0;
 990   while ((encodings[n] = va_arg (args, const char *)) != NULL)
 991     {
 992       const char *encoding = encodings[n];
 993       if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
 994         n++;
 995     }
 996   assert (n < sizeof encodings / sizeof *encodings);
 997   va_end (args);
 998
 999   if (n == 0)
1000     return;
1001
1002   if (n_categories >= *allocated_categories)
1003     categories = x2nrealloc (categories,
1004                              allocated_categories, sizeof *categories);
1005
1006   c = &categories[n_categories++];
1007   c->category = category;
1008   c->encodings = xmalloc (n * sizeof *c->encodings);
1009   for (i = 0; i < n; i++)
1010     c->encodings[i] = encodings[i];
1011   c->n_encodings = n;
1012 }
1013
1014 static void
1015 init_encoding_categories (void)
1016 {
1017   static bool inited;
1018   size_t alloc;
1019
1020   if (inited)
1021     return;
1022   inited = true;
1023
1024   alloc = 0;
1025   add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1026                 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1027   add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1028                 NULL_SENTINEL);
1029   add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1030   add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1031                 "Windows-1257", NULL_SENTINEL);
1032   add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1033   add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1034                 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1035   add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1036                 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1037   add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1038                 "EUC-TW", NULL_SENTINEL);
1039   add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1040   add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1041                 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1042   add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1043   add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1044                 NULL_SENTINEL);
1045   add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1046   add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1047   add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1048   add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1049   add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1050                 NULL_SENTINEL);
1051   add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1052   add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1053   add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1054   add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1055                 NULL_SENTINEL);
1056   add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1057                 NULL_SENTINEL);
1058   add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1059   add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1060                 NULL_SENTINEL);
1061   add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1062   add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1063                 NULL_SENTINEL);
1064   add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1065                 NULL_SENTINEL);
1066   add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1067                 "Windows-1258", NULL_SENTINEL);
1068   add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1069                 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1070 }
1071
1072 /* Returns an array of "struct encoding_category" that contains only the
1073    categories and encodings that the system supports. */
1074 struct encoding_category *
1075 get_encoding_categories (void)
1076 {
1077   init_encoding_categories ();
1078   return categories;
1079 }
1080
1081 /* Returns the number of elements in the array returned by
1082    get_encoding_categories().  */
1083 size_t
1084 get_n_encoding_categories (void)
1085 {
1086   init_encoding_categories ();
1087   return n_categories;
1088 }