src/libpspp/i18n.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/i18n.h"
  20
  21 #include <assert.h>
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #include <langinfo.h>
  25 #include <libintl.h>
  26 #include <locale.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unigbrk.h>
  31
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/hmapx.h"
  34 #include "libpspp/hash-functions.h"
  35 #include "libpspp/pool.h"
  36 #include "libpspp/str.h"
  37 #include "libpspp/version.h"
  38
  39 #include "gl/c-strcase.h"
  40 #include "gl/localcharset.h"
  41 #include "gl/xalloc.h"
  42 #include "gl/relocatable.h"
  43 #include "gl/xstrndup.h"
  44
/* One cached iconv conversion descriptor, together with the pair of encoding
   names it was opened for.  Instances are stored in the file-scope MAP by
   create_iconv() and released by i18n_done(). */
struct converter
 {
    char *tocode;       /* Target encoding name; heap copy owned by this struct. */
    char *fromcode;     /* Source encoding name; heap copy owned by this struct. */
    iconv_t conv;       /* Result of iconv_open(); (iconv_t) -1 if it failed. */
  };
  51
/* Encoding name substituted when a caller of recode_substring_pool() passes a
   null encoding.  Heap-allocated; set by i18n_init(), set_default_encoding()
   and set_encoding_from_locale(), released by i18n_done(). */
static char *default_encoding;
/* Cache of struct converter objects, keyed on a hash of both encoding
   names. */
static struct hmapx map;
  54
  55 /* A wrapper around iconv_open */
  56 static iconv_t
  57 create_iconv (const char* tocode, const char* fromcode)
  58 {
  59   size_t hash;
  60   struct hmapx_node *node;
  61   struct converter *converter;
  62   assert (fromcode);
  63
  64   hash = hash_string (tocode, hash_string (fromcode, 0));
  65   HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
  66     if (!strcmp (tocode, converter->tocode)
  67         && !strcmp (fromcode, converter->fromcode))
  68       return converter->conv;
  69
  70   converter = xmalloc (sizeof *converter);
  71   converter->tocode = xstrdup (tocode);
  72   converter->fromcode = xstrdup (fromcode);
  73   converter->conv = iconv_open (tocode, fromcode);
  74   hmapx_insert (&map, converter, hash);
  75
  76   /* I don't think it's safe to translate this string or to use messaging
  77      as the converters have not yet been set up */
  78   if ( (iconv_t) -1 == converter->conv && 0 != strcmp (tocode, fromcode))
  79     {
  80       const int err = errno;
  81       fprintf (stderr,
  82                "Warning: "
  83                "cannot create a converter for `%s' to `%s': %s\n",
  84                fromcode, tocode, strerror (err));
  85     }
  86
  87   return converter->conv;
  88 }
  89
  90 /* Converts the single byte C from encoding FROM to TO, returning the first
  91    byte of the result.
  92
  93    This function probably shouldn't be used at all, but some code still does
  94    use it. */
  95 char
  96 recode_byte (const char *to, const char *from, char c)
  97 {
  98   char x;
  99   char *s = recode_string (to, from, &c, 1);
 100   x = s[0];
 101   free (s);
 102   return x;
 103 }
 104
 105 /* Similar to recode_string_pool, but allocates the returned value on the heap
 106    instead of in a pool.  It is the caller's responsibility to free the
 107    returned value. */
 108 char *
 109 recode_string (const char *to, const char *from,
 110                const char *text, int length)
 111 {
 112   return recode_string_pool (to, from, text, length, NULL);
 113 }
 114
 115 /* Returns the length, in bytes, of the string that a similar recode_string()
 116    call would return. */
 117 size_t
 118 recode_string_len (const char *to, const char *from,
 119                    const char *text, int length)
 120 {
 121   char *s = recode_string (to, from, text, length);
 122   size_t len = strlen (s);
 123   free (s);
 124   return len;
 125 }
 126
 127 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
 128    at OP, and appends a null terminator to the output.
 129
 130    Returns the output length if successful, -1 if the output buffer is too
 131    small. */
 132 static ssize_t
 133 try_recode (iconv_t conv,
 134             const char *ip, size_t inbytes,
 135             char *op_, size_t outbytes)
 136 {
 137   /* FIXME: Need to ensure that this char is valid in the target encoding */
 138   const char fallbackchar = '?';
 139   char *op = op_;
 140
 141   /* Put the converter into the initial shift state, in case there was any
 142      state information left over from its last usage. */
 143   iconv (conv, NULL, 0, NULL, 0);
 144
 145   while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
 146                 &op, &outbytes) == -1)
 147     switch (errno)
 148       {
 149       case EINVAL:
 150         if (outbytes < 2)
 151           return -1;
 152         *op++ = fallbackchar;
 153         *op = '\0';
 154         return op - op_;
 155
 156       case EILSEQ:
 157         if (outbytes == 0)
 158           return -1;
 159         *op++ = fallbackchar;
 160         outbytes--;
 161         ip++;
 162         inbytes--;
 163         break;
 164
 165       case E2BIG:
 166         return -1;
 167
 168       default:
 169         /* should never happen */
 170         fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
 171         NOT_REACHED ();
 172         break;
 173       }
 174
 175   if (outbytes == 0)
 176     return -1;
 177
 178   *op = '\0';
 179   return op - op_;
 180 }
 181
 182 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 183    dynamically allocated string in TO-encoding.  Any characters which cannot be
 184    converted will be represented by '?'.
 185
 186    LENGTH should be the length of the string or -1, if null terminated.
 187
 188    The returned string will be allocated on POOL.
 189
 190    This function's behaviour differs from that of g_convert_with_fallback
 191    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 192    the input string is not valid in the declared input encoding.  This function
 193    however perseveres even in the presence of badly encoded input. */
 194 char *
 195 recode_string_pool (const char *to, const char *from,
 196                     const char *text, int length, struct pool *pool)
 197 {
 198   struct substring out;
 199
 200   if ( text == NULL )
 201     return NULL;
 202
 203   if ( length == -1 )
 204      length = strlen (text);
 205
 206   out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
 207   return out.string;
 208 }
 209
 210 /* Returns the name of the encoding that should be used for file names.
 211
 212    This is meant to be the same encoding used by g_filename_from_uri() and
 213    g_filename_to_uri() in GLib. */
 214 static const char *
 215 filename_encoding (void)
 216 {
 217 #if defined _WIN32 || defined __WIN32__
 218   return "UTF-8";
 219 #else
 220   return locale_charset ();
 221 #endif
 222 }
 223
 224 static char *
 225 xconcat2 (const char *a, size_t a_len,
 226           const char *b, size_t b_len)
 227 {
 228   char *s = xmalloc (a_len + b_len + 1);
 229   memcpy (s, a, a_len);
 230   memcpy (s + a_len, b, b_len);
 231   s[a_len + b_len] = '\0';
 232   return s;
 233 }
 234
 235 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
 236    TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
 237    ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
 238    it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
 239    HEAD and tries again, repeating as necessary until the concatenated result
 240    fits or until HEAD_LEN reaches 0.
 241
 242    [*] Actually this function drops grapheme clusters instead of characters, so
 243        that, e.g. a Unicode character followed by a combining accent character
 244        is either completely included or completely excluded from HEAD_LEN.  See
 245        UAX #29 at http://unicode.org/reports/tr29/ for more information on
 246        grapheme clusters.
 247
 248    A null ENCODING is treated as UTF-8.
 249
 250    Sometimes this function has to actually construct the concatenated string to
 251    measure its length.  When this happens, it sets *RESULTP to that
 252    null-terminated string, allocated with malloc(), for the caller to use if it
 253    needs it.  Otherwise, it sets *RESULTP to NULL.
 254
 255    Simple examples for encoding="UTF-8", max_len=6:
 256
 257        head="abc",  tail="xyz"     => 3
 258        head="abcd", tail="xyz"     => 3 ("d" dropped).
 259        head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
 260        head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
 261
 262    Examples for encoding="ISO-8859-1", max_len=6:
 263
 264        head="éèä",  tail="xyz"     => 6
 265          (each letter in head is only 1 byte in ISO-8859-1 even though they
 266           each take 2 bytes in UTF-8 encoding)
 267 */
 268 static size_t
 269 utf8_encoding_concat__ (const char *head, size_t head_len,
 270                         const char *tail, size_t tail_len,
 271                         const char *encoding, size_t max_len,
 272                         char **resultp)
 273 {
 274   *resultp = NULL;
 275   if (head_len == 0)
 276     return 0;
 277   else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
 278     {
 279       if (head_len + tail_len <= max_len)
 280         return head_len;
 281       else if (tail_len >= max_len)
 282         return 0;
 283       else
 284         {
 285           size_t copy_len;
 286           size_t prev;
 287           size_t ofs;
 288           int mblen;
 289
 290           copy_len = 0;
 291           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 292                                 head_len);
 293                ofs <= max_len - tail_len;
 294                ofs += mblen)
 295             {
 296               ucs4_t next;
 297
 298               mblen = u8_mbtouc (&next,
 299                                  CHAR_CAST (const uint8_t *, head + ofs),
 300                                  head_len - ofs);
 301               if (uc_is_grapheme_break (prev, next))
 302                 copy_len = ofs;
 303
 304               prev = next;
 305             }
 306
 307           return copy_len;
 308         }
 309     }
 310   else
 311     {
 312       char *result;
 313
 314       result = (tail_len > 0
 315                 ? xconcat2 (head, head_len, tail, tail_len)
 316                 : CONST_CAST (char *, head));
 317       if (recode_string_len (encoding, "UTF-8", result,
 318                              head_len + tail_len) <= max_len)
 319         {
 320           *resultp = result != head ? result : NULL;
 321           return head_len;
 322         }
 323       else
 324         {
 325           bool correct_result = false;
 326           size_t copy_len;
 327           size_t prev;
 328           size_t ofs;
 329           int mblen;
 330
 331           copy_len = 0;
 332           for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
 333                                 head_len);
 334                ofs <= head_len;
 335                ofs += mblen)
 336             {
 337               ucs4_t next;
 338
 339               mblen = u8_mbtouc (&next,
 340                                  CHAR_CAST (const uint8_t *, head + ofs),
 341                                  head_len - ofs);
 342               if (uc_is_grapheme_break (prev, next))
 343                 {
 344                   if (result != head)
 345                     {
 346                       memcpy (result, head, ofs);
 347                       memcpy (result + ofs, tail, tail_len);
 348                       result[ofs + tail_len] = '\0';
 349                     }
 350
 351                   if (recode_string_len (encoding, "UTF-8", result,
 352                                          ofs + tail_len) <= max_len)
 353                     {
 354                       correct_result = true;
 355                       copy_len = ofs;
 356                     }
 357                   else
 358                     correct_result = false;
 359                 }
 360
 361               prev = next;
 362             }
 363
 364           if (result != head)
 365             {
 366               if (correct_result)
 367                 *resultp = result;
 368               else
 369                 free (result);
 370             }
 371
 372           return copy_len;
 373         }
 374     }
 375 }
 376
 377 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
 378    null-terminated string owned by the caller.  HEAD, TAIL, and the returned
 379    string are all encoded in UTF-8.  As many characters[*] from the beginning
 380    of HEAD are included as will fit within MAX_LEN bytes supposing that the
 381    resulting string were to be re-encoded in ENCODING.  All of TAIL is always
 382    included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
 383
 384    [*] Actually this function drops grapheme clusters instead of characters, so
 385        that, e.g. a Unicode character followed by a combining accent character
 386        is either completely included or completely excluded from the returned
 387        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 388        information on grapheme clusters.
 389
 390    A null ENCODING is treated as UTF-8.
 391
 392    Simple examples for encoding="UTF-8", max_len=6:
 393
 394        head="abc",  tail="xyz"     => "abcxyz"
 395        head="abcd", tail="xyz"     => "abcxyz"
 396        head="abc",  tail="uvwxyz"  => "uvwxyz"
 397        head="abc",  tail="tuvwxyz" => "tuvwxyz"
 398
 399    Examples for encoding="ISO-8859-1", max_len=6:
 400
 401        head="éèä",  tail="xyz"    => "éèäxyz"
 402          (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
 403           each take 2 bytes in UTF-8 encoding)
 404 */
 405 char *
 406 utf8_encoding_concat (const char *head, const char *tail,
 407                       const char *encoding, size_t max_len)
 408 {
 409   size_t tail_len = strlen (tail);
 410   size_t prefix_len;
 411   char *result;
 412
 413   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 414                                        encoding, max_len, &result);
 415   return (result != NULL
 416           ? result
 417           : xconcat2 (head, prefix_len, tail, tail_len));
 418 }
 419
 420 /* Returns the length, in bytes, of the string that would be returned by
 421    utf8_encoding_concat() if passed the same arguments, but the implementation
 422    is often more efficient. */
 423 size_t
 424 utf8_encoding_concat_len (const char *head, const char *tail,
 425                           const char *encoding, size_t max_len)
 426 {
 427   size_t tail_len = strlen (tail);
 428   size_t prefix_len;
 429   char *result;
 430
 431   prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
 432                                        encoding, max_len, &result);
 433   free (result);
 434   return prefix_len + tail_len;
 435 }
 436
 437 /* Returns an allocated, null-terminated string, owned by the caller,
 438    containing as many characters[*] from the beginning of S that would fit
 439    within MAX_LEN bytes if the returned string were to be re-encoded in
 440    ENCODING.  Both S and the returned string are encoded in UTF-8.
 441
 442    [*] Actually this function drops grapheme clusters instead of characters, so
 443        that, e.g. a Unicode character followed by a combining accent character
 444        is either completely included or completely excluded from the returned
 445        string.  See UAX #29 at http://unicode.org/reports/tr29/ for more
 446        information on grapheme clusters.
 447
 448    A null ENCODING is treated as UTF-8.
 449 */
 450 char *
 451 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
 452 {
 453   return utf8_encoding_concat (s, "", encoding, max_len);
 454 }
 455
 456 /* Returns the length, in bytes, of the string that would be returned by
 457    utf8_encoding_trunc() if passed the same arguments, but the implementation
 458    is often more efficient. */
 459 size_t
 460 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
 461 {
 462   return utf8_encoding_concat_len (s, "", encoding, max_len);
 463 }
 464
 465 /* Returns FILENAME converted from UTF-8 to the filename encoding.
 466    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 467    current locale. */
 468 char *
 469 utf8_to_filename (const char *filename)
 470 {
 471   return recode_string (filename_encoding (), "UTF-8", filename, -1);
 472 }
 473
 474 /* Returns FILENAME converted from the filename encoding to UTF-8.
 475    On Windows the filename encoding is UTF-8; elsewhere it is based on the
 476    current locale. */
 477 char *
 478 filename_to_utf8 (const char *filename)
 479 {
 480   return recode_string ("UTF-8", filename_encoding (), filename, -1);
 481 }
 482
 483 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
 484    dynamically allocated string in TO-encoding.  Any characters which cannot be
 485    converted will be represented by '?'.
 486
 487    The returned string will be null-terminated and allocated on POOL.
 488
 489    This function's behaviour differs from that of g_convert_with_fallback
 490    provided by GLib.  The GLib function will fail (returns NULL) if any part of
 491    the input string is not valid in the declared input encoding.  This function
 492    however perseveres even in the presence of badly encoded input. */
 493 struct substring
 494 recode_substring_pool (const char *to, const char *from,
 495                        struct substring text, struct pool *pool)
 496 {
 497   size_t outbufferlength;
 498   iconv_t conv ;
 499
 500   if (to == NULL)
 501     to = default_encoding;
 502
 503   if (from == NULL)
 504     from = default_encoding;
 505
 506   conv = create_iconv (to, from);
 507
 508   if ( (iconv_t) -1 == conv )
 509     {
 510       struct substring out;
 511       ss_alloc_substring_pool (&out, text, pool);
 512       return out;
 513     }
 514
 515   for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
 516     if ( outbufferlength > text.length)
 517       {
 518         char *output = pool_malloc (pool, outbufferlength);
 519         ssize_t output_len = try_recode (conv, text.string, text.length,
 520                                          output, outbufferlength);
 521         if (output_len >= 0)
 522           return ss_buffer (output, output_len);
 523         pool_free (pool, output);
 524       }
 525
 526   NOT_REACHED ();
 527 }
 528
 529 void
 530 i18n_init (void)
 531 {
 532   setlocale (LC_CTYPE, "");
 533   setlocale (LC_MESSAGES, "");
 534 #if HAVE_LC_PAPER
 535   setlocale (LC_PAPER, "");
 536 #endif
 537   bindtextdomain (PACKAGE, relocate(locale_dir));
 538   textdomain (PACKAGE);
 539
 540   assert (default_encoding == NULL);
 541   default_encoding = xstrdup (locale_charset ());
 542
 543   hmapx_init (&map);
 544 }
 545
/* Returns the name of the current default encoding.  The string belongs to
   this module; set_default_encoding(), set_encoding_from_locale() and
   i18n_done() free it, so a caller that needs it beyond such a call must take
   a copy.  Returns a null pointer if no default encoding is set. */
const char *
get_default_encoding (void)
{
  return default_encoding;
}
 551
 552 void
 553 set_default_encoding (const char *enc)
 554 {
 555   free (default_encoding);
 556   default_encoding = xstrdup (enc);
 557 }
 558
 559
 560 /* Attempts to set the encoding from a locale name
 561    returns true if successfull.
 562    This function does not (should not!) alter the current locale.
 563 */
 564 bool
 565 set_encoding_from_locale (const char *loc)
 566 {
 567   bool ok = true;
 568   char *c_encoding;
 569   char *loc_encoding;
 570   char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
 571
 572   setlocale (LC_CTYPE, "C");
 573   c_encoding = xstrdup (locale_charset ());
 574
 575   setlocale (LC_CTYPE, loc);
 576   loc_encoding = xstrdup (locale_charset ());
 577
 578
 579   if ( 0 == strcmp (loc_encoding, c_encoding))
 580     {
 581       ok = false;
 582     }
 583
 584
 585   setlocale (LC_CTYPE, tmp);
 586
 587   free (tmp);
 588
 589   if (ok)
 590     {
 591       free (default_encoding);
 592       default_encoding = loc_encoding;
 593     }
 594   else
 595     free (loc_encoding);
 596
 597   free (c_encoding);
 598
 599   return ok;
 600 }
 601
 602 void
 603 i18n_done (void)
 604 {
 605   struct hmapx_node *node;
 606   struct converter *cvtr;
 607
 608   HMAPX_FOR_EACH (cvtr, node, &map)
 609     {
 610       free (cvtr->tocode);
 611       free (cvtr->fromcode);
 612       if (cvtr->conv != (iconv_t) -1)
 613         iconv_close (cvtr->conv);
 614       free (cvtr);
 615     }
 616
 617   hmapx_destroy (&map);
 618
 619   free (default_encoding);
 620   default_encoding = NULL;
 621 }
 622
 623
 624
 625 bool
 626 valid_encoding (const char *enc)
 627 {
 628   iconv_t conv = iconv_open (UTF8, enc);
 629
 630   if ( conv == (iconv_t) -1)
 631     return false;
 632
 633   iconv_close (conv);
 634
 635   return true;
 636 }
 637
 638
 639 /* Return the system local's idea of the
 640    decimal seperator character */
 641 char
 642 get_system_decimal (void)
 643 {
 644   char radix_char;
 645
 646   char *ol = xstrdup (setlocale (LC_NUMERIC, NULL));
 647   setlocale (LC_NUMERIC, "");
 648
 649 #if HAVE_NL_LANGINFO
 650   radix_char = nl_langinfo (RADIXCHAR)[0];
 651 #else
 652   {
 653     char buf[10];
 654     snprintf (buf, sizeof buf, "%f", 2.5);
 655     radix_char = buf[1];
 656   }
 657 #endif
 658
 659   /* We MUST leave LC_NUMERIC untouched, since it would
 660      otherwise interfere with data_{in,out} */
 661   setlocale (LC_NUMERIC, ol);
 662   free (ol);
 663   return radix_char;
 664 }
 665
 666 const char *
 667 uc_name (ucs4_t uc, char buffer[16])
 668 {
 669   if (uc >= 0x20 && uc < 0x7f)
 670     snprintf (buffer, 16, "`%c'", uc);
 671   else
 672     snprintf (buffer, 16, "U+%04X", uc);
 673   return buffer;
 674 }
 675 \f
 676 bool
 677 get_encoding_info (struct encoding_info *e, const char *name)
 678 {
 679   const struct substring in = SS_LITERAL_INITIALIZER (
 680     "\t\n\v\f\r "
 681     "!\"#$%&'()*+,-./0123456789:;<=>?@"
 682     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
 683     "abcdefghijklmnopqrstuvwxyz{|}~");
 684
 685   struct substring out, cr, lf;
 686   bool ok;
 687
 688   memset (e, 0, sizeof *e);
 689
 690   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
 691   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
 692   ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
 693   if (!ok)
 694     {
 695       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
 696       ss_dealloc (&cr);
 697       ss_dealloc (&lf);
 698       ss_alloc_substring (&cr, ss_cstr ("\r"));
 699       ss_alloc_substring (&lf, ss_cstr ("\n"));
 700     }
 701
 702   e->unit = cr.length;
 703   memcpy (e->cr, cr.string, e->unit);
 704   memcpy (e->lf, lf.string, e->unit);
 705
 706   ss_dealloc (&cr);
 707   ss_dealloc (&lf);
 708
 709   out = recode_substring_pool ("UTF-8", name, in, NULL);
 710   e->is_ascii_compatible = ss_equals (in, out);
 711   ss_dealloc (&out);
 712
 713   return ok;
 714 }
 715
 716 bool
 717 is_encoding_ascii_compatible (const char *encoding)
 718 {
 719   struct encoding_info e;
 720
 721   get_encoding_info (&e, encoding);
 722   return e.is_ascii_compatible;
 723 }