1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/i18n.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/hmapx.h"
34 #include "libpspp/hash-functions.h"
35 #include "libpspp/pool.h"
36 #include "libpspp/str.h"
37 #include "libpspp/version.h"
39 #include "gl/c-strcase.h"
40 #include "gl/localcharset.h"
41 #include "gl/xalloc.h"
42 #include "gl/relocatable.h"
43 #include "gl/xstrndup.h"
53 static char *default_encoding;
54 static struct hmapx map;
56 /* A wrapper around iconv_open that records every converter it creates in
   MAP, hashed on the TOCODE/FROMCODE pair.

   A failed iconv_open is recorded as well: the entry then has
   conv == (iconv_t) -1 and the errno value saved in its error member. */
57 static struct converter *
58 create_iconv__ (const char* tocode, const char* fromcode)
61 struct hmapx_node *node;
62 struct converter *converter;
/* One hash value covers both encoding names; the strcmp tests in the loop
   below tell apart entries whose hashes collide. */
65 hash = hash_string (tocode, hash_string (fromcode, 0));
66 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
67 if (!strcmp (tocode, converter->tocode)
68 && !strcmp (fromcode, converter->fromcode))
/* Build a new entry holding its own copies of both encoding names. */
71 converter = xmalloc (sizeof *converter);
72 converter->tocode = xstrdup (tocode);
73 converter->fromcode = xstrdup (fromcode);
74 converter->conv = iconv_open (tocode, fromcode);
/* Capture errno right away, before any later call can overwrite it. */
75 converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
76 hmapx_insert (&map, converter, hash);
/* Returns the iconv descriptor held by the converter that create_iconv__
   supplies for the TOCODE/FROMCODE pair.  The descriptor is (iconv_t) -1 if
   iconv_open failed for that pair.

   A failure is only diagnosed when TOCODE and FROMCODE differ (the strcmp
   test below); the diagnostic text includes both names and the strerror
   description of the saved error code. */
82 create_iconv (const char* tocode, const char* fromcode)
84 struct converter *converter;
86 converter = create_iconv__ (tocode, fromcode);
88 /* I don't think it's safe to translate this string or to use messaging
89 as the converters have not yet been set up */
90 if (converter->error && strcmp (tocode, fromcode))
94 "cannot create a converter for `%s' to `%s': %s\n",
95 fromcode, tocode, strerror (converter->error));
99 return converter->conv;
102 /* Converts the single byte C from encoding FROM to TO, returning the first
105 This function probably shouldn't be used at all, but some code still does
108 recode_byte (const char *to, const char *from, char c)
111 char *s = recode_string (to, from, &c, 1);
117 /* Similar to recode_string_pool, but allocates the returned value on the heap
118 instead of in a pool. It is the caller's responsibility to free the
121 recode_string (const char *to, const char *from,
122 const char *text, int length)
124 return recode_string_pool (to, from, text, length, NULL);
127 /* Returns the length, in bytes, of the string that a similar recode_string()
128 call would return. */
130 recode_string_len (const char *to, const char *from,
131 const char *text, int length)
133 char *s = recode_string (to, from, text, length);
134 size_t len = strlen (s);
139 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
140 at OP, and appends a null terminator to the output.
142 Returns the output length if successful, -1 if the output buffer is too
145 try_recode (iconv_t conv,
146 const char *ip, size_t inbytes,
147 char *op_, size_t outbytes)
149 /* FIXME: Need to ensure that this char is valid in the target encoding */
150 const char fallbackchar = '?';
153 /* Put the converter into the initial shift state, in case there was any
154 state information left over from its last usage. */
155 iconv (conv, NULL, 0, NULL, 0);
157 while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
158 &op, &outbytes) == -1)
164 *op++ = fallbackchar;
171 *op++ = fallbackchar;
181 /* should never happen */
182 fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
194 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
195 dynamically allocated string in TO-encoding. Any characters which cannot be
196 converted will be represented by '?'.
198 LENGTH should be the length of the string or -1, if null terminated.
200 The returned string will be allocated on POOL.
202 This function's behaviour differs from that of g_convert_with_fallback
203 provided by GLib. The GLib function will fail (returns NULL) if any part of
204 the input string is not valid in the declared input encoding. This function
205 however perseveres even in the presence of badly encoded input. */
207 recode_string_pool (const char *to, const char *from,
208 const char *text, int length, struct pool *pool)
210 struct substring out;
216 length = strlen (text);
218 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
222 /* Returns the name of the encoding that should be used for file names.
224 This is meant to be the same encoding used by g_filename_from_uri() and
225 g_filename_to_uri() in GLib. */
227 filename_encoding (void)
229 #if defined _WIN32 || defined __WIN32__
232 return locale_charset ();
/* Builds the concatenation of the A_LEN bytes at A and the B_LEN bytes at B
   in a newly xmalloc'd buffer of A_LEN + B_LEN + 1 bytes, and appends a null
   terminator.  Neither input needs to be null-terminated, because only the
   given byte counts are copied. */
237 xconcat2 (const char *a, size_t a_len,
238 const char *b, size_t b_len)
240 char *s = xmalloc (a_len + b_len + 1);
241 memcpy (s, a, a_len);
242 memcpy (s + a_len, b, b_len);
243 s[a_len + b_len] = '\0';
247 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
248 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
249 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
250 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
251 HEAD and tries again, repeating as necessary until the concatenated result
252 fits or until HEAD_LEN reaches 0.
254 [*] Actually this function drops grapheme clusters instead of characters, so
255 that, e.g. a Unicode character followed by a combining accent character
256 is either completely included or completely excluded from HEAD_LEN. See
257 UAX #29 at http://unicode.org/reports/tr29/ for more information on
260 A null ENCODING is treated as UTF-8.
262 Sometimes this function has to actually construct the concatenated string to
263 measure its length. When this happens, it sets *RESULTP to that
264 null-terminated string, allocated with malloc(), for the caller to use if it
265 needs it. Otherwise, it sets *RESULTP to NULL.
267 Simple examples for encoding="UTF-8", max_len=6:
269 head="abc", tail="xyz" => 3
270 head="abcd", tail="xyz" => 3 ("d" dropped).
271 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
272 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
274 Examples for encoding="ISO-8859-1", max_len=6:
276 head="éèä", tail="xyz" => 6
277 (each letter in head is only 1 byte in ISO-8859-1 even though they
278 each take 2 bytes in UTF-8 encoding)
281 utf8_encoding_concat__ (const char *head, size_t head_len,
282 const char *tail, size_t tail_len,
283 const char *encoding, size_t max_len,
289 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
291 if (head_len + tail_len <= max_len)
293 else if (tail_len >= max_len)
303 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
305 ofs <= max_len - tail_len;
310 mblen = u8_mbtouc (&next,
311 CHAR_CAST (const uint8_t *, head + ofs),
313 if (uc_is_grapheme_break (prev, next))
326 result = (tail_len > 0
327 ? xconcat2 (head, head_len, tail, tail_len)
328 : CONST_CAST (char *, head));
329 if (recode_string_len (encoding, "UTF-8", result,
330 head_len + tail_len) <= max_len)
332 *resultp = result != head ? result : NULL;
337 bool correct_result = false;
344 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
351 mblen = u8_mbtouc (&next,
352 CHAR_CAST (const uint8_t *, head + ofs),
354 if (uc_is_grapheme_break (prev, next))
358 memcpy (result, head, ofs);
359 memcpy (result + ofs, tail, tail_len);
360 result[ofs + tail_len] = '\0';
363 if (recode_string_len (encoding, "UTF-8", result,
364 ofs + tail_len) <= max_len)
366 correct_result = true;
370 correct_result = false;
389 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
390 null-terminated string owned by the caller. HEAD, TAIL, and the returned
391 string are all encoded in UTF-8. As many characters[*] from the beginning
392 of HEAD are included as will fit within MAX_LEN bytes supposing that the
393 resulting string were to be re-encoded in ENCODING. All of TAIL is always
394 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
396 [*] Actually this function drops grapheme clusters instead of characters, so
397 that, e.g. a Unicode character followed by a combining accent character
398 is either completely included or completely excluded from the returned
399 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
400 information on grapheme clusters.
402 A null ENCODING is treated as UTF-8.
404 Simple examples for encoding="UTF-8", max_len=6:
406 head="abc", tail="xyz" => "abcxyz"
407 head="abcd", tail="xyz" => "abcxyz"
408 head="abc", tail="uvwxyz" => "uvwxyz"
409 head="abc", tail="tuvwxyz" => "tuvwxyz"
411 Examples for encoding="ISO-8859-1", max_len=6:
413 head="éèä", tail="xyz" => "éèäxyz"
414 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
415 each take 2 bytes in UTF-8 encoding)
418 utf8_encoding_concat (const char *head, const char *tail,
419 const char *encoding, size_t max_len)
421 size_t tail_len = strlen (tail);
425 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
426 encoding, max_len, &result);
427 return (result != NULL
429 : xconcat2 (head, prefix_len, tail, tail_len));
432 /* Returns the length, in bytes, of the string that would be returned by
433 utf8_encoding_concat() if passed the same arguments, but the implementation
434 is often more efficient. */
436 utf8_encoding_concat_len (const char *head, const char *tail,
437 const char *encoding, size_t max_len)
439 size_t tail_len = strlen (tail);
443 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
444 encoding, max_len, &result);
446 return prefix_len + tail_len;
449 /* Returns an allocated, null-terminated string, owned by the caller,
450 containing as many characters[*] from the beginning of S that would fit
451 within MAX_LEN bytes if the returned string were to be re-encoded in
452 ENCODING. Both S and the returned string are encoded in UTF-8.
454 [*] Actually this function drops grapheme clusters instead of characters, so
455 that, e.g. a Unicode character followed by a combining accent character
456 is either completely included or completely excluded from the returned
457 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
458 information on grapheme clusters.
460 A null ENCODING is treated as UTF-8.
463 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
465 return utf8_encoding_concat (s, "", encoding, max_len);
468 /* Returns the length, in bytes, of the string that would be returned by
469 utf8_encoding_trunc() if passed the same arguments, but the implementation
470 is often more efficient. */
472 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
474 return utf8_encoding_concat_len (s, "", encoding, max_len);
477 /* Returns FILENAME converted from UTF-8 to the filename encoding.
478 On Windows the filename encoding is UTF-8; elsewhere it is based on the
481 utf8_to_filename (const char *filename)
483 return recode_string (filename_encoding (), "UTF-8", filename, -1);
486 /* Returns FILENAME converted from the filename encoding to UTF-8.
487 On Windows the filename encoding is UTF-8; elsewhere it is based on the
490 filename_to_utf8 (const char *filename)
492 return recode_string ("UTF-8", filename_encoding (), filename, -1);
495 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
496 dynamically allocated string in TO-encoding. Any characters which cannot be
497 converted will be represented by '?'.
499 The returned string will be null-terminated and allocated on POOL.
501 This function's behaviour differs from that of g_convert_with_fallback
502 provided by GLib. The GLib function will fail (returns NULL) if any part of
503 the input string is not valid in the declared input encoding. This function
504 however perseveres even in the presence of badly encoded input. */
506 recode_substring_pool (const char *to, const char *from,
507 struct substring text, struct pool *pool)
509 size_t outbufferlength;
513 to = default_encoding;
516 from = default_encoding;
518 conv = create_iconv (to, from);
/* No converter could be opened for this pair of encodings, so no conversion
   is attempted on this path; OUT is allocated from TEXT itself instead. */
520 if ( (iconv_t) -1 == conv )
522 struct substring out;
523 ss_alloc_substring_pool (&out, text, pool);
/* Try output buffers of doubling size.  Sizes that are not larger than the
   input are skipped, and the loop stops once the shift wraps
   OUTBUFFERLENGTH around to 0. */
527 for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
528 if ( outbufferlength > text.length)
530 char *output = pool_malloc (pool, outbufferlength);
531 ssize_t output_len = try_recode (conv, text.string, text.length,
532 output, outbufferlength);
534 return ss_buffer (output, output_len);
/* Reached when the return above was not taken (presumably because
   try_recode reported failure): release this buffer before trying the next
   size. */
535 pool_free (pool, output);
544 setlocale (LC_CTYPE, "");
545 setlocale (LC_MESSAGES, "");
547 setlocale (LC_PAPER, "");
549 bindtextdomain (PACKAGE, relocate(locale_dir));
550 textdomain (PACKAGE);
552 assert (default_encoding == NULL);
553 default_encoding = xstrdup (locale_charset ());
/* Returns the current default encoding.  The result is the module's own
   DEFAULT_ENCODING pointer, not a copy. */
559 get_default_encoding (void)
561 return default_encoding;
/* Replaces the default encoding with a newly allocated copy of ENC, freeing
   the previous value. */
565 set_default_encoding (const char *enc)
567 free (default_encoding);
568 default_encoding = xstrdup (enc);
572 /* Attempts to set the default encoding from the locale name LOC.
573 Returns true if successful.
574 This function does not (should not!) alter the current locale.
577 set_encoding_from_locale (const char *loc)
/* Save a copy of the current LC_CTYPE locale name so that it can be
   restored after the probing below. */
582 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
/* Record the charset reported under the "C" locale as a baseline. */
584 setlocale (LC_CTYPE, "C");
585 c_encoding = xstrdup (locale_charset ());
/* Switch to LOC and record the charset reported under it. */
587 setlocale (LC_CTYPE, loc);
588 loc_encoding = xstrdup (locale_charset ());
/* Compare LOC's charset with the "C" baseline.  NOTE(review): presumably a
   match is treated as LOC not having taken effect -- confirm against the
   lines not shown here. */
591 if ( 0 == strcmp (loc_encoding, c_encoding))
/* Put the original LC_CTYPE locale back. */
597 setlocale (LC_CTYPE, tmp);
/* DEFAULT_ENCODING takes over the LOC_ENCODING allocation; the old value is
   freed first. */
603 free (default_encoding);
604 default_encoding = loc_encoding;
617 struct hmapx_node *node;
618 struct converter *cvtr;
620 HMAPX_FOR_EACH (cvtr, node, &map)
623 free (cvtr->fromcode);
624 if (cvtr->conv != (iconv_t) -1)
625 iconv_close (cvtr->conv);
629 hmapx_destroy (&map);
631 free (default_encoding);
632 default_encoding = NULL;
/* Checks ENC by asking iconv_open for a converter from ENC to UTF8; a result
   of (iconv_t) -1 means that no such converter could be opened. */
638 valid_encoding (const char *enc)
640 iconv_t conv = iconv_open (UTF8, enc);
642 if ( conv == (iconv_t) -1)
651 /* Returns the system locale's idea of the
652 decimal separator character. */
654 get_system_decimal (void)
658 char *ol = xstrdup (setlocale (LC_NUMERIC, NULL));
659 setlocale (LC_NUMERIC, "");
662 radix_char = nl_langinfo (RADIXCHAR)[0];
666 snprintf (buf, sizeof buf, "%f", 2.5);
671 /* We MUST leave LC_NUMERIC untouched, since it would
672 otherwise interfere with data_{in,out} */
673 setlocale (LC_NUMERIC, ol);
/* Formats a short name for code point UC into the 16-byte BUFFER.

   A printable ASCII code point (0x20 through 0x7e) is written as the
   character itself between ` and '.  The other form is "U+" followed by the
   code point in upper-case hexadecimal, zero-padded to at least four digits;
   presumably it is used only for the remaining code points. */
679 uc_name (ucs4_t uc, char buffer[16])
681 if (uc >= 0x20 && uc < 0x7f)
682 snprintf (buffer, 16, "`%c'", uc);
684 snprintf (buffer, 16, "U+%04X", uc);
/* Fills in *E with properties of the encoding NAME, found by recoding sample
   strings between NAME and UTF-8:

   - E->cr and E->lf receive the carriage return and line feed sequences.

   - E->is_ascii_compatible is true if a literal of ASCII characters comes
     out unchanged when it is recoded from NAME to UTF-8.

   *E is zeroed before anything else is stored in it. */
689 get_encoding_info (struct encoding_info *e, const char *name)
691 const struct substring in = SS_LITERAL_INITIALIZER (
693 "!\"#$%&'()*+,-./0123456789:;<=>?@"
694 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
695 "abcdefghijklmnopqrstuvwxyz{|}~");
697 struct substring out, cr, lf;
700 memset (e, 0, sizeof *e);
/* Recode CR and LF from UTF-8 into NAME.  The result is accepted only if CR
   is between 1 and MAX_UNIT bytes long and LF has the same length as CR. */
702 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
703 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
704 ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
/* Warn on stderr and substitute plain "\r" and "\n" (presumably only when
   the check above failed). */
707 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
710 ss_alloc_substring (&cr, ss_cstr ("\r"));
711 ss_alloc_substring (&lf, ss_cstr ("\n"));
715 memcpy (e->cr, cr.string, e->unit);
716 memcpy (e->lf, lf.string, e->unit);
/* Treating IN as text in NAME, recode it to UTF-8 and see whether it is
   unchanged. */
721 out = recode_substring_pool ("UTF-8", name, in, NULL);
722 e->is_ascii_compatible = ss_equals (in, out);
/* Returns the is_ascii_compatible flag that get_encoding_info computes for
   ENCODING.  Every call runs get_encoding_info afresh on a local struct. */
729 is_encoding_ascii_compatible (const char *encoding)
731 struct encoding_info e;
733 get_encoding_info (&e, encoding);
734 return e.is_ascii_compatible;
737 /* Returns true if iconv can convert ENCODING to and from UTF-8,
740 is_encoding_supported (const char *encoding)
/* Both directions are requested through create_iconv__, so any converter
   created here (including one whose iconv_open failed) is left in MAP.  The
   second direction is not requested when the first one fails, because &&
   short-circuits. */
742 return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
743 && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);