1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/i18n.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/hmapx.h"
34 #include "libpspp/hash-functions.h"
35 #include "libpspp/pool.h"
36 #include "libpspp/str.h"
37 #include "libpspp/version.h"
39 #include "gl/c-strcase.h"
40 #include "gl/localcharset.h"
41 #include "gl/xalloc.h"
42 #include "gl/relocatable.h"
43 #include "gl/xstrndup.h"
52 static char *default_encoding;
53 static struct hmapx map;
55 /* A wrapper around iconv_open */
57 create_iconv (const char* tocode, const char* fromcode)
60 struct hmapx_node *node;
61 struct converter *converter;
64 hash = hash_string (tocode, hash_string (fromcode, 0));
65 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
66 if (!strcmp (tocode, converter->tocode)
67 && !strcmp (fromcode, converter->fromcode))
68 return converter->conv;
70 converter = xmalloc (sizeof *converter);
71 converter->tocode = xstrdup (tocode);
72 converter->fromcode = xstrdup (fromcode);
73 converter->conv = iconv_open (tocode, fromcode);
74 hmapx_insert (&map, converter, hash);
76 /* I don't think it's safe to translate this string or to use messaging
77 as the converters have not yet been set up */
78 if ( (iconv_t) -1 == converter->conv && 0 != strcmp (tocode, fromcode))
80 const int err = errno;
83 "cannot create a converter for `%s' to `%s': %s\n",
84 fromcode, tocode, strerror (err));
87 return converter->conv;
90 /* Converts the single byte C from encoding FROM to TO, returning the first
93 This function probably shouldn't be used at all, but some code still does
96 recode_byte (const char *to, const char *from, char c)
99 char *s = recode_string (to, from, &c, 1);
105 /* Similar to recode_string_pool, but allocates the returned value on the heap
106 instead of in a pool. It is the caller's responsibility to free the
109 recode_string (const char *to, const char *from,
110 const char *text, int length)
112 return recode_string_pool (to, from, text, length, NULL);
115 /* Returns the length, in bytes, of the string that a similar recode_string()
116 call would return. */
118 recode_string_len (const char *to, const char *from,
119 const char *text, int length)
121 char *s = recode_string (to, from, text, length);
122 size_t len = strlen (s);
127 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
128 at OP, and appends a null terminator to the output.
130 Returns the output length if successful, -1 if the output buffer is too
133 try_recode (iconv_t conv,
134 const char *ip, size_t inbytes,
135 char *op_, size_t outbytes)
137 /* FIXME: Need to ensure that this char is valid in the target encoding */
138 const char fallbackchar = '?';
141 /* Put the converter into the initial shift state, in case there was any
142 state information left over from its last usage. */
143 iconv (conv, NULL, 0, NULL, 0);
145 while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
146 &op, &outbytes) == -1)
152 *op++ = fallbackchar;
159 *op++ = fallbackchar;
169 /* should never happen */
170 fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
182 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
183 dynamically allocated string in TO-encoding. Any characters which cannot be
184 converted will be represented by '?'.
186 LENGTH should be the length of the string or -1, if null terminated.
188 The returned string will be allocated on POOL.
190 This function's behaviour differs from that of g_convert_with_fallback
191 provided by GLib. The GLib function will fail (returns NULL) if any part of
192 the input string is not valid in the declared input encoding. This function
193 however perseveres even in the presence of badly encoded input. */
195 recode_string_pool (const char *to, const char *from,
196 const char *text, int length, struct pool *pool)
198 struct substring out;
204 length = strlen (text);
206 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
210 /* Returns the name of the encoding that should be used for file names.
212 This is meant to be the same encoding used by g_filename_from_uri() and
213 g_filename_to_uri() in GLib. */
215 filename_encoding (void)
217 #if defined _WIN32 || defined __WIN32__
220 return locale_charset ();
225 xconcat2 (const char *a, size_t a_len,
226 const char *b, size_t b_len)
228 char *s = xmalloc (a_len + b_len + 1);
229 memcpy (s, a, a_len);
230 memcpy (s + a_len, b, b_len);
231 s[a_len + b_len] = '\0';
235 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
236 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
237 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
238 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
239 HEAD and tries again, repeating as necessary until the concatenated result
240 fits or until HEAD_LEN reaches 0.
242 [*] Actually this function drops grapheme clusters instead of characters, so
243 that, e.g. a Unicode character followed by a combining accent character
244 is either completely included or completely excluded from HEAD_LEN. See
245 UAX #29 at http://unicode.org/reports/tr29/ for more information on
248 A null ENCODING is treated as UTF-8.
250 Sometimes this function has to actually construct the concatenated string to
251 measure its length. When this happens, it sets *RESULTP to that
252 null-terminated string, allocated with malloc(), for the caller to use if it
253 needs it. Otherwise, it sets *RESULTP to NULL.
255 Simple examples for encoding="UTF-8", max_len=6:
257 head="abc", tail="xyz" => 3
258 head="abcd", tail="xyz" => 3 ("d" dropped).
259 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
260 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
262 Examples for encoding="ISO-8859-1", max_len=6:
264 head="éèä", tail="xyz" => 6
265 (each letter in head is only 1 byte in ISO-8859-1 even though they
266 each take 2 bytes in UTF-8 encoding)
269 utf8_encoding_concat__ (const char *head, size_t head_len,
270 const char *tail, size_t tail_len,
271 const char *encoding, size_t max_len,
277 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
279 if (head_len + tail_len <= max_len)
281 else if (tail_len >= max_len)
291 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
293 ofs <= max_len - tail_len;
298 mblen = u8_mbtouc (&next,
299 CHAR_CAST (const uint8_t *, head + ofs),
301 if (uc_is_grapheme_break (prev, next))
314 result = (tail_len > 0
315 ? xconcat2 (head, head_len, tail, tail_len)
316 : CONST_CAST (char *, head));
317 if (recode_string_len (encoding, "UTF-8", result,
318 head_len + tail_len) <= max_len)
320 *resultp = result != head ? result : NULL;
325 bool correct_result = false;
332 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
339 mblen = u8_mbtouc (&next,
340 CHAR_CAST (const uint8_t *, head + ofs),
342 if (uc_is_grapheme_break (prev, next))
346 memcpy (result, head, ofs);
347 memcpy (result + ofs, tail, tail_len);
348 result[ofs + tail_len] = '\0';
351 if (recode_string_len (encoding, "UTF-8", result,
352 ofs + tail_len) <= max_len)
354 correct_result = true;
358 correct_result = false;
377 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
378 null-terminated string owned by the caller. HEAD, TAIL, and the returned
379 string are all encoded in UTF-8. As many characters[*] from the beginning
380 of HEAD are included as will fit within MAX_LEN bytes supposing that the
381 resulting string were to be re-encoded in ENCODING. All of TAIL is always
382 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
384 [*] Actually this function drops grapheme clusters instead of characters, so
385 that, e.g. a Unicode character followed by a combining accent character
386 is either completely included or completely excluded from the returned
387 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
388 information on grapheme clusters.
390 A null ENCODING is treated as UTF-8.
392 Simple examples for encoding="UTF-8", max_len=6:
394 head="abc", tail="xyz" => "abcxyz"
395 head="abcd", tail="xyz" => "abcxyz"
396 head="abc", tail="uvwxyz" => "uvwxyz"
397 head="abc", tail="tuvwxyz" => "tuvwxyz"
399 Examples for encoding="ISO-8859-1", max_len=6:
401 head="éèä", tail="xyz" => "éèäxyz"
402 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
403 each take 2 bytes in UTF-8 encoding)
406 utf8_encoding_concat (const char *head, const char *tail,
407 const char *encoding, size_t max_len)
409 size_t tail_len = strlen (tail);
413 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
414 encoding, max_len, &result);
415 return (result != NULL
417 : xconcat2 (head, prefix_len, tail, tail_len));
420 /* Returns the length, in bytes, of the string that would be returned by
421 utf8_encoding_concat() if passed the same arguments, but the implementation
422 is often more efficient. */
424 utf8_encoding_concat_len (const char *head, const char *tail,
425 const char *encoding, size_t max_len)
427 size_t tail_len = strlen (tail);
431 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
432 encoding, max_len, &result);
434 return prefix_len + tail_len;
437 /* Returns an allocated, null-terminated string, owned by the caller,
438 containing as many characters[*] from the beginning of S that would fit
439 within MAX_LEN bytes if the returned string were to be re-encoded in
440 ENCODING. Both S and the returned string are encoded in UTF-8.
442 [*] Actually this function drops grapheme clusters instead of characters, so
443 that, e.g. a Unicode character followed by a combining accent character
444 is either completely included or completely excluded from the returned
445 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
446 information on grapheme clusters.
448 A null ENCODING is treated as UTF-8.
451 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
453 return utf8_encoding_concat (s, "", encoding, max_len);
456 /* Returns the length, in bytes, of the string that would be returned by
457 utf8_encoding_trunc() if passed the same arguments, but the implementation
458 is often more efficient. */
460 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
462 return utf8_encoding_concat_len (s, "", encoding, max_len);
465 /* Returns FILENAME converted from UTF-8 to the filename encoding.
466 On Windows the filename encoding is UTF-8; elsewhere it is based on the
469 utf8_to_filename (const char *filename)
471 return recode_string (filename_encoding (), "UTF-8", filename, -1);
474 /* Returns FILENAME converted from the filename encoding to UTF-8.
475 On Windows the filename encoding is UTF-8; elsewhere it is based on the
478 filename_to_utf8 (const char *filename)
480 return recode_string ("UTF-8", filename_encoding (), filename, -1);
483 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
484 dynamically allocated string in TO-encoding. Any characters which cannot be
485 converted will be represented by '?'.
487 The returned string will be null-terminated and allocated on POOL.
489 This function's behaviour differs from that of g_convert_with_fallback
490 provided by GLib. The GLib function will fail (returns NULL) if any part of
491 the input string is not valid in the declared input encoding. This function
492 however perseveres even in the presence of badly encoded input. */
494 recode_substring_pool (const char *to, const char *from,
495 struct substring text, struct pool *pool)
497 size_t outbufferlength;
501 to = default_encoding;
504 from = default_encoding;
506 conv = create_iconv (to, from);
508 if ( (iconv_t) -1 == conv )
510 struct substring out;
511 ss_alloc_substring_pool (&out, text, pool);
515 for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
516 if ( outbufferlength > text.length)
518 char *output = pool_malloc (pool, outbufferlength);
519 ssize_t output_len = try_recode (conv, text.string, text.length,
520 output, outbufferlength);
522 return ss_buffer (output, output_len);
523 pool_free (pool, output);
532 setlocale (LC_CTYPE, "");
533 setlocale (LC_MESSAGES, "");
535 setlocale (LC_PAPER, "");
537 bindtextdomain (PACKAGE, relocate(locale_dir));
538 textdomain (PACKAGE);
540 assert (default_encoding == NULL);
541 default_encoding = xstrdup (locale_charset ());
547 get_default_encoding (void)
549 return default_encoding;
553 set_default_encoding (const char *enc)
555 free (default_encoding);
556 default_encoding = xstrdup (enc);
560 /* Attempts to set the encoding from a locale name.
561 Returns true if successful.
562 This function does not (should not!) alter the current locale.
565 set_encoding_from_locale (const char *loc)
570 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
572 setlocale (LC_CTYPE, "C");
573 c_encoding = xstrdup (locale_charset ());
575 setlocale (LC_CTYPE, loc);
576 loc_encoding = xstrdup (locale_charset ());
579 if ( 0 == strcmp (loc_encoding, c_encoding))
585 setlocale (LC_CTYPE, tmp);
591 free (default_encoding);
592 default_encoding = loc_encoding;
605 struct hmapx_node *node;
606 struct converter *cvtr;
608 HMAPX_FOR_EACH (cvtr, node, &map)
611 free (cvtr->fromcode);
612 if (cvtr->conv != (iconv_t) -1)
613 iconv_close (cvtr->conv);
617 hmapx_destroy (&map);
619 free (default_encoding);
620 default_encoding = NULL;
626 valid_encoding (const char *enc)
628 iconv_t conv = iconv_open (UTF8, enc);
630 if ( conv == (iconv_t) -1)
639 /* Return the system locale's idea of the
640 decimal separator character */
642 get_system_decimal (void)
646 char *ol = xstrdup (setlocale (LC_NUMERIC, NULL));
647 setlocale (LC_NUMERIC, "");
650 radix_char = nl_langinfo (RADIXCHAR)[0];
654 snprintf (buf, sizeof buf, "%f", 2.5);
659 /* We MUST leave LC_NUMERIC untouched, since it would
660 otherwise interfere with data_{in,out} */
661 setlocale (LC_NUMERIC, ol);
667 uc_name (ucs4_t uc, char buffer[16])
669 if (uc >= 0x20 && uc < 0x7f)
670 snprintf (buffer, 16, "`%c'", uc);
672 snprintf (buffer, 16, "U+%04X", uc);
677 get_encoding_info (struct encoding_info *e, const char *name)
679 const struct substring in = SS_LITERAL_INITIALIZER (
681 "!\"#$%&'()*+,-./0123456789:;<=>?@"
682 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
683 "abcdefghijklmnopqrstuvwxyz{|}~");
685 struct substring out, cr, lf;
688 memset (e, 0, sizeof *e);
690 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
691 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
692 ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
695 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
698 ss_alloc_substring (&cr, ss_cstr ("\r"));
699 ss_alloc_substring (&lf, ss_cstr ("\n"));
703 memcpy (e->cr, cr.string, e->unit);
704 memcpy (e->lf, lf.string, e->unit);
709 out = recode_substring_pool ("UTF-8", name, in, NULL);
710 e->is_ascii_compatible = ss_equals (in, out);
717 is_encoding_ascii_compatible (const char *encoding)
719 struct encoding_info e;
721 get_encoding_info (&e, encoding);
722 return e.is_ascii_compatible;