1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/i18n.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/compiler.h"
34 #include "libpspp/hmapx.h"
35 #include "libpspp/hash-functions.h"
36 #include "libpspp/pool.h"
37 #include "libpspp/str.h"
38 #include "libpspp/version.h"
40 #include "gl/c-strcase.h"
41 #include "gl/localcharset.h"
42 #include "gl/xalloc.h"
43 #include "gl/relocatable.h"
44 #include "gl/xstrndup.h"
47 #define _(msgid) gettext (msgid)
57 static char *default_encoding;
58 static struct hmapx map;
60 /* A wrapper around iconv_open */
61 static struct converter *
62 create_iconv__ (const char* tocode, const char* fromcode)
65 struct hmapx_node *node;
66 struct converter *converter;
69 hash = hash_string (tocode, hash_string (fromcode, 0));
70 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
71 if (!strcmp (tocode, converter->tocode)
72 && !strcmp (fromcode, converter->fromcode))
75 converter = xmalloc (sizeof *converter);
76 converter->tocode = xstrdup (tocode);
77 converter->fromcode = xstrdup (fromcode);
78 converter->conv = iconv_open (tocode, fromcode);
79 converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
80 hmapx_insert (&map, converter, hash);
86 create_iconv (const char* tocode, const char* fromcode)
88 struct converter *converter;
90 converter = create_iconv__ (tocode, fromcode);
92 /* I don't think it's safe to translate this string or to use messaging,
93 because the converters have not yet been set up. */
94 if (converter->error && strcmp (tocode, fromcode))
98 "cannot create a converter for `%s' to `%s': %s\n",
99 fromcode, tocode, strerror (converter->error));
100 converter->error = 0;
103 return converter->conv;
106 /* Converts the single byte C from encoding FROM to TO, returning the first
109 This function probably shouldn't be used at all, but some code still does
112 recode_byte (const char *to, const char *from, char c)
115 char *s = recode_string (to, from, &c, 1);
121 /* Similar to recode_string_pool, but allocates the returned value on the heap
122 instead of in a pool. It is the caller's responsibility to free the
125 recode_string (const char *to, const char *from,
126 const char *text, int length)
128 return recode_string_pool (to, from, text, length, NULL);
131 /* Returns the length, in bytes, of the string that a similar recode_string()
132 call would return. */
134 recode_string_len (const char *to, const char *from,
135 const char *text, int length)
137 char *s = recode_string (to, from, text, length);
138 size_t len = strlen (s);
143 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
144 at OP, and appends a null terminator to the output.
146 Returns the output length if successful, -1 if the output buffer is too
149 try_recode (iconv_t conv,
150 const char *ip, size_t inbytes,
151 char *op_, size_t outbytes)
153 /* FIXME: Need to ensure that this char is valid in the target encoding */
154 const char fallbackchar = '?';
157 /* Put the converter into the initial shift state, in case there was any
158 state information left over from its last usage. */
159 iconv (conv, NULL, 0, NULL, 0);
161 while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
162 &op, &outbytes) == -1)
168 *op++ = fallbackchar;
175 *op++ = fallbackchar;
185 /* should never happen */
186 fprintf (stderr, "Character conversion error: %s\n", strerror (errno));
198 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
199 dynamically allocated string in TO-encoding. Any characters which cannot be
200 converted will be represented by '?'.
202 LENGTH should be the length of the string or -1, if null terminated.
204 The returned string will be allocated on POOL.
206 This function's behaviour differs from that of g_convert_with_fallback
207 provided by GLib. The GLib function will fail (returns NULL) if any part of
208 the input string is not valid in the declared input encoding. This function
209 however perseveres even in the presence of badly encoded input. */
211 recode_string_pool (const char *to, const char *from,
212 const char *text, int length, struct pool *pool)
214 struct substring out;
220 length = strlen (text);
222 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
226 /* Returns the name of the encoding that should be used for file names.
228 This is meant to be the same encoding used by g_filename_from_uri() and
229 g_filename_to_uri() in GLib. */
231 filename_encoding (void)
233 #if defined _WIN32 || defined __WIN32__
236 return locale_charset ();
241 xconcat2 (const char *a, size_t a_len,
242 const char *b, size_t b_len)
244 char *s = xmalloc (a_len + b_len + 1);
245 memcpy (s, a, a_len);
246 memcpy (s + a_len, b, b_len);
247 s[a_len + b_len] = '\0';
251 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
252 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
253 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
254 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
255 HEAD and tries again, repeating as necessary until the concatenated result
256 fits or until HEAD_LEN reaches 0.
258 [*] Actually this function drops grapheme clusters instead of characters, so
259 that, e.g. a Unicode character followed by a combining accent character
260 is either completely included or completely excluded from HEAD_LEN. See
261 UAX #29 at http://unicode.org/reports/tr29/ for more information on
264 A null ENCODING is treated as UTF-8.
266 Sometimes this function has to actually construct the concatenated string to
267 measure its length. When this happens, it sets *RESULTP to that
268 null-terminated string, allocated with malloc(), for the caller to use if it
269 needs it. Otherwise, it sets *RESULTP to NULL.
271 Simple examples for encoding="UTF-8", max_len=6:
273 head="abc", tail="xyz" => 3
274 head="abcd", tail="xyz" => 3 ("d" dropped).
275 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
276 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
278 Examples for encoding="ISO-8859-1", max_len=6:
280 head="éèä", tail="xyz" => 6
281 (each letter in head is only 1 byte in ISO-8859-1 even though they
282 each take 2 bytes in UTF-8 encoding)
285 utf8_encoding_concat__ (const char *head, size_t head_len,
286 const char *tail, size_t tail_len,
287 const char *encoding, size_t max_len,
293 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
295 if (head_len + tail_len <= max_len)
297 else if (tail_len >= max_len)
307 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
309 ofs <= max_len - tail_len;
314 mblen = u8_mbtouc (&next,
315 CHAR_CAST (const uint8_t *, head + ofs),
317 if (uc_is_grapheme_break (prev, next))
330 result = (tail_len > 0
331 ? xconcat2 (head, head_len, tail, tail_len)
332 : CONST_CAST (char *, head));
333 if (recode_string_len (encoding, "UTF-8", result,
334 head_len + tail_len) <= max_len)
336 *resultp = result != head ? result : NULL;
341 bool correct_result = false;
348 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
355 mblen = u8_mbtouc (&next,
356 CHAR_CAST (const uint8_t *, head + ofs),
358 if (uc_is_grapheme_break (prev, next))
362 memcpy (result, head, ofs);
363 memcpy (result + ofs, tail, tail_len);
364 result[ofs + tail_len] = '\0';
367 if (recode_string_len (encoding, "UTF-8", result,
368 ofs + tail_len) <= max_len)
370 correct_result = true;
374 correct_result = false;
393 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
394 null-terminated string owned by the caller. HEAD, TAIL, and the returned
395 string are all encoded in UTF-8. As many characters[*] from the beginning
396 of HEAD are included as will fit within MAX_LEN bytes supposing that the
397 resulting string were to be re-encoded in ENCODING. All of TAIL is always
398 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
400 [*] Actually this function drops grapheme clusters instead of characters, so
401 that, e.g. a Unicode character followed by a combining accent character
402 is either completely included or completely excluded from the returned
403 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
404 information on grapheme clusters.
406 A null ENCODING is treated as UTF-8.
408 Simple examples for encoding="UTF-8", max_len=6:
410 head="abc", tail="xyz" => "abcxyz"
411 head="abcd", tail="xyz" => "abcxyz"
412 head="abc", tail="uvwxyz" => "uvwxyz"
413 head="abc", tail="tuvwxyz" => "tuvwxyz"
415 Examples for encoding="ISO-8859-1", max_len=6:
417 head="éèä", tail="xyz" => "éèäxyz"
418 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
419 each take 2 bytes in UTF-8 encoding)
422 utf8_encoding_concat (const char *head, const char *tail,
423 const char *encoding, size_t max_len)
425 size_t tail_len = strlen (tail);
429 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
430 encoding, max_len, &result);
431 return (result != NULL
433 : xconcat2 (head, prefix_len, tail, tail_len));
436 /* Returns the length, in bytes, of the string that would be returned by
437 utf8_encoding_concat() if passed the same arguments, but the implementation
438 is often more efficient. */
440 utf8_encoding_concat_len (const char *head, const char *tail,
441 const char *encoding, size_t max_len)
443 size_t tail_len = strlen (tail);
447 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
448 encoding, max_len, &result);
450 return prefix_len + tail_len;
453 /* Returns an allocated, null-terminated string, owned by the caller,
454 containing as many characters[*] from the beginning of S that would fit
455 within MAX_LEN bytes if the returned string were to be re-encoded in
456 ENCODING. Both S and the returned string are encoded in UTF-8.
458 [*] Actually this function drops grapheme clusters instead of characters, so
459 that, e.g. a Unicode character followed by a combining accent character
460 is either completely included or completely excluded from the returned
461 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
462 information on grapheme clusters.
464 A null ENCODING is treated as UTF-8.
467 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
469 return utf8_encoding_concat (s, "", encoding, max_len);
472 /* Returns the length, in bytes, of the string that would be returned by
473 utf8_encoding_trunc() if passed the same arguments, but the implementation
474 is often more efficient. */
476 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
478 return utf8_encoding_concat_len (s, "", encoding, max_len);
481 /* Returns FILENAME converted from UTF-8 to the filename encoding.
482 On Windows the filename encoding is UTF-8; elsewhere it is based on the
485 utf8_to_filename (const char *filename)
487 return recode_string (filename_encoding (), "UTF-8", filename, -1);
490 /* Returns FILENAME converted from the filename encoding to UTF-8.
491 On Windows the filename encoding is UTF-8; elsewhere it is based on the
494 filename_to_utf8 (const char *filename)
496 return recode_string ("UTF-8", filename_encoding (), filename, -1);
499 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
500 dynamically allocated string in TO-encoding. Any characters which cannot be
501 converted will be represented by '?'.
503 The returned string will be null-terminated and allocated on POOL.
505 This function's behaviour differs from that of g_convert_with_fallback
506 provided by GLib. The GLib function will fail (returns NULL) if any part of
507 the input string is not valid in the declared input encoding. This function
508 however perseveres even in the presence of badly encoded input. */
510 recode_substring_pool (const char *to, const char *from,
511 struct substring text, struct pool *pool)
513 size_t outbufferlength;
517 to = default_encoding;
520 from = default_encoding;
522 conv = create_iconv (to, from);
524 if ( (iconv_t) -1 == conv )
526 struct substring out;
527 ss_alloc_substring_pool (&out, text, pool);
531 for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
532 if ( outbufferlength > text.length)
534 char *output = pool_malloc (pool, outbufferlength);
535 ssize_t output_len = try_recode (conv, text.string, text.length,
536 output, outbufferlength);
538 return ss_buffer (output, output_len);
539 pool_free (pool, output);
548 setlocale (LC_CTYPE, "");
549 setlocale (LC_COLLATE, "");
550 setlocale (LC_MESSAGES, "");
552 setlocale (LC_PAPER, "");
554 bindtextdomain (PACKAGE, relocate(locale_dir));
555 textdomain (PACKAGE);
557 assert (default_encoding == NULL);
558 default_encoding = xstrdup (locale_charset ());
564 get_default_encoding (void)
566 return default_encoding;
570 set_default_encoding (const char *enc)
572 free (default_encoding);
573 default_encoding = xstrdup (enc);
577 /* Attempts to set the encoding from a locale name;
578 returns true if successful.
579 This function does not (should not!) alter the current locale.
582 set_encoding_from_locale (const char *loc)
587 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
589 setlocale (LC_CTYPE, "C");
590 c_encoding = xstrdup (locale_charset ());
592 setlocale (LC_CTYPE, loc);
593 loc_encoding = xstrdup (locale_charset ());
596 if ( 0 == strcmp (loc_encoding, c_encoding))
602 setlocale (LC_CTYPE, tmp);
608 free (default_encoding);
609 default_encoding = loc_encoding;
622 struct hmapx_node *node;
623 struct converter *cvtr;
625 HMAPX_FOR_EACH (cvtr, node, &map)
628 free (cvtr->fromcode);
629 if (cvtr->conv != (iconv_t) -1)
630 iconv_close (cvtr->conv);
634 hmapx_destroy (&map);
636 free (default_encoding);
637 default_encoding = NULL;
643 valid_encoding (const char *enc)
645 iconv_t conv = iconv_open (UTF8, enc);
647 if ( conv == (iconv_t) -1)
656 /* Returns the system locale's idea of the
657 decimal separator character. */
659 get_system_decimal (void)
663 char *ol = xstrdup (setlocale (LC_NUMERIC, NULL));
664 setlocale (LC_NUMERIC, "");
667 radix_char = nl_langinfo (RADIXCHAR)[0];
671 snprintf (buf, sizeof buf, "%f", 2.5);
676 /* We MUST leave LC_NUMERIC untouched, since it would
677 otherwise interfere with data_{in,out} */
678 setlocale (LC_NUMERIC, ol);
684 uc_name (ucs4_t uc, char buffer[16])
686 if (uc >= 0x20 && uc < 0x7f)
687 snprintf (buffer, 16, "`%c'", uc);
689 snprintf (buffer, 16, "U+%04X", uc);
694 get_encoding_info (struct encoding_info *e, const char *name)
696 const struct substring in = SS_LITERAL_INITIALIZER (
698 "!\"#$%&'()*+,-./0123456789:;<=>?@"
699 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
700 "abcdefghijklmnopqrstuvwxyz{|}~");
702 struct substring out, cr, lf, space;
705 memset (e, 0, sizeof *e);
707 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
708 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
709 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
711 && cr.length <= MAX_UNIT
712 && cr.length == lf.length
713 && cr.length == space.length);
716 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
720 ss_alloc_substring (&cr, ss_cstr ("\r"));
721 ss_alloc_substring (&lf, ss_cstr ("\n"));
722 ss_alloc_substring (&space, ss_cstr (" "));
726 memcpy (e->cr, cr.string, e->unit);
727 memcpy (e->lf, lf.string, e->unit);
728 memcpy (e->space, space.string, e->unit);
734 out = recode_substring_pool ("UTF-8", name, in, NULL);
735 e->is_ascii_compatible = ss_equals (in, out);
738 if (!e->is_ascii_compatible && e->unit == 1)
740 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
741 e->is_ebcdic_compatible = (out.length == 1
742 && (uint8_t) out.string[0] == 0xc1);
746 e->is_ebcdic_compatible = false;
752 is_encoding_ascii_compatible (const char *encoding)
754 struct encoding_info e;
756 get_encoding_info (&e, encoding);
757 return e.is_ascii_compatible;
761 is_encoding_ebcdic_compatible (const char *encoding)
763 struct encoding_info e;
765 get_encoding_info (&e, encoding);
766 return e.is_ebcdic_compatible;
769 /* Returns true if iconv can convert ENCODING to and from UTF-8,
772 is_encoding_supported (const char *encoding)
774 return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
775 && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
778 /* Returns true if E is the name of a UTF-8 encoding.
780 XXX Possibly we should test not E as a string but its properties via
783 is_encoding_utf8 (const char *e)
785 return ((e[0] == 'u' || e[0] == 'U')
786 && (e[1] == 't' || e[1] == 'T')
787 && (e[2] == 'f' || e[2] == 'F')
788 && ((e[3] == '8' && e[4] == '\0')
789 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
792 static struct encoding_category *categories;
793 static int n_categories;
795 static void SENTINEL (0)
796 add_category (size_t *allocated_categories, const char *category, ...)
798 struct encoding_category *c;
799 const char *encodings[16];
803 /* Count encoding arguments. */
804 va_start (args, category);
806 while ((encodings[n] = va_arg (args, const char *)) != NULL)
808 const char *encoding = encodings[n];
809 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
812 assert (n < sizeof encodings / sizeof *encodings);
818 if (n_categories >= *allocated_categories)
819 categories = x2nrealloc (categories,
820 allocated_categories, sizeof *categories);
822 c = &categories[n_categories++];
823 c->category = category;
824 c->encodings = xmalloc (n * sizeof *c->encodings);
825 for (i = 0; i < n; i++)
826 c->encodings[i] = encodings[i];
831 init_encoding_categories (void)
841 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
842 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
843 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
845 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
846 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
847 "Windows-1257", NULL_SENTINEL);
848 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
849 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
850 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
851 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
852 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
853 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
854 "EUC-TW", NULL_SENTINEL);
855 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
856 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
857 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
858 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
859 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
861 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
862 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
863 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
864 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
865 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
867 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
868 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
869 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
870 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
872 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
874 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
875 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
877 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
878 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
880 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
882 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
883 "Windows-1258", NULL_SENTINEL);
884 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
885 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
888 /* Returns an array of "struct encoding_category" that contains only the
889 categories and encodings that the system supports. */
890 struct encoding_category *
891 get_encoding_categories (void)
893 init_encoding_categories ();
897 /* Returns the number of elements in the array returned by
898 get_encoding_categories(). */
900 get_n_encoding_categories (void)
902 init_encoding_categories ();