1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
3 2016, 2021 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "libpspp/i18n.h"
33 #include "libpspp/assertion.h"
34 #include "libpspp/compiler.h"
35 #include "libpspp/hmapx.h"
36 #include "libpspp/hash-functions.h"
37 #include "libpspp/pool.h"
38 #include "libpspp/str.h"
39 #include "libpspp/version.h"
41 #include "gl/c-ctype.h"
42 #include "gl/c-strcase.h"
43 #include "gl/localcharset.h"
44 #include <gl/localename.h>
45 #include "gl/minmax.h"
46 #include "gl/xalloc.h"
47 #include "gl/relocatable.h"
48 #include "gl/xstrndup.h"
51 #define _(msgid) gettext (msgid)
61 static char *default_encoding;
62 static struct hmapx map;
64 /* A wrapper around iconv_open() that keeps one converter per
   (TOCODE, FROMCODE) pair in the file-scope hash map MAP.

   The key is a hash chained over both encoding names; entries sharing that
   hash are matched by exact strcmp() on both names.  A new converter gets
   private copies of the names (xstrdup) and the handle from iconv_open().
   On the failure path the name copies are freed and a NULL entry is inserted
   under the same hash.  Otherwise the number of bytes that a null character
   occupies in TOCODE is measured with a throwaway ASCII->TOCODE converter
   and the new converter is inserted into MAP.

   NOTE(review): the return statements and every use of WARN fall on lines
   that are not in view; WARN presumably gates the stderr diagnostic --
   confirm against the full file. */
65 static struct converter *
66 create_iconv (const char* tocode, const char* fromcode, bool warn)
69 struct hmapx_node *node;
70 struct converter *converter;
/* Fold both encoding names into a single hash key. */
73 hash = hash_string (tocode, hash_string (fromcode, 0));
/* Walk only the entries stored under that hash and compare names exactly. */
74 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
79 if (!strcmp (tocode, converter->tocode)
80 && !strcmp (fromcode, converter->fromcode))
/* Allocate and open a new converter for this pair. */
84 converter = xmalloc (sizeof *converter);
85 converter->tocode = xstrdup (tocode);
86 converter->fromcode = xstrdup (fromcode);
87 converter->conv = iconv_open (tocode, fromcode);
/* errno is captured immediately so that later library calls cannot clobber
   it.  NOTE(review): failure is tested against (iconv_t) ~0 here but against
   (iconv_t) -1 everywhere else in this file; the two spellings should
   probably be unified. */
88 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
89 /* I don't think it's safe to translate this string or to use messaging
90 as the converters have not yet been set up */
/* A failure is only acted on when the two encoding names differ. */
91 if (error && strcmp (tocode, fromcode))
96 "cannot create a converter for `%s' to `%s': %s\n",
97 fromcode, tocode, strerror (error));
/* Release the name copies and record a NULL entry under this hash. */
99 free (converter->tocode);
100 free (converter->fromcode);
103 hmapx_insert (&map, NULL, hash);
107 /* Find out how many bytes there are in a null char in the target
109 iconv_t bconv = iconv_open (tocode, "ASCII");
110 if (bconv != (iconv_t) -1)
112 ICONV_CONST char inbuf[1] = "";
113 ICONV_CONST char *inptr = inbuf;
114 size_t inbytes = sizeof inbuf;
117 char *outptr = outbuf;
118 size_t outbytes = sizeof outbuf;
119 if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
120 converter->null_char_width = outptr - outbuf;
124 hmapx_insert (&map, converter, hash);
130 /* Converts the single byte C from encoding FROM to TO, returning the first
133 This function probably shouldn't be used at all, but some code still does
136 recode_byte (const char *to, const char *from, char c)
139 char *s = recode_string (to, from, &c, 1);
145 /* Similar to recode_string_pool, but allocates the returned value on the heap
146 instead of in a pool. It is the caller's responsibility to free the
149 recode_string (const char *to, const char *from,
150 const char *text, int length)
152 return recode_string_pool (to, from, text, length, NULL);
155 /* Returns the length, in bytes, of the string that a similar recode_string()
156 call would return. */
158 recode_string_len (const char *to, const char *from,
159 const char *text, int length)
161 char *s = recode_string (to, from, text, length);
162 size_t len = strlen (s);
167 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
168 at OP, and appends a null terminator to the output.
170 Returns the output length if successful, -1 if the output buffer is too
173 try_recode (struct converter *cvtr, char fallbackchar,
174 const char *in, size_t inbytes,
175 char *out_, size_t outbytes)
180 int null_bytes = cvtr->null_char_width;
182 /* Put the converter into the initial shift state, in case there was any
183 state information left over from its last usage. */
184 iconv (cvtr->conv, NULL, 0, NULL, 0);
186 /* Do two rounds of iconv() calls:
188 - The first round does the bulk of the conversion using the
189 caller-supplied input data.
191 - The second round flushes any leftover output. This has a real effect
192 with input encodings that use combining diacritics, e.g. without the
193 second round the last character tends to get dropped when converting
194 from windows-1258 to other encodings.
196 for (i = 0; i < 2; i++)
198 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
199 size_t *inbytesp = i ? NULL : &inbytes;
201 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
205 if (outbytes < null_bytes + 1)
209 *out++ = fallbackchar;
210 for (j = 0 ; j < null_bytes ; ++j)
212 return out - 1 - out_;
219 *out++ = fallbackchar;
232 /* should never happen */
233 fprintf (stderr, "Character conversion error: %s\n",
240 if (outbytes <= null_bytes - 1)
243 for (i = 0 ; i < null_bytes ; ++i)
246 return out - 1 - out_;
249 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
250 dynamically allocated string in TO-encoding. Any characters which cannot be
251 converted will be represented by '?'.
253 LENGTH should be the length of the string or -1, if null terminated.
255 The returned string will be allocated on POOL.
257 This function's behaviour differs from that of g_convert_with_fallback
258 provided by GLib. The GLib function will fail (returns NULL) if any part of
259 the input string is not valid in the declared input encoding. This function
260 however perseveres even in the presence of badly encoded input. */
262 recode_string_pool (const char *to, const char *from,
263 const char *text, int length, struct pool *pool)
265 struct substring out;
271 length = strlen (text);
273 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
277 /* Returns the name of the encoding that should be used for file names.
279 This is meant to be the same encoding used by g_filename_from_uri() and
280 g_filename_to_uri() in GLib. */
282 filename_encoding (void)
284 #if defined _WIN32 || defined __WIN32__
287 return locale_charset ();
/* Builds the concatenation of the A_LEN bytes at A and the B_LEN bytes at B
   in a freshly xmalloc()'d buffer of A_LEN + B_LEN + 1 bytes and
   null-terminates it.  Neither input needs to be null-terminated because
   both lengths are explicit.

   The return statement is not in view; callers in this file use the result
   as the concatenated string and are responsible for freeing it. */
292 xconcat2 (const char *a, size_t a_len,
293 const char *b, size_t b_len)
/* One extra byte for the terminator. */
295 char *s = xmalloc (a_len + b_len + 1);
296 memcpy (s, a, a_len);
297 memcpy (s + a_len, b, b_len);
298 s[a_len + b_len] = '\0';
302 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
303 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
304 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
305 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
306 HEAD and tries again, repeating as necessary until the concatenated result
307 fits or until HEAD_LEN reaches 0.
309 [*] Actually this function drops grapheme clusters instead of characters, so
310 that, e.g. a Unicode character followed by a combining accent character
311 is either completely included or completely excluded from HEAD_LEN. See
312 UAX #29 at http://unicode.org/reports/tr29/ for more information on
315 A null ENCODING is treated as UTF-8.
317 Sometimes this function has to actually construct the concatenated string to
318 measure its length. When this happens, it sets *RESULTP to that
319 null-terminated string, allocated with malloc(), for the caller to use if it
320 needs it. Otherwise, it sets *RESULTP to NULL.
322 Simple examples for encoding="UTF-8", max_len=6:
324 head="abc", tail="xyz" => 3
325 head="abcd", tail="xyz" => 3 ("d" dropped).
326 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
327 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
329 Examples for encoding="ISO-8859-1", max_len=6:
331 head="éèä", tail="xyz" => 6
332 (each letter in head is only 1 byte in ISO-8859-1 even though they
333 each take 2 bytes in UTF-8 encoding)
336 utf8_encoding_concat__ (const char *head, size_t head_len,
337 const char *tail, size_t tail_len,
338 const char *encoding, size_t max_len,
344 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
346 if (head_len + tail_len <= max_len)
348 else if (tail_len >= max_len)
358 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
360 ofs <= max_len - tail_len;
365 mblen = u8_mbtouc (&next,
366 CHAR_CAST (const uint8_t *, head + ofs),
368 if (uc_is_grapheme_break (prev, next))
381 result = (tail_len > 0
382 ? xconcat2 (head, head_len, tail, tail_len)
383 : CONST_CAST (char *, head));
384 if (recode_string_len (encoding, "UTF-8", result,
385 head_len + tail_len) <= max_len)
387 *resultp = result != head ? result : NULL;
392 bool correct_result = false;
399 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
406 mblen = u8_mbtouc (&next,
407 CHAR_CAST (const uint8_t *, head + ofs),
409 if (uc_is_grapheme_break (prev, next))
413 memcpy (result, head, ofs);
414 memcpy (result + ofs, tail, tail_len);
415 result[ofs + tail_len] = '\0';
418 if (recode_string_len (encoding, "UTF-8", result,
419 ofs + tail_len) <= max_len)
421 correct_result = true;
425 correct_result = false;
444 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
445 null-terminated string owned by the caller. HEAD, TAIL, and the returned
446 string are all encoded in UTF-8. As many characters[*] from the beginning
447 of HEAD are included as will fit within MAX_LEN bytes supposing that the
448 resulting string were to be re-encoded in ENCODING. All of TAIL is always
449 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
451 [*] Actually this function drops grapheme clusters instead of characters, so
452 that, e.g. a Unicode character followed by a combining accent character
453 is either completely included or completely excluded from the returned
454 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
455 information on grapheme clusters.
457 A null ENCODING is treated as UTF-8.
459 Simple examples for encoding="UTF-8", max_len=6:
461 head="abc", tail="xyz" => "abcxyz"
462 head="abcd", tail="xyz" => "abcxyz"
463 head="abc", tail="uvwxyz" => "uvwxyz"
464 head="abc", tail="tuvwxyz" => "tuvwxyz"
466 Examples for encoding="ISO-8859-1", max_len=6:
468 head="éèä", tail="xyz" => "éèäxyz"
469 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
470 each take 2 bytes in UTF-8 encoding)
473 utf8_encoding_concat (const char *head, const char *tail,
474 const char *encoding, size_t max_len)
476 size_t tail_len = strlen (tail);
480 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
481 encoding, max_len, &result);
482 return (result != NULL
484 : xconcat2 (head, prefix_len, tail, tail_len));
487 /* Returns the length, in bytes, of the string that would be returned by
488 utf8_encoding_concat() if passed the same arguments, but the implementation
489 is often more efficient. */
491 utf8_encoding_concat_len (const char *head, const char *tail,
492 const char *encoding, size_t max_len)
494 size_t tail_len = strlen (tail);
498 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
499 encoding, max_len, &result);
501 return prefix_len + tail_len;
504 /* Returns an allocated, null-terminated string, owned by the caller,
505 containing as many characters[*] from the beginning of S that would fit
506 within MAX_LEN bytes if the returned string were to be re-encoded in
507 ENCODING. Both S and the returned string are encoded in UTF-8.
509 [*] Actually this function drops grapheme clusters instead of characters, so
510 that, e.g. a Unicode character followed by a combining accent character
511 is either completely included or completely excluded from the returned
512 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
513 information on grapheme clusters.
515 A null ENCODING is treated as UTF-8.
518 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
520 return utf8_encoding_concat (s, "", encoding, max_len);
523 /* Returns the length, in bytes, of the string that would be returned by
524 utf8_encoding_trunc() if passed the same arguments, but the implementation
525 is often more efficient. */
527 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
529 return utf8_encoding_concat_len (s, "", encoding, max_len);
532 /* Returns FILENAME converted from UTF-8 to the filename encoding.
533 On Windows the filename encoding is UTF-8; elsewhere it is based on the
536 utf8_to_filename (const char *filename)
538 return recode_string (filename_encoding (), "UTF-8", filename, -1);
541 /* Returns FILENAME converted from the filename encoding to UTF-8.
542 On Windows the filename encoding is UTF-8; elsewhere it is based on the
545 filename_to_utf8 (const char *filename)
547 return recode_string ("UTF-8", filename_encoding (), filename, -1);
/* Shared worker behind recode_substring_pool() and recode_pedantically().

   Converts TEXT from encoding FROM to encoding TO and stores the result in
   *OUT, allocated from POOL with pool_malloc().  FALLBACKCHAR is handed
   through to try_recode(); the callers in this file pass '?' or 0.

   TO and FROM are each replaced by default_encoding on paths whose conditions
   are not in view (presumably when the argument is null -- confirm).  There
   is also a path that simply copies TEXT into *OUT and null-terminates it;
   its condition is not in view either.

   The conversion itself is retried with a doubled output buffer for as long
   as try_recode() reports -E2BIG.  recode_pedantically() treats this
   function's return value as an error code. */
551 recode_substring_pool__ (const char *to, const char *from,
552 struct substring text, char fallbackchar,
553 struct pool *pool, struct substring *out)
556 struct converter *conv;
559 to = default_encoding;
562 from = default_encoding;
564 conv = create_iconv (to, from, true);
/* Verbatim copy of TEXT plus a terminator. */
570 out->string = pool_malloc (pool, text.length + 1);
571 out->length = text.length;
572 memcpy (out->string, text.string, text.length);
573 out->string[out->length] = '\0';
/* Start with room for TEXT plus one byte and double on every retry.  The
   condition bufsize > text.length ends the loop if the doubling wraps
   around. */
580 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
582 char *output = pool_malloc (pool, bufsize);
585 retval = try_recode (conv, fallbackchar, text.string, text.length,
589 *out = ss_buffer (output, retval);
592 pool_free (pool, output);
/* Anything other than "output buffer too small" ends the retry loop. */
594 if (retval != -E2BIG)
601 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
602 dynamically allocated string in TO-encoding. Any characters which cannot be
603 converted will be represented by '?'.
605 The returned string will be null-terminated and allocated on POOL with
608 This function's behaviour differs from that of g_convert_with_fallback
609 provided by GLib. The GLib function will fail (returns NULL) if any part of
610 the input string is not valid in the declared input encoding. This function
611 however perseveres even in the presence of badly encoded input. */
613 recode_substring_pool (const char *to, const char *from,
614 struct substring text, struct pool *pool)
616 struct substring out;
618 recode_substring_pool__ (to, from, text, '?', pool, &out);
622 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
623 dynamically allocated string in TO-encoding. On success, returns 0, and the
624 converted null-terminated string, allocated from POOL with pool_malloc(), is
625 stored in *OUT. On failure, returns a positive errno value.
627 The function fails with an error if any part of the input string is not
628 valid in the declared input encoding. */
630 recode_pedantically (const char *to, const char *from,
631 struct substring text, struct pool *pool,
632 struct substring *out)
636 error = recode_substring_pool__ (to, from, text, 0, pool, out);
645 setlocale (LC_ALL, "");
647 bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
649 textdomain (PACKAGE);
651 assert (default_encoding == NULL);
652 default_encoding = xstrdup (locale_charset ());
/* Returns the current default encoding name.  This is the file-scope
   default_encoding pointer itself, not a copy, so the caller must not free
   it and it becomes invalid once the default encoding is replaced. */
658 get_default_encoding (void)
660 return default_encoding;
/* Replaces the default encoding with a private copy of ENC and frees the
   previous value.

   NOTE(review): the old string is freed before ENC is duplicated, so passing
   the pointer obtained from get_default_encoding() back in would read freed
   memory -- confirm that no caller does this. */
664 set_default_encoding (const char *enc)
666 free (default_encoding);
667 default_encoding = xstrdup (enc);
670 /* Return the ISO two letter code for the current LC_MESSAGES
675 const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
676 if (0 == strcmp (localename, "C"))
678 char *ln = xstrdup (localename);
679 char *end = strchr (ln, '_');
686 /* Attempts to set the encoding from a locale name
687 returns true if successful.
688 This function does not (should not!) alter the current locale.
691 set_encoding_from_locale (const char *loc)
696 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
698 setlocale (LC_CTYPE, "C");
699 c_encoding = xstrdup (locale_charset ());
701 setlocale (LC_CTYPE, loc);
702 loc_encoding = xstrdup (locale_charset ());
705 if (0 == strcmp (loc_encoding, c_encoding))
710 setlocale (LC_CTYPE, tmp);
716 free (default_encoding);
717 default_encoding = loc_encoding;
730 struct hmapx_node *node;
731 struct converter *cvtr;
733 HMAPX_FOR_EACH (cvtr, node, &map)
738 free (cvtr->fromcode);
739 if (cvtr->conv != (iconv_t) -1)
740 iconv_close (cvtr->conv);
744 hmapx_destroy (&map);
746 free (default_encoding);
747 default_encoding = NULL;
/* Probes whether iconv can open a converter from ENC to UTF8.
   iconv_open() signals failure by returning (iconv_t) -1.  The return
   statements and the release of the probe converter are on lines that are
   not in view. */
753 valid_encoding (const char *enc)
755 iconv_t conv = iconv_open (UTF8, enc);
757 if (conv == (iconv_t) -1)
766 /* Return the system locale's idea of the
767 decimal separator character */
769 get_system_decimal (void)
774 radix_char = nl_langinfo (RADIXCHAR)[0];
778 snprintf (buf, sizeof buf, "%f", 2.5);
/* Formats a human-readable name for code point UC into the 16-byte BUFFER.
   Printable ASCII (0x20 up to but not including 0x7f) is written as the
   character itself between a backquote and an apostrophe; the other format
   is "U+" followed by at least four uppercase hexadecimal digits.  Both
   writes are bounded to 16 bytes by snprintf(). */
787 uc_name (ucs4_t uc, char buffer[16])
789 if (uc >= 0x20 && uc < 0x7f)
790 snprintf (buffer, 16, "`%c'", uc);
/* Presumably the else branch (the "else" line is not in view). */
792 snprintf (buffer, 16, "U+%04X", uc);
796 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
798 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
799 with lowercase and uppercase letters treated as equal, starting from
802 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
804 uint8_t folded_buf[2048];
805 size_t folded_len = sizeof folded_buf;
809 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
810 NULL, UNINORM_NFKD, folded_buf, &folded_len);
811 if (folded_s != NULL)
813 hash = hash_bytes (folded_s, folded_len, basis);
814 if (folded_s != folded_buf)
821 hash = hash_bytes (s, n, basis);
827 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
828 uppercase letters treated as equal, starting from BASIS. */
830 utf8_hash_case_string (const char *s, unsigned int basis)
832 return utf8_hash_case_substring (ss_cstr (s), basis);
835 /* Returns a hash value for UTF-8 string S, with lowercase and uppercase
836 letters treated as equal, starting from BASIS. */
838 utf8_hash_case_substring (struct substring s, unsigned int basis)
840 return utf8_hash_case_bytes (s.string, s.length, basis);
843 /* Compares UTF-8 strings A and B case-insensitively.
844 Returns a negative value if A < B, zero if A == B, positive if A > B. */
846 utf8_strcasecmp (const char *a, const char *b)
848 return utf8_sscasecmp (ss_cstr (a), ss_cstr (b));
/* Compares UTF-8 substrings A and B by handing their data pointers and
   explicit lengths to utf8_strncasecmp(), whose result is returned
   unchanged.  Neither substring has to be null-terminated. */
852 utf8_sscasecmp (struct substring a, struct substring b)
854 return utf8_strncasecmp (a.string, a.length, b.string, b.length);
857 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
859 Returns a negative value if A < B, zero if A == B, positive if A > B. */
861 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
/* u8_casecmp() stores the ordering in RESULT; a nonzero return value from
   it selects the fallback below. */
865 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
866 CHAR_CAST (const uint8_t *, b), bn,
867 NULL, UNINORM_NFKD, &result))
/* Fallback: plain byte-wise comparison over the common prefix. */
872 result = memcmp (a, b, MIN (an, bn));
/* Order by length: -1 if A is shorter, 1 if longer, 0 if equal.  The guard
   for this line is not in view (presumably only when the prefixes are
   equal). */
874 result = an < bn ? -1 : an > bn;
/* Tests each of the LEN bytes starting at S with c_isdigit(), the
   locale-independent ASCII digit check.  The return statements are not in
   view; presumably the result is false at the first non-digit and true
   otherwise, which also makes an empty range count as all digits --
   confirm. */
881 is_all_digits (const uint8_t *s, size_t len)
883 for (size_t i = 0; i < len; i++)
884 if (!c_isdigit (s[i]))
889 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
890 number, then they are compared numerically. Returns a negative value if A <
891 B, zero if A == B, positive if A > B. */
893 utf8_strverscasecmp (const char *a, const char *b)
897 size_t a_len = sizeof a_stub;
898 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
899 UNINORM_NFKD, a_stub, &a_len);
903 size_t b_len = sizeof b_stub;
904 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
905 UNINORM_NFKD, b_stub, &b_len);
908 if (!a_norm || !b_norm)
910 result = strcmp (a, b);
914 size_t len = MIN (a_len, b_len);
915 for (size_t i = 0; i < len; i++)
916 if (a_norm[i] != b_norm[i])
918 /* If both strings end in digits, compare them numerically. */
919 if (is_all_digits (&a_norm[i], a_len - i)
920 && is_all_digits (&b_norm[i], b_len - i))
922 /* Start by stripping leading zeros, since those don't matter for
923 numerical comparison. */
925 for (ap = i; ap < a_len; ap++)
926 if (a_norm[ap] != '0')
928 for (bp = i; bp < b_len; bp++)
929 if (b_norm[bp] != '0')
932 /* The number with more digits, if there is one, is larger. */
933 size_t a_digits = a_len - ap;
934 size_t b_digits = b_len - bp;
935 if (a_digits != b_digits)
936 result = a_digits > b_digits ? 1 : -1;
938 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
941 result = a_norm[i] > b_norm[i] ? 1 : -1;
944 result = a_len < b_len ? -1 : a_len > b_len;
947 if (a_norm != a_stub)
949 if (b_norm != b_stub)
/* Applies the case-mapping function F to null-terminated UTF-8 string S.
   The wrappers in this file pass u8_toupper, u8_tolower and u8_totitle.

   F is given strlen (S) + 1 bytes, so the terminating null byte is part of
   the input, and NULL for its language, normalization and result-buffer
   arguments; the output length is received in SIZE.  A copy of S made with
   xstrdup() is used as the result on a path whose condition is not in view
   (presumably when F returns NULL -- confirm). */
955 utf8_casemap (const char *s,
956 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
957 uint8_t *, size_t *))
962 result = CHAR_CAST (char *,
963 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
964 NULL, NULL, NULL, &size));
/* Fallback result: an unmodified copy of S. */
970 result = xstrdup (s);
/* Returns the result of running null-terminated UTF-8 string S through
   utf8_casemap() with u8_toupper as the mapping function. */
976 utf8_to_upper (const char *s)
978 return utf8_casemap (s, u8_toupper);
/* Returns the result of running null-terminated UTF-8 string S through
   utf8_casemap() with u8_tolower as the mapping function. */
982 utf8_to_lower (const char *s)
984 return utf8_casemap (s, u8_tolower);
/* Returns the result of running null-terminated UTF-8 string S through
   utf8_casemap() with u8_totitle as the mapping function. */
988 utf8_to_title (const char *s)
990 return utf8_casemap (s, u8_totitle);
/* Fills *E with basic properties of the encoding called NAME.

   *E is zeroed first.  CR, LF and space are then converted from UTF-8 into
   NAME.  The encoding counts as usable (OK) when the three results have the
   same length and that length is between 1 and MAX_UNIT.  The warning on
   stderr and the re-creation of the three substrings from their plain ASCII
   forms belong to the not-OK path (its guard is not in view).  E->unit is the
   length of the encoded CR, and E->cr, E->lf and E->space receive that many
   bytes each.

   E->is_ascii_compatible is set by decoding the ASCII characters in IN as if
   they were NAME-encoded and checking that the UTF-8 result is byte-for-byte
   the same.  E->is_ebcdic_compatible is only probed for encodings that are
   not ASCII compatible and have a one-byte unit.

   The return statement is not in view. */
994 get_encoding_info (struct encoding_info *e, const char *name)
996 const struct substring in = SS_LITERAL_INITIALIZER (
998 "!\"#$%&'()*+,-./0123456789:;<=>?@"
999 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
1000 "abcdefghijklmnopqrstuvwxyz{|}~");
1002 struct substring out, cr, lf, space;
1005 memset (e, 0, sizeof *e);
/* Encode the three characters in NAME; a null pool is passed for each. */
1007 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
1008 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
1009 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
1010 ok = (cr.length >= 1
1011 && cr.length <= MAX_UNIT
1012 && cr.length == lf.length
1013 && cr.length == space.length);
1016 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1019 ss_dealloc (&space);
/* Substitute the single-byte ASCII forms so that the copies below still have
   something consistent to work with. */
1020 ss_alloc_substring (&cr, ss_cstr ("\r"));
1021 ss_alloc_substring (&lf, ss_cstr ("\n"));
1022 ss_alloc_substring (&space, ss_cstr (" "));
1025 e->unit = cr.length;
1026 memcpy (e->cr, cr.string, e->unit);
1027 memcpy (e->lf, lf.string, e->unit);
1028 memcpy (e->space, space.string, e->unit);
1032 ss_dealloc (&space);
/* Decode the ASCII sample as NAME and compare with the sample itself. */
1034 out = recode_substring_pool ("UTF-8", name, in, NULL);
1035 e->is_ascii_compatible = ss_equals (in, out);
1038 if (!e->is_ascii_compatible && e->unit == 1)
/* NOTE(review): this converts the byte "A" FROM NAME TO UTF-8 and then
   expects the one-byte result 0xc1, which is never a complete UTF-8
   sequence, so the test looks like it can never succeed.  0xc1 is the EBCDIC
   code for 'A', which suggests the to/from arguments are swapped -- confirm
   before relying on is_ebcdic_compatible. */
1040 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1041 e->is_ebcdic_compatible = (out.length == 1
1042 && (uint8_t) out.string[0] == 0xc1);
1046 e->is_ebcdic_compatible = false;
/* Returns the is_ascii_compatible flag that get_encoding_info() computes for
   ENCODING.  The full encoding_info is recomputed on every call and then
   discarded. */
1052 is_encoding_ascii_compatible (const char *encoding)
1054 struct encoding_info e;
1056 get_encoding_info (&e, encoding);
1057 return e.is_ascii_compatible;
/* Returns the is_ebcdic_compatible flag that get_encoding_info() computes for
   ENCODING.  The full encoding_info is recomputed on every call and then
   discarded. */
1061 is_encoding_ebcdic_compatible (const char *encoding)
1063 struct encoding_info e;
1065 get_encoding_info (&e, encoding);
1066 return e.is_ebcdic_compatible;
1069 /* Returns true if iconv can convert ENCODING to and from UTF-8,
1072 is_encoding_supported (const char *encoding)
1074 return (create_iconv ("UTF-8", encoding, false)
1075 && create_iconv (encoding, "UTF-8", false));
1078 /* Returns true if E is the name of a UTF-8 encoding.
1080 XXX Possibly we should test not E as a string but its properties via
1083 is_encoding_utf8 (const char *e)
1085 return ((e[0] == 'u' || e[0] == 'U')
1086 && (e[1] == 't' || e[1] == 'T')
1087 && (e[2] == 'f' || e[2] == 'F')
1088 && ((e[3] == '8' && e[4] == '\0')
1089 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
1092 static struct encoding_category *categories;
1093 static int n_categories;
/* Appends one entry to the file-scope CATEGORIES array.

   CATEGORY is the category's name.  The variable arguments are encoding
   names (const char *) and must end with a null pointer, which the
   SENTINEL (0) annotation lets the compiler check.  Each name is read into
   the 16-slot ENCODINGS array and tested: it passes if it is "Auto" or if
   is_encoding_supported() accepts it.  The statement controlled by that test
   is not in view (presumably it keeps the name by advancing N -- confirm).

   CATEGORIES is grown with x2nrealloc() when full, with the capacity tracked
   through *ALLOCATED_CATEGORIES.  The new entry stores the CATEGORY pointer
   and the N retained name pointers as-is; none of the strings are copied, so
   they must outlive the table. */
1095 static void SENTINEL (0)
1096 add_category (size_t *allocated_categories, const char *category, ...)
1098 struct encoding_category *c;
1099 const char *encodings[16];
1103 /* Count encoding arguments. */
1104 va_start (args, category);
1106 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1108 const char *encoding = encodings[n];
1109 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
/* Guards the next store into ENCODINGS, including the one for the
   terminating null pointer. */
1112 assert (n < sizeof encodings / sizeof *encodings);
/* Grow the table before claiming the next slot. */
1118 if (n_categories >= *allocated_categories)
1119 categories = x2nrealloc (categories,
1120 allocated_categories, sizeof *categories);
1122 c = &categories[n_categories++];
1123 c->category = category;
1124 c->encodings = xmalloc (n * sizeof *c->encodings);
1125 for (i = 0; i < n; i++)
1126 c->encodings[i] = encodings[i];
/* Populates the encoding category table with one add_category() call per
   group of encodings.  Every category name except "Unicode" is passed
   through _() for translation; the encoding names are literal iconv names.
   Each argument list is terminated with NULL_SENTINEL (for some calls the
   terminator is on a line that is not in view).

   Both get_encoding_categories() and get_n_encoding_categories() call this
   function, so it presumably returns early once the table has been built;
   that guard and the declaration of ALLOC are not in view -- confirm. */
1131 init_encoding_categories (void)
1141 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1142 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1143 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1145 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1146 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1147 "Windows-1257", NULL_SENTINEL);
1148 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1149 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1150 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1151 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1152 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1153 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1154 "EUC-TW", NULL_SENTINEL);
1155 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1156 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1157 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1158 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1159 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1161 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1162 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1163 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1164 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1165 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1167 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
/* NOTE(review): "MacDevangari" looks like a misspelling of "MacDevanagari".
   An unknown name is filtered out by add_category(), so as written this
   category would probably never list an encoding -- confirm against the
   names that iconv accepts. */
1168 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1169 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1170 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1172 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1174 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1175 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1177 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1178 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1180 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1182 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1183 "Windows-1258", NULL_SENTINEL);
1184 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1185 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1188 /* Returns an array of "struct encoding_category" that contains only the
1189 categories and encodings that the system supports. */
1190 struct encoding_category *
1191 get_encoding_categories (void)
1193 init_encoding_categories ();
1197 /* Returns the number of elements in the array returned by
1198 get_encoding_categories(). */
1200 get_n_encoding_categories (void)
1202 init_encoding_categories ();
1203 return n_categories;