1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/i18n.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/compiler.h"
34 #include "libpspp/hmapx.h"
35 #include "libpspp/hash-functions.h"
36 #include "libpspp/pool.h"
37 #include "libpspp/str.h"
38 #include "libpspp/version.h"
40 #include "gl/c-ctype.h"
41 #include "gl/c-strcase.h"
42 #include "gl/localcharset.h"
43 #include <gl/localename.h>
44 #include "gl/minmax.h"
45 #include "gl/xalloc.h"
46 #include "gl/relocatable.h"
47 #include "gl/xstrndup.h"
50 #define _(msgid) gettext (msgid)
60 static char *default_encoding;
61 static struct hmapx map;
63 /* A wrapper around iconv_open */
64 static struct converter *
65 create_iconv (const char* tocode, const char* fromcode)
68 struct hmapx_node *node;
69 struct converter *converter;
72 hash = hash_string (tocode, hash_string (fromcode, 0));
73 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
78 if (!strcmp (tocode, converter->tocode)
79 && !strcmp (fromcode, converter->fromcode))
83 converter = xmalloc (sizeof *converter);
84 converter->tocode = xstrdup (tocode);
85 converter->fromcode = xstrdup (fromcode);
86 converter->conv = iconv_open (tocode, fromcode);
87 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
88 /* I don't think it's safe to translate this string or to use messaging
89 as the converters have not yet been set up */
90 if (error && strcmp (tocode, fromcode))
94 "cannot create a converter for `%s' to `%s': %s\n",
95 fromcode, tocode, strerror (error));
97 free (converter->tocode);
98 free (converter->fromcode);
101 hmapx_insert (&map, NULL, hash);
105 /* Find out how many bytes there are in a null char in the target
107 iconv_t bconv = iconv_open (tocode, "ASCII");
108 if (bconv != (iconv_t) -1)
110 ICONV_CONST char inbuf[1] = "";
111 ICONV_CONST char *inptr = inbuf;
112 size_t inbytes = sizeof inbuf;
115 char *outptr = outbuf;
116 size_t outbytes = sizeof outbuf;
117 if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
118 converter->null_char_width = outptr - outbuf;
122 hmapx_insert (&map, converter, hash);
128 /* Converts the single byte C from encoding FROM to TO, returning the first
131 This function probably shouldn't be used at all, but some code still does
134 recode_byte (const char *to, const char *from, char c)
137 char *s = recode_string (to, from, &c, 1);
143 /* Similar to recode_string_pool, but allocates the returned value on the heap
144 instead of in a pool. It is the caller's responsibility to free the
147 recode_string (const char *to, const char *from,
148 const char *text, int length)
150 return recode_string_pool (to, from, text, length, NULL);
153 /* Returns the length, in bytes, of the string that a similar recode_string()
154 call would return. */
156 recode_string_len (const char *to, const char *from,
157 const char *text, int length)
159 char *s = recode_string (to, from, text, length);
160 size_t len = strlen (s);
165 /* Uses CVTR to convert the INBYTES starting at IN into the OUTBYTES starting
166 at OUT_, and appends a null terminator to the output.
168 Returns the output length if successful, -1 if the output buffer is too
171 try_recode (struct converter *cvtr, char fallbackchar,
172 const char *in, size_t inbytes,
173 char *out_, size_t outbytes)
178 int null_bytes = cvtr->null_char_width;
180 /* Put the converter into the initial shift state, in case there was any
181 state information left over from its last usage. */
182 iconv (cvtr->conv, NULL, 0, NULL, 0);
184 /* Do two rounds of iconv() calls:
186 - The first round does the bulk of the conversion using the
187 caller-supplied input data.
189 - The second round flushes any leftover output. This has a real effect
190 with input encodings that use combining diacritics, e.g. without the
191 second round the last character tends to get dropped when converting
192 from windows-1258 to other encodings.
194 for (i = 0; i < 2; i++)
196 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
197 size_t *inbytesp = i ? NULL : &inbytes;
199 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
203 if (outbytes < null_bytes + 1)
207 *out++ = fallbackchar;
208 for (j = 0 ; j < null_bytes ; ++j)
210 return out - 1 - out_;
217 *out++ = fallbackchar;
230 /* should never happen */
231 fprintf (stderr, "Character conversion error: %s\n",
238 if (outbytes <= null_bytes - 1)
241 for (i = 0 ; i < null_bytes ; ++i)
244 return out - 1 - out_;
247 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
248 dynamically allocated string in TO-encoding. Any characters which cannot be
249 converted will be represented by '?'.
251 LENGTH should be the length of the string, or -1 if it is null-terminated.
253 The returned string will be allocated on POOL.
255 This function's behaviour differs from that of g_convert_with_fallback
256 provided by GLib. The GLib function will fail (returns NULL) if any part of
257 the input string is not valid in the declared input encoding. This function
258 however perseveres even in the presence of badly encoded input. */
260 recode_string_pool (const char *to, const char *from,
261 const char *text, int length, struct pool *pool)
263 struct substring out;
269 length = strlen (text);
271 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
275 /* Returns the name of the encoding that should be used for file names.
277 This is meant to be the same encoding used by g_filename_from_uri() and
278 g_filename_to_uri() in GLib. */
280 filename_encoding (void)
282 #if defined _WIN32 || defined __WIN32__
285 return locale_charset ();
290 xconcat2 (const char *a, size_t a_len,
291 const char *b, size_t b_len)
293 char *s = xmalloc (a_len + b_len + 1);
294 memcpy (s, a, a_len);
295 memcpy (s + a_len, b, b_len);
296 s[a_len + b_len] = '\0';
300 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
301 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
302 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
303 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
304 HEAD and tries again, repeating as necessary until the concatenated result
305 fits or until HEAD_LEN reaches 0.
307 [*] Actually this function drops grapheme clusters instead of characters, so
308 that, e.g. a Unicode character followed by a combining accent character
309 is either completely included or completely excluded from HEAD_LEN. See
310 UAX #29 at http://unicode.org/reports/tr29/ for more information on
313 A null ENCODING is treated as UTF-8.
315 Sometimes this function has to actually construct the concatenated string to
316 measure its length. When this happens, it sets *RESULTP to that
317 null-terminated string, allocated with malloc(), for the caller to use if it
318 needs it. Otherwise, it sets *RESULTP to NULL.
320 Simple examples for encoding="UTF-8", max_len=6:
322 head="abc", tail="xyz" => 3
323 head="abcd", tail="xyz" => 3 ("d" dropped).
324 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
325 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
327 Examples for encoding="ISO-8859-1", max_len=6:
329 head="éèä", tail="xyz" => 6
330 (each letter in head is only 1 byte in ISO-8859-1 even though they
331 each take 2 bytes in UTF-8 encoding)
334 utf8_encoding_concat__ (const char *head, size_t head_len,
335 const char *tail, size_t tail_len,
336 const char *encoding, size_t max_len,
342 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
344 if (head_len + tail_len <= max_len)
346 else if (tail_len >= max_len)
356 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
358 ofs <= max_len - tail_len;
363 mblen = u8_mbtouc (&next,
364 CHAR_CAST (const uint8_t *, head + ofs),
366 if (uc_is_grapheme_break (prev, next))
379 result = (tail_len > 0
380 ? xconcat2 (head, head_len, tail, tail_len)
381 : CONST_CAST (char *, head));
382 if (recode_string_len (encoding, "UTF-8", result,
383 head_len + tail_len) <= max_len)
385 *resultp = result != head ? result : NULL;
390 bool correct_result = false;
397 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
404 mblen = u8_mbtouc (&next,
405 CHAR_CAST (const uint8_t *, head + ofs),
407 if (uc_is_grapheme_break (prev, next))
411 memcpy (result, head, ofs);
412 memcpy (result + ofs, tail, tail_len);
413 result[ofs + tail_len] = '\0';
416 if (recode_string_len (encoding, "UTF-8", result,
417 ofs + tail_len) <= max_len)
419 correct_result = true;
423 correct_result = false;
442 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
443 null-terminated string owned by the caller. HEAD, TAIL, and the returned
444 string are all encoded in UTF-8. As many characters[*] from the beginning
445 of HEAD are included as will fit within MAX_LEN bytes supposing that the
446 resulting string were to be re-encoded in ENCODING. All of TAIL is always
447 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
449 [*] Actually this function drops grapheme clusters instead of characters, so
450 that, e.g. a Unicode character followed by a combining accent character
451 is either completely included or completely excluded from the returned
452 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
453 information on grapheme clusters.
455 A null ENCODING is treated as UTF-8.
457 Simple examples for encoding="UTF-8", max_len=6:
459 head="abc", tail="xyz" => "abcxyz"
460 head="abcd", tail="xyz" => "abcxyz"
461 head="abc", tail="uvwxyz" => "uvwxyz"
462 head="abc", tail="tuvwxyz" => "tuvwxyz"
464 Examples for encoding="ISO-8859-1", max_len=6:
466 head="éèä", tail="xyz" => "éèäxyz"
467 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
468 each take 2 bytes in UTF-8 encoding)
471 utf8_encoding_concat (const char *head, const char *tail,
472 const char *encoding, size_t max_len)
474 size_t tail_len = strlen (tail);
478 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
479 encoding, max_len, &result);
480 return (result != NULL
482 : xconcat2 (head, prefix_len, tail, tail_len));
485 /* Returns the length, in bytes, of the string that would be returned by
486 utf8_encoding_concat() if passed the same arguments, but the implementation
487 is often more efficient. */
489 utf8_encoding_concat_len (const char *head, const char *tail,
490 const char *encoding, size_t max_len)
492 size_t tail_len = strlen (tail);
496 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
497 encoding, max_len, &result);
499 return prefix_len + tail_len;
502 /* Returns an allocated, null-terminated string, owned by the caller,
503 containing as many characters[*] from the beginning of S as would fit
504 within MAX_LEN bytes if the returned string were to be re-encoded in
505 ENCODING. Both S and the returned string are encoded in UTF-8.
507 [*] Actually this function drops grapheme clusters instead of characters, so
508 that, e.g. a Unicode character followed by a combining accent character
509 is either completely included or completely excluded from the returned
510 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
511 information on grapheme clusters.
513 A null ENCODING is treated as UTF-8.
516 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
518 return utf8_encoding_concat (s, "", encoding, max_len);
521 /* Returns the length, in bytes, of the string that would be returned by
522 utf8_encoding_trunc() if passed the same arguments, but the implementation
523 is often more efficient. */
525 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
527 return utf8_encoding_concat_len (s, "", encoding, max_len);
530 /* Returns FILENAME converted from UTF-8 to the filename encoding.
531 On Windows the filename encoding is UTF-8; elsewhere it is based on the
534 utf8_to_filename (const char *filename)
536 return recode_string (filename_encoding (), "UTF-8", filename, -1);
539 /* Returns FILENAME converted from the filename encoding to UTF-8.
540 On Windows the filename encoding is UTF-8; elsewhere it is based on the
543 filename_to_utf8 (const char *filename)
545 return recode_string ("UTF-8", filename_encoding (), filename, -1);
549 recode_substring_pool__ (const char *to, const char *from,
550 struct substring text, char fallbackchar,
551 struct pool *pool, struct substring *out)
554 struct converter *conv;
557 to = default_encoding;
560 from = default_encoding;
562 conv = create_iconv (to, from);
568 out->string = pool_malloc (pool, text.length + 1);
569 out->length = text.length;
570 memcpy (out->string, text.string, text.length);
571 out->string[out->length] = '\0';
578 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
580 char *output = pool_malloc (pool, bufsize);
583 retval = try_recode (conv, fallbackchar, text.string, text.length,
587 *out = ss_buffer (output, retval);
590 pool_free (pool, output);
592 if (retval != -E2BIG)
599 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
600 dynamically allocated string in TO-encoding. Any characters which cannot be
601 converted will be represented by '?'.
603 The returned string will be null-terminated and allocated on POOL with
606 This function's behaviour differs from that of g_convert_with_fallback
607 provided by GLib. The GLib function will fail (returns NULL) if any part of
608 the input string is not valid in the declared input encoding. This function
609 however perseveres even in the presence of badly encoded input. */
611 recode_substring_pool (const char *to, const char *from,
612 struct substring text, struct pool *pool)
614 struct substring out;
616 recode_substring_pool__ (to, from, text, '?', pool, &out);
620 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
621 dynamically allocated string in TO-encoding. On success, returns 0, and the
622 converted null-terminated string, allocated from POOL with pool_malloc(), is
623 stored in *OUT. On failure, returns a positive errno value.
625 The function fails with an error if any part of the input string is not
626 valid in the declared input encoding. */
628 recode_pedantically (const char *to, const char *from,
629 struct substring text, struct pool *pool,
630 struct substring *out)
634 error = recode_substring_pool__ (to, from, text, 0, pool, out);
643 setlocale (LC_ALL, "");
645 bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
647 textdomain (PACKAGE);
649 assert (default_encoding == NULL);
650 default_encoding = xstrdup (locale_charset ());
656 get_default_encoding (void)
658 return default_encoding;
662 set_default_encoding (const char *enc)
664 free (default_encoding);
665 default_encoding = xstrdup (enc);
668 /* Return the ISO two letter code for the current LC_MESSAGES
673 const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
674 if (0 == strcmp (localename, "C"))
676 char *ln = xstrdup (localename);
677 char *end = strchr (ln, '_');
684 /* Attempts to set the encoding from a locale name.
685 Returns true if successful.
686 This function does not (should not!) alter the current locale.
689 set_encoding_from_locale (const char *loc)
694 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
696 setlocale (LC_CTYPE, "C");
697 c_encoding = xstrdup (locale_charset ());
699 setlocale (LC_CTYPE, loc);
700 loc_encoding = xstrdup (locale_charset ());
703 if (0 == strcmp (loc_encoding, c_encoding))
708 setlocale (LC_CTYPE, tmp);
714 free (default_encoding);
715 default_encoding = loc_encoding;
728 struct hmapx_node *node;
729 struct converter *cvtr;
731 HMAPX_FOR_EACH (cvtr, node, &map)
736 free (cvtr->fromcode);
737 if (cvtr->conv != (iconv_t) -1)
738 iconv_close (cvtr->conv);
742 hmapx_destroy (&map);
744 free (default_encoding);
745 default_encoding = NULL;
751 valid_encoding (const char *enc)
753 iconv_t conv = iconv_open (UTF8, enc);
755 if (conv == (iconv_t) -1)
764 /* Return the system locale's idea of the
765 decimal separator character */
767 get_system_decimal (void)
772 radix_char = nl_langinfo (RADIXCHAR)[0];
776 snprintf (buf, sizeof buf, "%f", 2.5);
785 uc_name (ucs4_t uc, char buffer[16])
787 if (uc >= 0x20 && uc < 0x7f)
788 snprintf (buffer, 16, "`%c'", uc);
790 snprintf (buffer, 16, "U+%04X", uc);
794 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
796 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
797 with lowercase and uppercase letters treated as equal, starting from
800 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
802 uint8_t folded_buf[2048];
803 size_t folded_len = sizeof folded_buf;
807 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
808 NULL, UNINORM_NFKD, folded_buf, &folded_len);
809 if (folded_s != NULL)
811 hash = hash_bytes (folded_s, folded_len, basis);
812 if (folded_s != folded_buf)
819 hash = hash_bytes (s, n, basis);
825 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
826 uppercase letters treated as equal, starting from BASIS. */
828 utf8_hash_case_string (const char *s, unsigned int basis)
830 return utf8_hash_case_bytes (s, strlen (s), basis);
833 /* Compares UTF-8 strings A and B case-insensitively.
834 Returns a negative value if A < B, zero if A == B, positive if A > B. */
836 utf8_strcasecmp (const char *a, const char *b)
838 return utf8_strncasecmp (a, strlen (a), b, strlen (b));
841 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
843 Returns a negative value if A < B, zero if A == B, positive if A > B. */
845 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
849 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
850 CHAR_CAST (const uint8_t *, b), bn,
851 NULL, UNINORM_NFKD, &result))
856 result = memcmp (a, b, MIN (an, bn));
858 result = an < bn ? -1 : an > bn;
865 is_all_digits (const uint8_t *s, size_t len)
867 for (size_t i = 0; i < len; i++)
868 if (!c_isdigit (s[i]))
873 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
874 number, then they are compared numerically. Returns a negative value if A <
875 B, zero if A == B, positive if A > B. */
877 utf8_strverscasecmp (const char *a, const char *b)
881 size_t a_len = sizeof a_stub;
882 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
883 UNINORM_NFKD, a_stub, &a_len);
887 size_t b_len = sizeof b_stub;
888 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
889 UNINORM_NFKD, b_stub, &b_len);
892 if (!a_norm || !b_norm)
894 result = strcmp (a, b);
898 size_t len = MIN (a_len, b_len);
899 for (size_t i = 0; i < len; i++)
900 if (a_norm[i] != b_norm[i])
902 /* If both strings end in digits, compare them numerically. */
903 if (is_all_digits (&a_norm[i], a_len - i)
904 && is_all_digits (&b_norm[i], b_len - i))
906 /* Start by stripping leading zeros, since those don't matter for
907 numerical comparison. */
909 for (ap = i; ap < a_len; ap++)
910 if (a_norm[ap] != '0')
912 for (bp = i; bp < b_len; bp++)
913 if (b_norm[bp] != '0')
916 /* The number with more digits, if there is one, is larger. */
917 size_t a_digits = a_len - ap;
918 size_t b_digits = b_len - bp;
919 if (a_digits != b_digits)
920 result = a_digits > b_digits ? 1 : -1;
922 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
925 result = a_norm[i] > b_norm[i] ? 1 : -1;
928 result = a_len < b_len ? -1 : a_len > b_len;
931 if (a_norm != a_stub)
933 if (b_norm != b_stub)
939 utf8_casemap (const char *s,
940 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
941 uint8_t *, size_t *))
946 result = CHAR_CAST (char *,
947 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
948 NULL, NULL, NULL, &size));
954 result = xstrdup (s);
960 utf8_to_upper (const char *s)
962 return utf8_casemap (s, u8_toupper);
966 utf8_to_lower (const char *s)
968 return utf8_casemap (s, u8_tolower);
972 utf8_to_title (const char *s)
974 return utf8_casemap (s, u8_totitle);
978 get_encoding_info (struct encoding_info *e, const char *name)
980 const struct substring in = SS_LITERAL_INITIALIZER (
982 "!\"#$%&'()*+,-./0123456789:;<=>?@"
983 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
984 "abcdefghijklmnopqrstuvwxyz{|}~");
986 struct substring out, cr, lf, space;
989 memset (e, 0, sizeof *e);
991 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
992 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
993 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
995 && cr.length <= MAX_UNIT
996 && cr.length == lf.length
997 && cr.length == space.length);
1000 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1003 ss_dealloc (&space);
1004 ss_alloc_substring (&cr, ss_cstr ("\r"));
1005 ss_alloc_substring (&lf, ss_cstr ("\n"));
1006 ss_alloc_substring (&space, ss_cstr (" "));
1009 e->unit = cr.length;
1010 memcpy (e->cr, cr.string, e->unit);
1011 memcpy (e->lf, lf.string, e->unit);
1012 memcpy (e->space, space.string, e->unit);
1016 ss_dealloc (&space);
1018 out = recode_substring_pool ("UTF-8", name, in, NULL);
1019 e->is_ascii_compatible = ss_equals (in, out);
1022 if (!e->is_ascii_compatible && e->unit == 1)
1024 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1025 e->is_ebcdic_compatible = (out.length == 1
1026 && (uint8_t) out.string[0] == 0xc1);
1030 e->is_ebcdic_compatible = false;
1036 is_encoding_ascii_compatible (const char *encoding)
1038 struct encoding_info e;
1040 get_encoding_info (&e, encoding);
1041 return e.is_ascii_compatible;
1045 is_encoding_ebcdic_compatible (const char *encoding)
1047 struct encoding_info e;
1049 get_encoding_info (&e, encoding);
1050 return e.is_ebcdic_compatible;
1053 /* Returns true if iconv can convert ENCODING to and from UTF-8,
1056 is_encoding_supported (const char *encoding)
1058 return (create_iconv ("UTF-8", encoding)
1059 && create_iconv (encoding, "UTF-8"));
1062 /* Returns true if E is the name of a UTF-8 encoding.
1064 XXX Possibly we should test not E as a string but its properties via
1067 is_encoding_utf8 (const char *e)
1069 return ((e[0] == 'u' || e[0] == 'U')
1070 && (e[1] == 't' || e[1] == 'T')
1071 && (e[2] == 'f' || e[2] == 'F')
1072 && ((e[3] == '8' && e[4] == '\0')
1073 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
1076 static struct encoding_category *categories;
1077 static int n_categories;
1079 static void SENTINEL (0)
1080 add_category (size_t *allocated_categories, const char *category, ...)
1082 struct encoding_category *c;
1083 const char *encodings[16];
1087 /* Count encoding arguments. */
1088 va_start (args, category);
1090 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1092 const char *encoding = encodings[n];
1093 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1096 assert (n < sizeof encodings / sizeof *encodings);
1102 if (n_categories >= *allocated_categories)
1103 categories = x2nrealloc (categories,
1104 allocated_categories, sizeof *categories);
1106 c = &categories[n_categories++];
1107 c->category = category;
1108 c->encodings = xmalloc (n * sizeof *c->encodings);
1109 for (i = 0; i < n; i++)
1110 c->encodings[i] = encodings[i];
1115 init_encoding_categories (void)
1125 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1126 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1127 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1129 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1130 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1131 "Windows-1257", NULL_SENTINEL);
1132 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1133 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1134 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1135 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1136 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1137 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1138 "EUC-TW", NULL_SENTINEL);
1139 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1140 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1141 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1142 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1143 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1145 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1146 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1147 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1148 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1149 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1151 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1152 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1153 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1154 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1156 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1158 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1159 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1161 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1162 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1164 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1166 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1167 "Windows-1258", NULL_SENTINEL);
1168 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1169 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1172 /* Returns an array of "struct encoding_category" that contains only the
1173 categories and encodings that the system supports. */
1174 struct encoding_category *
1175 get_encoding_categories (void)
1177 init_encoding_categories ();
1181 /* Returns the number of elements in the array returned by
1182 get_encoding_categories(). */
1184 get_n_encoding_categories (void)
1186 init_encoding_categories ();
1187 return n_categories;