1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/i18n.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/compiler.h"
34 #include "libpspp/hmapx.h"
35 #include "libpspp/hash-functions.h"
36 #include "libpspp/pool.h"
37 #include "libpspp/str.h"
38 #include "libpspp/version.h"
40 #include "gl/c-ctype.h"
41 #include "gl/c-strcase.h"
42 #include "gl/localcharset.h"
43 #include "gl/minmax.h"
44 #include "gl/xalloc.h"
45 #include "gl/relocatable.h"
46 #include "gl/xstrndup.h"
49 #define _(msgid) gettext (msgid)
59 static char *default_encoding;
60 static struct hmapx map;
62 /* A wrapper around iconv_open */
63 static struct converter *
64 create_iconv (const char* tocode, const char* fromcode)
67 struct hmapx_node *node;
68 struct converter *converter;
71 hash = hash_string (tocode, hash_string (fromcode, 0));
72 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
77 if (!strcmp (tocode, converter->tocode)
78 && !strcmp (fromcode, converter->fromcode))
82 converter = xmalloc (sizeof *converter);
83 converter->tocode = xstrdup (tocode);
84 converter->fromcode = xstrdup (fromcode);
85 converter->conv = iconv_open (tocode, fromcode);
86 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
87 /* I don't think it's safe to translate this string or to use messaging
88 as the converters have not yet been set up */
89 if (error && strcmp (tocode, fromcode))
93 "cannot create a converter for `%s' to `%s': %s\n",
94 fromcode, tocode, strerror (error));
96 free (converter->tocode);
97 free (converter->fromcode);
100 hmapx_insert (&map, NULL, hash);
104 /* Find out how many bytes there are in a null char in the target
106 iconv_t bconv = iconv_open (tocode, "ASCII");
107 if (bconv != (iconv_t) -1)
109 ICONV_CONST char *nullstr = strdup ("");
110 ICONV_CONST char *outbuf = strdup ("XXXXXXXX");
111 ICONV_CONST char *snullstr = nullstr;
112 ICONV_CONST char *soutbuf = outbuf;
115 const size_t bytes = 8;
116 size_t outbytes = bytes;
117 if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
118 converter->null_char_width = bytes - outbytes;
124 hmapx_insert (&map, converter, hash);
130 /* Converts the single byte C from encoding FROM to TO, returning the first
133 This function probably shouldn't be used at all, but some code still does
136 recode_byte (const char *to, const char *from, char c)
139 char *s = recode_string (to, from, &c, 1);
145 /* Similar to recode_string_pool, but allocates the returned value on the heap
146 instead of in a pool. It is the caller's responsibility to free the returned value. */
149 recode_string (const char *to, const char *from,
150 const char *text, int length)
152 return recode_string_pool (to, from, text, length, NULL);
155 /* Returns the length, in bytes, of the string that a similar recode_string()
156 call would return. */
158 recode_string_len (const char *to, const char *from,
159 const char *text, int length)
161 char *s = recode_string (to, from, text, length);
162 size_t len = strlen (s);
167 /* Uses CVTR to convert the INBYTES starting at IN into the OUTBYTES starting
168 at OUT_, and appends a null terminator to the output.
170 Returns the output length if successful, -1 if the output buffer is too
173 try_recode (struct converter *cvtr, char fallbackchar,
174 const char *in, size_t inbytes,
175 char *out_, size_t outbytes)
180 int null_bytes = cvtr->null_char_width;
182 /* Put the converter into the initial shift state, in case there was any
183 state information left over from its last usage. */
184 iconv (cvtr->conv, NULL, 0, NULL, 0);
186 /* Do two rounds of iconv() calls:
188 - The first round does the bulk of the conversion using the
189 caller-supplied input data..
191 - The second round flushes any leftover output. This has a real effect
192 with input encodings that use combining diacritics, e.g. without the
193 second round the last character tends to gets dropped when converting
194 from windows-1258 to other encodings.
196 for (i = 0; i < 2; i++)
198 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
199 size_t *inbytesp = i ? NULL : &inbytes;
201 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
205 if (outbytes < null_bytes + 1)
209 *out++ = fallbackchar;
210 for (j = 0 ; j < null_bytes ; ++j)
212 return out - 1 - out_;
219 *out++ = fallbackchar;
232 /* should never happen */
233 fprintf (stderr, "Character conversion error: %s\n",
240 if (outbytes <= null_bytes - 1)
243 for (i = 0 ; i < null_bytes ; ++i)
246 return out - 1 - out_;
249 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
250 dynamically allocated string in TO-encoding. Any characters which cannot be
251 converted will be represented by '?'.
253 LENGTH should be the length of the string or -1, if null terminated.
255 The returned string will be allocated on POOL.
257 This function's behaviour differs from that of g_convert_with_fallback
258 provided by GLib. The GLib function will fail (returns NULL) if any part of
259 the input string is not valid in the declared input encoding. This function
260 however perseveres even in the presence of badly encoded input. */
262 recode_string_pool (const char *to, const char *from,
263 const char *text, int length, struct pool *pool)
265 struct substring out;
271 length = strlen (text);
273 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
277 /* Returns the name of the encoding that should be used for file names.
279 This is meant to be the same encoding used by g_filename_from_uri() and
280 g_filename_to_uri() in GLib. */
282 filename_encoding (void)
284 #if defined _WIN32 || defined __WIN32__
287 return locale_charset ();
/* Concatenation helper: allocates A_LEN + B_LEN + 1 bytes with xmalloc(),
   copies the A_LEN bytes at A followed by the B_LEN bytes at B into the new
   buffer S, and null-terminates it.  Only the given lengths are copied, so
   neither input has to be null-terminated. */
292 xconcat2 (const char *a, size_t a_len,
293 const char *b, size_t b_len)
295 char *s = xmalloc (a_len + b_len + 1);
296 memcpy (s, a, a_len);
297 memcpy (s + a_len, b, b_len);
298 s[a_len + b_len] = '\0';
302 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
303 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
304 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
305 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
306 HEAD and tries again, repeating as necessary until the concatenated result
307 fits or until HEAD_LEN reaches 0.
309 [*] Actually this function drops grapheme clusters instead of characters, so
310 that, e.g. a Unicode character followed by a combining accent character
311 is either completely included or completely excluded from HEAD_LEN. See
312 UAX #29 at http://unicode.org/reports/tr29/ for more information on grapheme clusters.
315 A null ENCODING is treated as UTF-8.
317 Sometimes this function has to actually construct the concatenated string to
318 measure its length. When this happens, it sets *RESULTP to that
319 null-terminated string, allocated with malloc(), for the caller to use if it
320 needs it. Otherwise, it sets *RESULTP to NULL.
322 Simple examples for encoding="UTF-8", max_len=6:
324 head="abc", tail="xyz" => 3
325 head="abcd", tail="xyz" => 3 ("d" dropped).
326 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
327 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
329 Examples for encoding="ISO-8859-1", max_len=6:
331 head="éèä", tail="xyz" => 6
332 (each letter in head is only 1 byte in ISO-8859-1 even though they
333 each take 2 bytes in UTF-8 encoding) */
336 utf8_encoding_concat__ (const char *head, size_t head_len,
337 const char *tail, size_t tail_len,
338 const char *encoding, size_t max_len,
344 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
346 if (head_len + tail_len <= max_len)
348 else if (tail_len >= max_len)
358 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
360 ofs <= max_len - tail_len;
365 mblen = u8_mbtouc (&next,
366 CHAR_CAST (const uint8_t *, head + ofs),
368 if (uc_is_grapheme_break (prev, next))
381 result = (tail_len > 0
382 ? xconcat2 (head, head_len, tail, tail_len)
383 : CONST_CAST (char *, head));
384 if (recode_string_len (encoding, "UTF-8", result,
385 head_len + tail_len) <= max_len)
387 *resultp = result != head ? result : NULL;
392 bool correct_result = false;
399 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
406 mblen = u8_mbtouc (&next,
407 CHAR_CAST (const uint8_t *, head + ofs),
409 if (uc_is_grapheme_break (prev, next))
413 memcpy (result, head, ofs);
414 memcpy (result + ofs, tail, tail_len);
415 result[ofs + tail_len] = '\0';
418 if (recode_string_len (encoding, "UTF-8", result,
419 ofs + tail_len) <= max_len)
421 correct_result = true;
425 correct_result = false;
444 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
445 null-terminated string owned by the caller. HEAD, TAIL, and the returned
446 string are all encoded in UTF-8. As many characters[*] from the beginning
447 of HEAD are included as will fit within MAX_LEN bytes supposing that the
448 resulting string were to be re-encoded in ENCODING. All of TAIL is always
449 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
451 [*] Actually this function drops grapheme clusters instead of characters, so
452 that, e.g. a Unicode character followed by a combining accent character
453 is either completely included or completely excluded from the returned
454 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
455 information on grapheme clusters.
457 A null ENCODING is treated as UTF-8.
459 Simple examples for encoding="UTF-8", max_len=6:
461 head="abc", tail="xyz" => "abcxyz"
462 head="abcd", tail="xyz" => "abcxyz"
463 head="abc", tail="uvwxyz" => "uvwxyz"
464 head="abc", tail="tuvwxyz" => "tuvwxyz"
466 Examples for encoding="ISO-8859-1", max_len=6:
468 head="éèä", tail="xyz" => "éèäxyz"
469 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
470 each take 2 bytes in UTF-8 encoding) */
473 utf8_encoding_concat (const char *head, const char *tail,
474 const char *encoding, size_t max_len)
476 size_t tail_len = strlen (tail);
480 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
481 encoding, max_len, &result);
482 return (result != NULL
484 : xconcat2 (head, prefix_len, tail, tail_len));
487 /* Returns the length, in bytes, of the string that would be returned by
488 utf8_encoding_concat() if passed the same arguments, but the implementation
489 is often more efficient. */
491 utf8_encoding_concat_len (const char *head, const char *tail,
492 const char *encoding, size_t max_len)
494 size_t tail_len = strlen (tail);
498 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
499 encoding, max_len, &result);
501 return prefix_len + tail_len;
504 /* Returns an allocated, null-terminated string, owned by the caller,
505 containing as many characters[*] from the beginning of S that would fit
506 within MAX_LEN bytes if the returned string were to be re-encoded in
507 ENCODING. Both S and the returned string are encoded in UTF-8.
509 [*] Actually this function drops grapheme clusters instead of characters, so
510 that, e.g. a Unicode character followed by a combining accent character
511 is either completely included or completely excluded from the returned
512 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
513 information on grapheme clusters.
515 A null ENCODING is treated as UTF-8. */
518 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
520 return utf8_encoding_concat (s, "", encoding, max_len);
523 /* Returns the length, in bytes, of the string that would be returned by
524 utf8_encoding_trunc() if passed the same arguments, but the implementation
525 is often more efficient. */
527 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
529 return utf8_encoding_concat_len (s, "", encoding, max_len);
532 /* Returns FILENAME converted from UTF-8 to the filename encoding.
533 On Windows the filename encoding is UTF-8; elsewhere it is based on the current locale's character set. */
536 utf8_to_filename (const char *filename)
538 return recode_string (filename_encoding (), "UTF-8", filename, -1);
541 /* Returns FILENAME converted from the filename encoding to UTF-8.
542 On Windows the filename encoding is UTF-8; elsewhere it is based on the current locale's character set. */
545 filename_to_utf8 (const char *filename)
547 return recode_string ("UTF-8", filename_encoding (), filename, -1);
/* Worker shared by recode_substring_pool() and recode_pedantically():
   converts TEXT from FROM-encoding to TO-encoding with the converter that
   create_iconv() supplies, and stores the result, allocated from POOL with
   pool_malloc(), in *OUT.  FALLBACKCHAR is handed through to try_recode().
   NOTE(review): the return type, the braces, the conditions guarding several
   statements, and the return statements are not visible in this excerpt, so
   the return contract is not documented here. */
551 recode_substring_pool__ (const char *to, const char *from,
552 struct substring text, char fallbackchar,
553 struct pool *pool, struct substring *out)
556 struct converter *conv;
/* Either encoding name may be replaced by default_encoding.
   NOTE(review): the guarding conditions are not visible -- presumably a
   null TO or FROM; confirm. */
559 to = default_encoding;
562 from = default_encoding;
564 conv = create_iconv (to, from);
/* Verbatim path: *OUT becomes a null-terminated, unconverted copy of TEXT.
   NOTE(review): the condition selecting this path is not visible --
   presumably no usable converter; confirm. */
570 out->string = pool_malloc (pool, text.length + 1);
571 out->length = text.length;
572 memcpy (out->string, text.string, text.length);
573 out->string[out->length] = '\0';
/* Try successively larger output buffers, starting one byte larger than the
   input and doubling each time.  The loop ends if doubling ever yields a
   size that is not larger than the input length. */
580 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
582 char *output = pool_malloc (pool, bufsize);
585 retval = try_recode (conv, fallbackchar, text.string, text.length,
/* RETVAL is used as the length of the converted string here. */
589 *out = ss_buffer (output, retval);
/* The buffer of an attempt that is not kept goes back to the pool. */
592 pool_free (pool, output);
/* -E2BIG is the only result singled out here.
   NOTE(review): presumably it means the buffer was too small, so the loop
   continues with a larger one; confirm against the lines not shown. */
594 if (retval != -E2BIG)
601 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
602 dynamically allocated string in TO-encoding. Any characters which cannot be
603 converted will be represented by '?'.
605 The returned string will be null-terminated and allocated on POOL with pool_malloc().
608 This function's behaviour differs from that of g_convert_with_fallback
609 provided by GLib. The GLib function will fail (returns NULL) if any part of
610 the input string is not valid in the declared input encoding. This function
611 however perseveres even in the presence of badly encoded input. */
613 recode_substring_pool (const char *to, const char *from,
614 struct substring text, struct pool *pool)
616 struct substring out;
618 recode_substring_pool__ (to, from, text, '?', pool, &out);
622 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
623 dynamically allocated string in TO-encoding. On success, returns 0, and the
624 converted null-terminated string, allocated from POOL with pool_malloc(), is
625 stored in *OUT. On failure, returns a positive errno value.
627 The function fails with an error if any part of the input string is not
628 valid in the declared input encoding. */
630 recode_pedantically (const char *to, const char *from,
631 struct substring text, struct pool *pool,
632 struct substring *out)
636 error = recode_substring_pool__ (to, from, text, 0, pool, out);
645 setlocale (LC_ALL, "");
646 bindtextdomain (PACKAGE, relocate(locale_dir));
647 textdomain (PACKAGE);
649 assert (default_encoding == NULL);
650 default_encoding = xstrdup (locale_charset ());
656 get_default_encoding (void)
658 return default_encoding;
/* Replaces the file-scope default_encoding with a newly allocated copy
   (made with xstrdup()) of ENC, after freeing the previous value. */
662 set_default_encoding (const char *enc)
664 free (default_encoding);
665 default_encoding = xstrdup (enc);
669 /* Attempts to set the encoding from a locale name.
670 Returns true if successful.
671 This function does not (should not!) alter the current locale. */
674 set_encoding_from_locale (const char *loc)
679 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
681 setlocale (LC_CTYPE, "C");
682 c_encoding = xstrdup (locale_charset ());
684 setlocale (LC_CTYPE, loc);
685 loc_encoding = xstrdup (locale_charset ());
688 if ( 0 == strcmp (loc_encoding, c_encoding))
693 setlocale (LC_CTYPE, tmp);
699 free (default_encoding);
700 default_encoding = loc_encoding;
713 struct hmapx_node *node;
714 struct converter *cvtr;
716 HMAPX_FOR_EACH (cvtr, node, &map)
721 free (cvtr->fromcode);
722 if (cvtr->conv != (iconv_t) -1)
723 iconv_close (cvtr->conv);
727 hmapx_destroy (&map);
729 free (default_encoding);
730 default_encoding = NULL;
/* Probes ENC by asking iconv_open() for a converter with UTF8 as the target
   and ENC as the source, then tests the result against (iconv_t) -1, the
   failure value also checked for elsewhere in this file.
   NOTE(review): the statements that follow the test (the return values and
   any iconv_close() call) are not visible in this excerpt. */
736 valid_encoding (const char *enc)
738 iconv_t conv = iconv_open (UTF8, enc);
740 if ( conv == (iconv_t) -1)
749 /* Returns the system locale's idea of the
750 decimal separator character. */
752 get_system_decimal (void)
757 radix_char = nl_langinfo (RADIXCHAR)[0];
761 snprintf (buf, sizeof buf, "%f", 2.5);
/* Writes a short printable name for code point UC into the 16-byte BUFFER.
   Code points from 0x20 up to and including 0x7e are written as the
   character itself between a backquote and an apostrophe.  The second
   snprintf() writes "U+" followed by UC in uppercase hexadecimal, padded to
   at least four digits.
   NOTE(review): the line between the two snprintf() calls is not visible --
   presumably an `else', making the second form the one used for all other
   code points; the return value is not visible either. */
770 uc_name (ucs4_t uc, char buffer[16])
772 if (uc >= 0x20 && uc < 0x7f)
773 snprintf (buffer, 16, "`%c'", uc);
775 snprintf (buffer, 16, "U+%04X", uc);
779 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
781 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
782 with lowercase and uppercase letters treated as equal, starting from BASIS. */
785 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
787 uint8_t folded_buf[2048];
788 size_t folded_len = sizeof folded_buf;
792 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
793 NULL, UNINORM_NFKD, folded_buf, &folded_len);
794 if (folded_s != NULL)
796 hash = hash_bytes (folded_s, folded_len, basis);
797 if (folded_s != folded_buf)
804 hash = hash_bytes (s, n, basis);
810 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
811 uppercase letters treated as equal, starting from BASIS. */
813 utf8_hash_case_string (const char *s, unsigned int basis)
815 return utf8_hash_case_bytes (s, strlen (s), basis);
818 /* Compares UTF-8 strings A and B case-insensitively.
819 Returns a negative value if A < B, zero if A == B, positive if A > B. */
821 utf8_strcasecmp (const char *a, const char *b)
823 return utf8_strncasecmp (a, strlen (a), b, strlen (b));
826 /* Compares UTF-8 strings A (with length AN) and B (with length BN) case-insensitively.
828 Returns a negative value if A < B, zero if A == B, positive if A > B. */
830 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
834 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
835 CHAR_CAST (const uint8_t *, b), bn,
836 NULL, UNINORM_NFKD, &result))
841 result = memcmp (a, b, MIN (an, bn));
843 result = an < bn ? -1 : an > bn;
/* Scans the LEN bytes starting at S and tests each one with c_isdigit().
   NOTE(review): the return statements are not visible in this excerpt; the
   name and the way utf8_strverscasecmp() uses the result suggest that it
   returns true only when every byte is a digit -- confirm. */
850 is_all_digits (const uint8_t *s, size_t len)
852 for (size_t i = 0; i < len; i++)
853 if (!c_isdigit (s[i]))
858 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
859 number, then they are compared numerically. Returns a negative value if A <
860 B, zero if A == B, positive if A > B. */
862 utf8_strverscasecmp (const char *a, const char *b)
866 size_t a_len = sizeof a_stub;
867 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
868 UNINORM_NFKD, a_stub, &a_len);
872 size_t b_len = sizeof b_stub;
873 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
874 UNINORM_NFKD, b_stub, &b_len);
877 if (!a_norm || !b_norm)
879 result = strcmp (a, b);
883 size_t len = MIN (a_len, b_len);
884 for (size_t i = 0; i < len; i++)
885 if (a_norm[i] != b_norm[i])
887 /* If both strings end in digits, compare them numerically. */
888 if (is_all_digits (&a_norm[i], a_len - i)
889 && is_all_digits (&b_norm[i], b_len - i))
891 /* Start by stripping leading zeros, since those don't matter for
892 numerical comparison. */
894 for (ap = i; ap < a_len; ap++)
895 if (a_norm[ap] != '0')
897 for (bp = i; bp < b_len; bp++)
898 if (b_norm[bp] != '0')
901 /* The number with more digits, if there is one, is larger. */
902 size_t a_digits = a_len - ap;
903 size_t b_digits = b_len - bp;
904 if (a_digits != b_digits)
905 result = a_digits > b_digits ? 1 : -1;
907 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
910 result = a_norm[i] > b_norm[i] ? 1 : -1;
913 result = a_len < b_len ? -1 : a_len > b_len;
916 if (a_norm != a_stub)
918 if (b_norm != b_stub)
/* Applies the case-mapping function F to the null-terminated string S.
   utf8_to_upper() passes u8_toupper as F and utf8_to_lower() passes
   u8_tolower.
   NOTE(review): the declarations of `result' and `size', the condition that
   selects the xstrdup() fallback, and the return statement are not visible
   in this excerpt. */
924 utf8_casemap (const char *s,
925 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
926 uint8_t *, size_t *))
/* The length passed to F is strlen (s) + 1, so the null terminator is part
   of the data that F maps.  The three NULL arguments fill F's
   `const char *', `uninorm_t' and `uint8_t *' parameters. */
931 result = CHAR_CAST (char *,
932 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
933 NULL, NULL, NULL, &size));
/* Fallback: an unmodified copy of S.
   NOTE(review): presumably used when F fails; confirm. */
939 result = xstrdup (s);
945 utf8_to_upper (const char *s)
947 return utf8_casemap (s, u8_toupper);
951 utf8_to_lower (const char *s)
953 return utf8_casemap (s, u8_tolower);
/* Fills *E with properties of the encoding NAME, found by recoding small
   samples between NAME and UTF-8.  *E is zeroed first.
   NOTE(review): the return type, the braces, the assignment of e->unit, the
   start of the validity expression, and the return statement are not
   visible in this excerpt, so the return contract is not documented here. */
957 get_encoding_info (struct encoding_info *e, const char *name)
/* Sample of printable ASCII characters used for the compatibility test. */
959 const struct substring in = SS_LITERAL_INITIALIZER (
961 "!\"#$%&'()*+,-./0123456789:;<=>?@"
962 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
963 "abcdefghijklmnopqrstuvwxyz{|}~");
965 struct substring out, cr, lf, space;
968 memset (e, 0, sizeof *e);
/* Encode carriage return, line feed, and space in NAME. */
970 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
971 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
972 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
/* All three must have the same length, and no more than MAX_UNIT bytes. */
974 && cr.length <= MAX_UNIT
975 && cr.length == lf.length
976 && cr.length == space.length);
979 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
/* Substitute the plain one-byte forms.
   NOTE(review): presumably done only when the check above fails; confirm. */
983 ss_alloc_substring (&cr, ss_cstr ("\r"));
984 ss_alloc_substring (&lf, ss_cstr ("\n"));
985 ss_alloc_substring (&space, ss_cstr (" "));
/* Record e->unit bytes of each of the three characters. */
989 memcpy (e->cr, cr.string, e->unit);
990 memcpy (e->lf, lf.string, e->unit);
991 memcpy (e->space, space.string, e->unit);
/* ASCII compatible: the ASCII sample, read as NAME and converted to UTF-8,
   comes back unchanged. */
997 out = recode_substring_pool ("UTF-8", name, in, NULL);
998 e->is_ascii_compatible = ss_equals (in, out);
1001 if (!e->is_ascii_compatible && e->unit == 1)
/* EBCDIC compatible: "A" encoded in NAME is the single byte 0xc1.  The
   conversion has to go from UTF-8 to NAME, like the CR/LF/space probes
   above.  In the opposite direction the output is UTF-8, where a one-byte
   string can never be 0xc1, so the test could never succeed. */
1003 out = recode_substring_pool (name, "UTF-8", ss_cstr ("A"), NULL);
1004 e->is_ebcdic_compatible = (out.length == 1
1005 && (uint8_t) out.string[0] == 0xc1);
1009 e->is_ebcdic_compatible = false;
/* Returns the is_ascii_compatible flag that get_encoding_info() computes
   for ENCODING. */
1015 is_encoding_ascii_compatible (const char *encoding)
1017 struct encoding_info e;
1019 get_encoding_info (&e, encoding);
1020 return e.is_ascii_compatible;
/* Returns the is_ebcdic_compatible flag that get_encoding_info() computes
   for ENCODING. */
1024 is_encoding_ebcdic_compatible (const char *encoding)
1026 struct encoding_info e;
1028 get_encoding_info (&e, encoding);
1029 return e.is_ebcdic_compatible;
1032 /* Returns true if iconv can convert ENCODING to and from UTF-8, otherwise false. */
1035 is_encoding_supported (const char *encoding)
1037 return (create_iconv ("UTF-8", encoding)
1038 && create_iconv (encoding, "UTF-8"));
1041 /* Returns true if E is the name of a UTF-8 encoding.
1043 XXX Possibly we should test not E as a string but its properties via
1046 is_encoding_utf8 (const char *e)
1048 return ((e[0] == 'u' || e[0] == 'U')
1049 && (e[1] == 't' || e[1] == 'T')
1050 && (e[2] == 'f' || e[2] == 'F')
1051 && ((e[3] == '8' && e[4] == '\0')
1052 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
1055 static struct encoding_category *categories;
1056 static int n_categories;
/* Appends a category named CATEGORY to the file-scope `categories' array,
   growing the array with x2nrealloc() when n_categories has reached
   *ALLOCATED_CATEGORIES.  The variable arguments are encoding names,
   terminated by a null pointer.
   NOTE(review): the braces, the declarations of `args', `n' and `i', the
   statement controlled by the support test, and any early return are not
   visible in this excerpt. */
1058 static void SENTINEL (0)
1059 add_category (size_t *allocated_categories, const char *category, ...)
1061 struct encoding_category *c;
1062 const char *encodings[16];
1066 /* Count encoding arguments. */
1067 va_start (args, category);
/* Each name is read into encodings[n]; the loop stops at the null
   terminator. */
1069 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1071 const char *encoding = encodings[n];
/* A name passes if it is the literal "Auto" or if iconv supports it.
   NOTE(review): presumably n is advanced only for names that pass, so that
   others are overwritten by the next one; confirm. */
1072 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
/* encodings[] has 16 slots. */
1075 assert (n < sizeof encodings / sizeof *encodings);
1081 if (n_categories >= *allocated_categories)
1082 categories = x2nrealloc (categories,
1083 allocated_categories, sizeof *categories);
/* The new entry stores the caller's pointers, not copies of the strings:
   only the array of N pointers is allocated here. */
1085 c = &categories[n_categories++];
1086 c->category = category;
1087 c->encodings = xmalloc (n * sizeof *c->encodings);
1088 for (i = 0; i < n; i++)
1089 c->encodings[i] = encodings[i];
1094 init_encoding_categories (void)
1104 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1105 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1106 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1108 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1109 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1110 "Windows-1257", NULL_SENTINEL);
1111 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1112 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1113 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1114 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1115 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1116 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1117 "EUC-TW", NULL_SENTINEL);
1118 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1119 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1120 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1121 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1122 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1124 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1125 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1126 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1127 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1128 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1130 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1131 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1132 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1133 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1135 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1137 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1138 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1140 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1141 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1143 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1145 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1146 "Windows-1258", NULL_SENTINEL);
1147 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1148 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1151 /* Returns an array of "struct encoding_category" that contains only the
1152 categories and encodings that the system supports. */
1153 struct encoding_category *
1154 get_encoding_categories (void)
1156 init_encoding_categories ();
1160 /* Returns the number of elements in the array returned by
1161 get_encoding_categories(). */
1163 get_n_encoding_categories (void)
1165 init_encoding_categories ();
1166 return n_categories;