1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/i18n.h"
32 #include "libpspp/assertion.h"
33 #include "libpspp/compiler.h"
34 #include "libpspp/hmapx.h"
35 #include "libpspp/hash-functions.h"
36 #include "libpspp/pool.h"
37 #include "libpspp/str.h"
38 #include "libpspp/version.h"
40 #include "gl/c-strcase.h"
41 #include "gl/localcharset.h"
42 #include "gl/minmax.h"
43 #include "gl/xalloc.h"
44 #include "gl/relocatable.h"
45 #include "gl/xstrndup.h"
48 #define _(msgid) gettext (msgid)
/* Name of the default encoding.  It is initialised from locale_charset(),
   handed out by get_default_encoding(), replaced by set_default_encoding()
   and set_encoding_from_locale(), and assigned to TO and FROM inside
   recode_substring_pool__() (presumably when they are null). */
58 static char *default_encoding;
/* Cache of the converters built by create_iconv(), hashed on the target and
   source encoding names.  create_iconv() also stores a null entry on an
   error path. */
59 static struct hmapx map;
61 /* A wrapper around iconv_open that caches converters in MAP.

   The cache is searched for an entry whose TOCODE and FROMCODE both compare
   equal with strcmp().  For a new pair a converter is allocated, both names
   are copied into it, and iconv_open (TOCODE, FROMCODE) is called; errno is
   captured immediately when that fails.  On failure an untranslated
   diagnostic naming both encodings is formatted unless the two names are
   identical, and a null entry is inserted into MAP under the same hash.

   The width in bytes of a null character in TOCODE is then measured by
   converting an empty string through a temporary "ASCII"-to-TOCODE
   converter and counting how much of an 8-byte output buffer was consumed.
   Finally the converter is inserted into MAP.

   is_encoding_supported() tests the return value for truth, so a null
   return presumably means that no converter is available.

   NOTE(review): the two strdup() results used by the probe are not checked
   for null in the lines shown, unlike the xmalloc()/xstrdup() calls above.
   NOTE(review): null_char_width is only assigned when the probe conversion
   succeeds in the lines shown; confirm that it has a defined value
   otherwise. */
62 static struct converter *
63 create_iconv (const char* tocode, const char* fromcode)
66 struct hmapx_node *node;
67 struct converter *converter;
70 hash = hash_string (tocode, hash_string (fromcode, 0));
71 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
76 if (!strcmp (tocode, converter->tocode)
77 && !strcmp (fromcode, converter->fromcode))
81 converter = xmalloc (sizeof *converter);
82 converter->tocode = xstrdup (tocode);
83 converter->fromcode = xstrdup (fromcode);
84 converter->conv = iconv_open (tocode, fromcode);
85 int error = converter->conv == (iconv_t) -1 ? errno : 0;
86 /* I don't think it's safe to translate this string or to use messaging
87 as the converters have not yet been set up */
88 if (error && strcmp (tocode, fromcode))
92 "cannot create a converter for `%s' to `%s': %s\n",
93 fromcode, tocode, strerror (error));
95 hmapx_insert (&map, NULL, hash);
99 /* Find out how many bytes there are in a null char in the target
101 iconv_t bconv = iconv_open (tocode, "ASCII");
102 if (bconv != (iconv_t) -1)
104 ICONV_CONST char *nullstr = strdup ("");
105 ICONV_CONST char *outbuf = strdup ("XXXXXXXX");
106 ICONV_CONST char *snullstr = nullstr;
107 ICONV_CONST char *soutbuf = outbuf;
110 const size_t bytes = 8;
111 size_t outbytes = bytes;
112 if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
113 converter->null_char_width = bytes - outbytes;
119 hmapx_insert (&map, converter, hash);
/* recode_byte(): the conversion is done by passing the address of C, with a
   length of 1, to recode_string().
   NOTE(review): recode_string() returns heap memory; confirm that S is
   freed once the result has been extracted from it. */
125 /* Converts the single byte C from encoding FROM to TO, returning the first
128 This function probably shouldn't be used at all, but some code still does
131 recode_byte (const char *to, const char *from, char c)
134 char *s = recode_string (to, from, &c, 1);
/* recode_string(): thin wrapper that forwards TO, FROM, TEXT and LENGTH to
   recode_string_pool() with a null POOL. */
140 /* Similar to recode_string_pool, but allocates the returned value on the heap
141 instead of in a pool. It is the caller's responsibility to free the
144 recode_string (const char *to, const char *from,
145 const char *text, int length)
147 return recode_string_pool (to, from, text, length, NULL);
150 /* Returns the length, in bytes, of the string that a similar recode_string()
151 call would return.

   The length is found by actually performing the conversion with
   recode_string() and applying strlen() to the result, so it counts bytes
   up to the first null byte.
   NOTE(review): confirm that the temporary string S is freed. */
153 recode_string_len (const char *to, const char *from,
154 const char *text, int length)
156 char *s = recode_string (to, from, text, length);
157 size_t len = strlen (s);
/* try_recode(): review notes on the visible implementation.

   - The converter is first reset to its initial shift state with an
     all-null iconv() call.
   - iconv() is then driven in two rounds; in the second round the input
     pointers are null so that pending output is flushed.
   - While iconv() reports failure, the visible handling checks the
     remaining output space against NULL_BYTES + 1 and stores FALLBACKCHAR
     in the output; an unexpected error is reported on stderr.
   - Afterwards the remaining space is checked against NULL_BYTES - 1, a
     loop runs NULL_BYTES times (presumably writing the terminator), and
     the value returned is OUT - 1 - OUT_.

   NOTE(review): the initialiser of INP ends in the character `∈', which
   looks like a mis-decoded "&in;" -- compare with the upstream file.
   NOTE(review): OUTBYTES is a size_t while NULL_BYTES is an int, so both
   space checks are presumably evaluated in unsigned arithmetic; if
   NULL_BYTES were ever 0 the second check would always be true.  Confirm
   that null_char_width is at least 1. */
162 /* Uses CONV to convert the INBYTES starting at IP into the OUTBYTES starting
163 at OP, and appends a null terminator to the output.
165 Returns the output length if successful, -1 if the output buffer is too
168 try_recode (struct converter *cvtr, char fallbackchar,
169 const char *in, size_t inbytes,
170 char *out_, size_t outbytes)
175 int null_bytes = cvtr->null_char_width;
177 /* Put the converter into the initial shift state, in case there was any
178 state information left over from its last usage. */
179 iconv (cvtr->conv, NULL, 0, NULL, 0);
181 /* Do two rounds of iconv() calls:
183 - The first round does the bulk of the conversion using the
184 caller-supplied input data..
186 - The second round flushes any leftover output. This has a real effect
187 with input encodings that use combining diacritics, e.g. without the
188 second round the last character tends to gets dropped when converting
189 from windows-1258 to other encodings.
191 for (i = 0; i < 2; i++)
193 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
194 size_t *inbytesp = i ? NULL : &inbytes;
196 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
200 if (outbytes < null_bytes + 1)
204 *out++ = fallbackchar;
205 for (j = 0 ; j < null_bytes ; ++j)
207 return out - 1 - out_;
214 *out++ = fallbackchar;
227 /* should never happen */
228 fprintf (stderr, "Character conversion error: %s\n",
235 if (outbytes <= null_bytes - 1)
238 for (i = 0 ; i < null_bytes ; ++i)
241 return out - 1 - out_;
244 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
245 dynamically allocated string in TO-encoding. Any characters which cannot be
246 converted will be represented by '?'.
248 LENGTH should be the length of the string or -1, if null terminated.
250 The returned string will be allocated on POOL.
252 This function's behaviour differs from that of g_convert_with_fallback
253 provided by GLib. The GLib function will fail (returns NULL) if any part of
254 the input string is not valid in the declared input encoding. This function
255 however perseveres even in the presence of badly encoded input.

   The conversion itself is delegated to recode_substring_pool(), applied
   to the LENGTH bytes at TEXT; strlen() supplies LENGTH for
   null-terminated input. */
257 recode_string_pool (const char *to, const char *from,
258 const char *text, int length, struct pool *pool)
260 struct substring out;
266 length = strlen (text);
268 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
272 /* Returns the name of the encoding that should be used for file names.
274 This is meant to be the same encoding used by g_filename_from_uri() and
275 g_filename_to_uri() in GLib.

   The choice is made at compile time by testing _WIN32 and __WIN32__;
   locale_charset() supplies the name in one of the two branches
   (presumably the non-Windows one -- the other branch is not visible). */
277 filename_encoding (void)
279 #if defined _WIN32 || defined __WIN32__
282 return locale_charset ();
/* Builds the concatenation of the A_LEN bytes at A and the B_LEN bytes at B
   in a fresh xmalloc() block of A_LEN + B_LEN + 1 bytes and null-terminates
   it.  Neither input has to be null-terminated, since only the given
   lengths are copied. */
287 xconcat2 (const char *a, size_t a_len,
288 const char *b, size_t b_len)
290 char *s = xmalloc (a_len + b_len + 1);
291 memcpy (s, a, a_len);
292 memcpy (s + a_len, b, b_len);
293 s[a_len + b_len] = '\0';
/* utf8_encoding_concat__(): review notes on the visible implementation.

   - When ENCODING is null or "UTF-8" (compared case-insensitively), no
     conversion is needed: the answer follows from byte lengths, using a
     scan over HEAD that tests for grapheme cluster breaks.
   - Otherwise HEAD and TAIL are joined with xconcat2() (HEAD itself is used
     when TAIL_LEN is 0) and the recoded length is measured with
     recode_string_len().  If it fits, *RESULTP receives the joined string,
     or NULL when that string is HEAD itself.
   - If it does not fit, HEAD is scanned again by grapheme cluster; the
     candidate prefix plus TAIL is assembled in RESULT and measured the same
     way, with CORRECT_RESULT tracking whether RESULT currently holds a
     string that fits.

   NOTE(review): every recode_string_len() call performs a full conversion,
   so the last strategy presumably converts once per candidate prefix. */
297 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
298 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
299 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
300 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
301 HEAD and tries again, repeating as necessary until the concatenated result
302 fits or until HEAD_LEN reaches 0.
304 [*] Actually this function drops grapheme clusters instead of characters, so
305 that, e.g. a Unicode character followed by a combining accent character
306 is either completely included or completely excluded from HEAD_LEN. See
307 UAX #29 at http://unicode.org/reports/tr29/ for more information on
310 A null ENCODING is treated as UTF-8.
312 Sometimes this function has to actually construct the concatenated string to
313 measure its length. When this happens, it sets *RESULTP to that
314 null-terminated string, allocated with malloc(), for the caller to use if it
315 needs it. Otherwise, it sets *RESULTP to NULL.
317 Simple examples for encoding="UTF-8", max_len=6:
319 head="abc", tail="xyz" => 3
320 head="abcd", tail="xyz" => 3 ("d" dropped).
321 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
322 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
324 Examples for encoding="ISO-8859-1", max_len=6:
326 head="éèä", tail="xyz" => 6
327 (each letter in head is only 1 byte in ISO-8859-1 even though they
328 each take 2 bytes in UTF-8 encoding)
331 utf8_encoding_concat__ (const char *head, size_t head_len,
332 const char *tail, size_t tail_len,
333 const char *encoding, size_t max_len,
339 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
341 if (head_len + tail_len <= max_len)
343 else if (tail_len >= max_len)
353 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
355 ofs <= max_len - tail_len;
360 mblen = u8_mbtouc (&next,
361 CHAR_CAST (const uint8_t *, head + ofs),
363 if (uc_is_grapheme_break (prev, next))
376 result = (tail_len > 0
377 ? xconcat2 (head, head_len, tail, tail_len)
378 : CONST_CAST (char *, head));
379 if (recode_string_len (encoding, "UTF-8", result,
380 head_len + tail_len) <= max_len)
382 *resultp = result != head ? result : NULL;
387 bool correct_result = false;
394 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
401 mblen = u8_mbtouc (&next,
402 CHAR_CAST (const uint8_t *, head + ofs),
404 if (uc_is_grapheme_break (prev, next))
408 memcpy (result, head, ofs);
409 memcpy (result + ofs, tail, tail_len);
410 result[ofs + tail_len] = '\0';
413 if (recode_string_len (encoding, "UTF-8", result,
414 ofs + tail_len) <= max_len)
416 correct_result = true;
420 correct_result = false;
/* utf8_encoding_concat(): the prefix length is computed by
   utf8_encoding_concat__(), which may also hand back an already-built
   string in RESULT.  The return expression tests RESULT for null and
   otherwise builds the answer with xconcat2() from the first PREFIX_LEN
   bytes of HEAD followed by all of TAIL. */
439 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
440 null-terminated string owned by the caller. HEAD, TAIL, and the returned
441 string are all encoded in UTF-8. As many characters[*] from the beginning
442 of HEAD are included as will fit within MAX_LEN bytes supposing that the
443 resulting string were to be re-encoded in ENCODING. All of TAIL is always
444 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
446 [*] Actually this function drops grapheme clusters instead of characters, so
447 that, e.g. a Unicode character followed by a combining accent character
448 is either completely included or completely excluded from the returned
449 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
450 information on grapheme clusters.
452 A null ENCODING is treated as UTF-8.
454 Simple examples for encoding="UTF-8", max_len=6:
456 head="abc", tail="xyz" => "abcxyz"
457 head="abcd", tail="xyz" => "abcxyz"
458 head="abc", tail="uvwxyz" => "uvwxyz"
459 head="abc", tail="tuvwxyz" => "tuvwxyz"
461 Examples for encoding="ISO-8859-1", max_len=6:
463 head="éèä", tail="xyz" => "éèäxyz"
464 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
465 each take 2 bytes in UTF-8 encoding)
468 utf8_encoding_concat (const char *head, const char *tail,
469 const char *encoding, size_t max_len)
471 size_t tail_len = strlen (tail);
475 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
476 encoding, max_len, &result);
477 return (result != NULL
479 : xconcat2 (head, prefix_len, tail, tail_len));
482 /* Returns the length, in bytes, of the string that would be returned by
483 utf8_encoding_concat() if passed the same arguments, but the implementation
484 is often more efficient.

   The value is the prefix length reported by utf8_encoding_concat__() plus
   strlen (TAIL).
   NOTE(review): the helper may hand back an allocated string through
   RESULT; confirm that it is freed on a line not shown here. */
486 utf8_encoding_concat_len (const char *head, const char *tail,
487 const char *encoding, size_t max_len)
489 size_t tail_len = strlen (tail);
493 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
494 encoding, max_len, &result);
496 return prefix_len + tail_len;
/* utf8_encoding_trunc(): implemented as utf8_encoding_concat() with an
   empty TAIL, so only the truncated prefix of S is returned. */
499 /* Returns an allocated, null-terminated string, owned by the caller,
500 containing as many characters[*] from the beginning of S that would fit
501 within MAX_LEN bytes if the returned string were to be re-encoded in
502 ENCODING. Both S and the returned string are encoded in UTF-8.
504 [*] Actually this function drops grapheme clusters instead of characters, so
505 that, e.g. a Unicode character followed by a combining accent character
506 is either completely included or completely excluded from the returned
507 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
508 information on grapheme clusters.
510 A null ENCODING is treated as UTF-8.
513 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
515 return utf8_encoding_concat (s, "", encoding, max_len);
518 /* Returns the length, in bytes, of the string that would be returned by
519 utf8_encoding_trunc() if passed the same arguments, but the implementation
520 is often more efficient.

   Implemented as utf8_encoding_concat_len() with an empty TAIL. */
522 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
524 return utf8_encoding_concat_len (s, "", encoding, max_len);
/* utf8_to_filename(): recodes FILENAME from "UTF-8" to filename_encoding()
   with recode_string().  The length argument is -1, i.e. FILENAME is
   treated as null-terminated. */
527 /* Returns FILENAME converted from UTF-8 to the filename encoding.
528 On Windows the filename encoding is UTF-8; elsewhere it is based on the
531 utf8_to_filename (const char *filename)
533 return recode_string (filename_encoding (), "UTF-8", filename, -1);
/* filename_to_utf8(): recodes FILENAME from filename_encoding() to "UTF-8"
   with recode_string().  The length argument is -1, i.e. FILENAME is
   treated as null-terminated. */
536 /* Returns FILENAME converted from the filename encoding to UTF-8.
537 On Windows the filename encoding is UTF-8; elsewhere it is based on the
540 filename_to_utf8 (const char *filename)
542 return recode_string ("UTF-8", filename_encoding (), filename, -1);
/* Worker shared by recode_substring_pool(), which passes '?' as
   FALLBACKCHAR, and recode_pedantically(), which passes 0 and treats the
   return value as an error code.

   DEFAULT_ENCODING is substituted for TO and FROM (presumably when they are
   null) and the converter is obtained from create_iconv().  One visible
   path copies TEXT unchanged into a null-terminated pool_malloc() buffer
   (presumably when no converter is available).  Otherwise try_recode() is
   attempted with an output buffer that starts at TEXT.length + 1 bytes and
   doubles on each pass: a successful result is stored in *OUT with
   ss_buffer(), an unsuccessful buffer is returned with pool_free(), and
   retrying stops as soon as the failure is something other than -E2BIG.
   The loop condition BUFSIZE > TEXT.length presumably ends the loop if the
   doubling wraps around. */
546 recode_substring_pool__ (const char *to, const char *from,
547 struct substring text, char fallbackchar,
548 struct pool *pool, struct substring *out)
551 struct converter *conv;
554 to = default_encoding;
557 from = default_encoding;
559 conv = create_iconv (to, from);
565 out->string = pool_malloc (pool, text.length + 1);
566 out->length = text.length;
567 memcpy (out->string, text.string, text.length);
568 out->string[out->length] = '\0';
575 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
577 char *output = pool_malloc (pool, bufsize);
580 retval = try_recode (conv, fallbackchar, text.string, text.length,
584 *out = ss_buffer (output, retval);
587 pool_free (pool, output);
589 if (retval != -E2BIG)
596 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
597 dynamically allocated string in TO-encoding. Any characters which cannot be
598 converted will be represented by '?'.
600 The returned string will be null-terminated and allocated on POOL with
   pool_malloc().
603 This function's behaviour differs from that of g_convert_with_fallback
604 provided by GLib. The GLib function will fail (returns NULL) if any part of
605 the input string is not valid in the declared input encoding. This function
606 however perseveres even in the presence of badly encoded input.

   Implemented by recode_substring_pool__() with '?' as the fallback
   character. */
608 recode_substring_pool (const char *to, const char *from,
609 struct substring text, struct pool *pool)
611 struct substring out;
613 recode_substring_pool__ (to, from, text, '?', pool, &out);
617 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
618 dynamically allocated string in TO-encoding. On success, returns 0, and the
619 converted null-terminated string, allocated from POOL with pool_malloc(), is
620 stored in *OUT. On failure, returns a positive errno value.
622 The function fails with an error if any part of the input string is not
623 valid in the declared input encoding.

   Implemented by recode_substring_pool__() with a fallback character of 0,
   where recode_substring_pool() passes '?'; the worker's return value is
   captured in ERROR. */
625 recode_pedantically (const char *to, const char *from,
626 struct substring text, struct pool *pool,
627 struct substring *out)
631 error = recode_substring_pool__ (to, from, text, 0, pool, out);
/* Initialisation steps (the enclosing function header is not visible here):
   adopt the user's locale for all categories, bind the PACKAGE message
   domain to the relocated LOCALE_DIR and select it, then record the
   locale's character set as the default encoding.  The assertion requires
   that no default encoding has been set yet, i.e. that this runs only once
   per setup. */
640 setlocale (LC_ALL, "");
641 bindtextdomain (PACKAGE, relocate(locale_dir));
642 textdomain (PACKAGE);
644 assert (default_encoding == NULL);
645 default_encoding = xstrdup (locale_charset ());
/* Returns the current default encoding name.  The pointer refers to this
   module's own storage, which set_default_encoding() frees and replaces,
   so it must not be freed by the caller or kept across such a call. */
651 get_default_encoding (void)
653 return default_encoding;
/* Replaces the default encoding with a private copy of ENC.
   NOTE(review): the old value is freed before ENC is copied, so ENC must
   not be the pointer returned by get_default_encoding(); copying first and
   freeing afterwards would make that case safe. */
657 set_default_encoding (const char *enc)
659 free (default_encoding);
660 default_encoding = xstrdup (enc);
/* set_encoding_from_locale(): review notes on the visible implementation.

   The current LC_CTYPE setting is saved in TMP.  LC_CTYPE is then switched
   to "C" and to LOC in turn, and locale_charset() is recorded after each
   switch.  The two charset names are compared with strcmp(); LC_CTYPE is
   put back to TMP; and the default encoding is replaced by the charset
   found for LOC.  Which of these steps belong to the success path and
   which to the failure path cannot be told from the lines shown.

   NOTE(review): the result of setlocale (LC_CTYPE, loc) is not tested in
   the lines shown; presumably an unknown LOC is detected through the
   comparison with the "C" charset. */
664 /* Attempts to set the encoding from a locale name
665 returns true if successfull.
666 This function does not (should not!) alter the current locale.
669 set_encoding_from_locale (const char *loc)
674 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
676 setlocale (LC_CTYPE, "C");
677 c_encoding = xstrdup (locale_charset ());
679 setlocale (LC_CTYPE, loc);
680 loc_encoding = xstrdup (locale_charset ());
683 if ( 0 == strcmp (loc_encoding, c_encoding))
688 setlocale (LC_CTYPE, tmp);
694 free (default_encoding);
695 default_encoding = loc_encoding;
/* Teardown steps (the enclosing function header is not visible here): walk
   every cached converter in MAP, release its FROMCODE string and close its
   iconv descriptor if one was opened, destroy MAP itself, and finally free
   the default encoding and reset the pointer to null. */
708 struct hmapx_node *node;
709 struct converter *cvtr;
711 HMAPX_FOR_EACH (cvtr, node, &map)
716 free (cvtr->fromcode);
717 if (cvtr->conv != (iconv_t) -1)
718 iconv_close (cvtr->conv);
722 hmapx_destroy (&map);
724 free (default_encoding);
725 default_encoding = NULL;
/* Tests ENC by trying to open an iconv converter from ENC to the encoding
   named by the UTF8 macro; (iconv_t) -1 marks failure.
   NOTE(review): confirm that a successfully opened descriptor is closed on
   a line not shown here. */
731 valid_encoding (const char *enc)
733 iconv_t conv = iconv_open (UTF8, enc);
735 if ( conv == (iconv_t) -1)
744 /* Return the system locale's idea of the
745 decimal separator character.

   Two ways of finding it are visible: taking the first byte of
   nl_langinfo (RADIXCHAR), and formatting 2.5 with "%f" into a buffer
   (presumably to pick the separator out of the text).  Which one is used
   is presumably decided by a preprocessor test that is not visible. */
747 get_system_decimal (void)
752 radix_char = nl_langinfo (RADIXCHAR)[0];
756 snprintf (buf, sizeof buf, "%f", 2.5);
/* Writes a printable name for code point UC into the 16-byte BUFFER.
   Code points 0x20 through 0x7e are shown as the character itself between
   ` and '; every other code point is shown as "U+" followed by its value
   in upper-case hexadecimal, zero-padded to at least four digits. */
765 uc_name (ucs4_t uc, char buffer[16])
767 if (uc >= 0x20 && uc < 0x7f)
768 snprintf (buffer, 16, "`%c'", uc);
770 snprintf (buffer, 16, "U+%04X", uc);
774 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
/* utf8_hash_case_bytes(): S is case-folded with u8_casefold() under NFKD
   normalization, using a 2048-byte stack buffer as the preferred
   destination, and the folded bytes are hashed with hash_bytes().  The
   folded pointer is compared against the stack buffer afterwards because
   the result may live elsewhere.  If folding yields a null pointer the
   original N bytes are hashed instead. */
776 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
777 with lowercase and uppercase letters treated as equal, starting from
780 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
782 uint8_t folded_buf[2048];
783 size_t folded_len = sizeof folded_buf;
787 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
788 NULL, UNINORM_NFKD, folded_buf, &folded_len);
789 if (folded_s != NULL)
791 hash = hash_bytes (folded_s, folded_len, basis);
792 if (folded_s != folded_buf)
799 hash = hash_bytes (s, n, basis);
805 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
806 uppercase letters treated as equal, starting from BASIS.

   Equivalent to utf8_hash_case_bytes() over the strlen (S) bytes of S. */
808 utf8_hash_case_string (const char *s, unsigned int basis)
810 return utf8_hash_case_bytes (s, strlen (s), basis);
813 /* Compares UTF-8 strings A and B case-insensitively.
814 Returns a negative value if A < B, zero if A == B, positive if A > B.

   Both strings must be null-terminated: their lengths are taken with
   strlen() and the comparison is delegated to utf8_strncasecmp(). */
816 utf8_strcasecmp (const char *a, const char *b)
818 return utf8_strncasecmp (a, strlen (a), b, strlen (b));
821 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
   case-insensitively.
823 Returns a negative value if A < B, zero if A == B, positive if A > B.

   The comparison is made by u8_casecmp() with NFKD normalization.  If that
   call returns nonzero, the code falls back to memcmp() over the first
   MIN (AN, BN) bytes, with the shorter string ordered first (presumably
   only when the common prefix is equal). */
825 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
829 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
830 CHAR_CAST (const uint8_t *, b), bn,
831 NULL, UNINORM_NFKD, &result))
836 result = memcmp (a, b, MIN (an, bn));
838 result = an < bn ? -1 : an > bn;
/* Applies the case-mapping function F to the null-terminated UTF-8 string
   S.  F is given strlen (S) + 1 bytes, so the terminator is mapped along
   with the text, and a null result buffer, so the result is whatever
   storage F provides.  A fallback is visible that duplicates S unchanged
   with xstrdup() (presumably used when F fails). */
845 utf8_casemap (const char *s,
846 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
847 uint8_t *, size_t *))
852 result = CHAR_CAST (char *,
853 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
854 NULL, NULL, NULL, &size));
860 result = xstrdup (s);
/* Returns the result of mapping the UTF-8 string S with u8_toupper(), as
   produced by utf8_casemap(). */
866 utf8_to_upper (const char *s)
868 return utf8_casemap (s, u8_toupper);
/* Returns the result of mapping the UTF-8 string S with u8_tolower(), as
   produced by utf8_casemap(). */
872 utf8_to_lower (const char *s)
874 return utf8_casemap (s, u8_tolower);
/* Fills *E with properties of the encoding NAME, determined by trial
   conversions.

   *E is zeroed first.  "\r", "\n" and " " are recoded from UTF-8 into
   NAME; their encoded forms must all have the same length, no greater than
   MAX_UNIT.  When that test fails a warning naming the encoding is printed
   on stderr and plain one-byte copies of the three strings are used
   instead.  E->unit bytes of each form are copied into E->cr, E->lf and
   E->space.

   The encoding counts as ASCII compatible when a sample of printable ASCII
   characters, interpreted as NAME and recoded to UTF-8, comes back
   unchanged.  It counts as EBCDIC compatible only when it is not ASCII
   compatible, has a unit of 1, and the byte "A" recoded to UTF-8 yields
   the single byte 0xc1.

   NOTE(review): every recode_substring_pool() call here passes a null
   pool; confirm that the resulting substrings are freed on lines not
   shown. */
878 get_encoding_info (struct encoding_info *e, const char *name)
880 const struct substring in = SS_LITERAL_INITIALIZER (
882 "!\"#$%&'()*+,-./0123456789:;<=>?@"
883 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
884 "abcdefghijklmnopqrstuvwxyz{|}~");
886 struct substring out, cr, lf, space;
889 memset (e, 0, sizeof *e);
891 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
892 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
893 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
895 && cr.length <= MAX_UNIT
896 && cr.length == lf.length
897 && cr.length == space.length);
900 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
904 ss_alloc_substring (&cr, ss_cstr ("\r"));
905 ss_alloc_substring (&lf, ss_cstr ("\n"));
906 ss_alloc_substring (&space, ss_cstr (" "));
910 memcpy (e->cr, cr.string, e->unit);
911 memcpy (e->lf, lf.string, e->unit);
912 memcpy (e->space, space.string, e->unit);
918 out = recode_substring_pool ("UTF-8", name, in, NULL);
919 e->is_ascii_compatible = ss_equals (in, out);
922 if (!e->is_ascii_compatible && e->unit == 1)
924 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
925 e->is_ebcdic_compatible = (out.length == 1
926 && (uint8_t) out.string[0] == 0xc1);
930 e->is_ebcdic_compatible = false;
/* Returns the is_ascii_compatible property that get_encoding_info()
   computes for ENCODING.  The full set of trial conversions is repeated on
   every call. */
936 is_encoding_ascii_compatible (const char *encoding)
938 struct encoding_info e;
940 get_encoding_info (&e, encoding);
941 return e.is_ascii_compatible;
/* Returns the is_ebcdic_compatible property that get_encoding_info()
   computes for ENCODING.  The full set of trial conversions is repeated on
   every call. */
945 is_encoding_ebcdic_compatible (const char *encoding)
947 struct encoding_info e;
949 get_encoding_info (&e, encoding);
950 return e.is_ebcdic_compatible;
/* is_encoding_supported(): asks create_iconv() for a converter in each
   direction between ENCODING and "UTF-8"; both results must be non-null.
   The second request is skipped when the first one fails. */
953 /* Returns true if iconv can convert ENCODING to and from UTF-8,
956 is_encoding_supported (const char *encoding)
958 return (create_iconv ("UTF-8", encoding)
959 && create_iconv (encoding, "UTF-8"));
/* is_encoding_utf8(): accepts exactly the spellings "utf8" and "utf-8",
   with each letter in either case.  The characters are tested one at a
   time with short-circuit evaluation, so no byte after the terminating
   null is read, and no locale-dependent case conversion is involved. */
962 /* Returns true if E is the name of a UTF-8 encoding.
964 XXX Possibly we should test not E as a string but its properties via
967 is_encoding_utf8 (const char *e)
969 return ((e[0] == 'u' || e[0] == 'U')
970 && (e[1] == 't' || e[1] == 'T')
971 && (e[2] == 'f' || e[2] == 'F')
972 && ((e[3] == '8' && e[4] == '\0')
973 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
/* Table of encoding categories, grown and filled by add_category(). */
976 static struct encoding_category *categories;
/* Number of entries of CATEGORIES that are in use. */
977 static int n_categories;
/* Appends one entry to CATEGORIES.

   CATEGORY is followed by a null-terminated list of encoding names.  Each
   name is tested: "Auto" and any name accepted by is_encoding_supported()
   pass the test.  At most 16 names fit in the local array, which the
   assertion enforces.  CATEGORIES is enlarged with x2nrealloc() when full,
   with *ALLOCATED_CATEGORIES tracking its capacity.  The new entry stores
   the CATEGORY pointer and the encoding name pointers as given, without
   copying the strings, so they must outlive the table. */
979 static void SENTINEL (0)
980 add_category (size_t *allocated_categories, const char *category, ...)
982 struct encoding_category *c;
983 const char *encodings[16];
987 /* Count encoding arguments. */
988 va_start (args, category);
990 while ((encodings[n] = va_arg (args, const char *)) != NULL)
992 const char *encoding = encodings[n];
993 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
996 assert (n < sizeof encodings / sizeof *encodings);
1002 if (n_categories >= *allocated_categories)
1003 categories = x2nrealloc (categories,
1004 allocated_categories, sizeof *categories);
1006 c = &categories[n_categories++];
1007 c->category = category;
1008 c->encodings = xmalloc (n * sizeof *c->encodings);
1009 for (i = 0; i < n; i++)
1010 c->encodings[i] = encodings[i];
/* Fills the category table through a fixed sequence of add_category()
   calls.  The "Unicode" label is passed as is; every other label goes
   through _() for translation.  Encoding names are never translated.

   Both get_encoding_categories() and get_n_encoding_categories() call this
   on every use, so it is presumably guarded against running twice by lines
   that are not visible here.

   NOTE(review): "MacDevangari" may be a misspelling of "MacDevanagari";
   check which name iconv actually recognises before changing it. */
1015 init_encoding_categories (void)
1025 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1026 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1027 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1029 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1030 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1031 "Windows-1257", NULL_SENTINEL);
1032 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1033 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1034 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1035 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1036 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1037 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1038 "EUC-TW", NULL_SENTINEL);
1039 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1040 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1041 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1042 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1043 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1045 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1046 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1047 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1048 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1049 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1051 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1052 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1053 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1054 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1056 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1058 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1059 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1061 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1062 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1064 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1066 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1067 "Windows-1258", NULL_SENTINEL);
1068 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1069 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1072 /* Returns an array of "struct encoding_category" that contains only the
1073 categories and encodings that the system supports.

   init_encoding_categories() is called first, so the table is populated
   before it is handed out.  The number of elements is available from
   get_n_encoding_categories(). */
1074 struct encoding_category *
1075 get_encoding_categories (void)
1077 init_encoding_categories ();
1081 /* Returns the number of elements in the array returned by
1082 get_encoding_categories().

   init_encoding_categories() is called first, so the count is valid even
   if the array itself has not been requested yet. */
1084 get_n_encoding_categories (void)
1086 init_encoding_categories ();
1087 return n_categories;