1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
3 2016, 2021 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "libpspp/i18n.h"
33 #include "libpspp/assertion.h"
34 #include "libpspp/compiler.h"
35 #include "libpspp/hmapx.h"
36 #include "libpspp/hash-functions.h"
37 #include "libpspp/pool.h"
38 #include "libpspp/str.h"
39 #include "libpspp/version.h"
41 #include "gl/c-ctype.h"
42 #include "gl/c-strcase.h"
43 #include "gl/localcharset.h"
44 #include <gl/localename.h>
45 #include "gl/minmax.h"
46 #include "gl/xalloc.h"
47 #include "gl/relocatable.h"
48 #include "gl/xstrndup.h"
/* Shorthand for marking translatable strings; expands to gettext(). */
51 #define _(msgid) gettext (msgid)
/* Encoding name handed out by get_default_encoding() and replaced by
   set_default_encoding().  recode_substring_pool__() assigns it to its TO and
   FROM arguments (presumably when they are null -- the guards are not visible
   in this view). */
61 static char *default_encoding;
/* Cache of converters used by create_iconv(), keyed on a hash of the TO and
   FROM encoding names.  create_iconv() also inserts a null entry on its
   failure path. */
62 static struct hmapx map;
64 /* A wrapper around iconv_open that caches its results: it looks in MAP for
   an existing converter from FROMCODE to TOCODE and otherwise opens a new one
   with iconv_open() and records it in MAP.
   NOTE(review): WARN is not referenced on any line visible in this view;
   presumably it gates the diagnostic printed on failure -- confirm. */
65 static struct converter *
66 create_iconv (const char* tocode, const char* fromcode, bool warn)
69 struct hmapx_node *node;
70 struct converter *converter;
/* The cache key mixes both encoding names; entries that share a hash are
   told apart by comparing the names themselves. */
73 hash = hash_string (tocode, hash_string (fromcode, 0));
74 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
79 if (!strcmp (tocode, converter->tocode)
80 && !strcmp (fromcode, converter->fromcode))
/* Cache miss: build a new converter that owns copies of both names. */
84 converter = xmalloc (sizeof *converter);
85 converter->tocode = xstrdup (tocode);
86 converter->fromcode = xstrdup (fromcode);
87 converter->conv = iconv_open (tocode, fromcode);
/* errno is captured straight after the failing call.  The failure value is
   spelled (iconv_t) ~0 here and (iconv_t) -1 further down. */
88 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
89 /* I don't think it's safe to translate this string or to use messaging
90 as the converters have not yet been set up */
/* A failure is only treated as an error when the two encoding names differ. */
91 if (error && strcmp (tocode, fromcode))
96 "cannot create a converter for `%s' to `%s': %s\n",
97 fromcode, tocode, strerror (error));
/* Failure path: release the converter's name copies and store a null entry
   in the cache under this hash. */
99 free (converter->tocode);
100 free (converter->fromcode);
103 hmapx_insert (&map, NULL, hash);
107 /* Find out how many bytes a null character occupies in TOCODE, by converting a single ASCII NUL byte to TOCODE and measuring the output. */
109 iconv_t bconv = iconv_open (tocode, "ASCII");
110 if (bconv != (iconv_t) -1)
/* The one-byte array initialized from "" holds exactly the NUL byte. */
112 ICONV_CONST char inbuf[1] = "";
113 ICONV_CONST char *inptr = inbuf;
114 size_t inbytes = sizeof inbuf;
117 char *outptr = outbuf;
118 size_t outbytes = sizeof outbuf;
119 if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
/* iconv() advanced OUTPTR past what it wrote, so the distance from the start
   of OUTBUF is the width of the converted NUL. */
120 converter->null_char_width = outptr - outbuf;
/* Success path: remember the converter for later calls. */
124 hmapx_insert (&map, converter, hash);
130 /* Converts the single byte C from encoding FROM to TO, by recoding it as a one-byte string with recode_string().
    NOTE(review): how the return value is taken from that string, and where
    the string is freed, is not visible in this view.
133 This function probably shouldn't be used at all, but some code still does so. */
136 recode_byte (const char *to, const char *from, char c)
139 char *s = recode_string (to, from, &c, 1);
145 /* Similar to recode_string_pool, but allocates the returned value on the heap
146 instead of in a pool. It is the caller's responsibility to free the returned value.
    This is done by passing a null POOL to recode_string_pool(). */
149 recode_string (const char *to, const char *from,
150 const char *text, int length)
152 return recode_string_pool (to, from, text, length, NULL);
155 /* Returns the length, in bytes, of the string that a similar recode_string()
156 call would return.  The string is actually recoded and then measured with strlen(), so the count stops at the first null byte in the output. */
158 recode_string_len (const char *to, const char *from,
159 const char *text, int length)
161 char *s = recode_string (to, from, text, length);
162 size_t len = strlen (s);
167 /* Uses CVTR to convert the INBYTES bytes starting at IN into the OUTBYTES
168 bytes starting at OUT_, and appends a null terminator to the output.  The
    terminator is CVTR->null_char_width bytes wide.  FALLBACKCHAR is written to
    the output from inside the iconv() error-handling loop.
170 Returns the output length if successful, or a negative value if the output buffer is too small.
    NOTE(review): the caller in recode_substring_pool__() tests the result
    against -E2BIG; the exact value returned here is not visible in this
    view -- confirm. */
173 try_recode (struct converter *cvtr, char fallbackchar,
174 const char *in, size_t inbytes,
175 char *out_, size_t outbytes)
180 int null_bytes = cvtr->null_char_width;
182 /* Put the converter into the initial shift state, in case there was any
183 state information left over from its last usage. */
184 iconv (cvtr->conv, NULL, 0, NULL, 0);
186 /* Do two rounds of iconv() calls:
188 - The first round does the bulk of the conversion using the
189 caller-supplied input data.
191 - The second round flushes any leftover output. This has a real effect
192 with input encodings that use combining diacritics, e.g. without the
193 second round the last character tends to get dropped when converting
194 from windows-1258 to other encodings.
    The second round is requested by passing null input pointers. */
196 for (i = 0; i < 2; i++)
/* NOTE(review): the `∈' at the end of the next line looks like a mis-decoded
   `&in;' -- restore it if so. */
198 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
199 size_t *inbytesp = i ? NULL : &inbytes;
/* Each pass through this loop handles one iconv() failure. */
201 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
/* Room is needed for the fallback byte plus the terminator. */
205 if (outbytes < null_bytes + 1)
209 *out++ = fallbackchar;
210 for (j = 0 ; j < null_bytes ; ++j)
212 return out - 1 - out_;
219 *out++ = fallbackchar;
232 /* should never happen */
233 fprintf (stderr, "Character conversion error: %s\n",
/* Final terminator: check for room, then emit NULL_BYTES bytes. */
240 if (outbytes <= null_bytes - 1)
243 for (i = 0 ; i < null_bytes ; ++i)
246 return out - 1 - out_;
249 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
250 dynamically allocated string in TO-encoding. Any characters which cannot be
251 converted will be represented by '?'.
253 LENGTH should be the length of the string or -1, if null terminated.
255 The returned string will be allocated on POOL.  (recode_string() passes a
    null POOL to obtain a heap-allocated result instead.)
257 This function's behaviour differs from that of g_convert_with_fallback
258 provided by GLib. The GLib function will fail (returns NULL) if any part of
259 the input string is not valid in the declared input encoding. This function
260 however perseveres even in the presence of badly encoded input. */
262 recode_string_pool (const char *to, const char *from,
263 const char *text, int length, struct pool *pool)
265 struct substring out;
/* TEXT is measured here when the caller did not supply its length (the guard
   on this statement is not visible in this view). */
271 length = strlen (text);
/* The real work is done on a substring view of TEXT. */
273 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
277 /* Returns the name of the encoding that should be used for file names.
279 This is meant to be the same encoding used by g_filename_from_uri() and
280 g_filename_to_uri() in GLib.
    Builds that define _WIN32 or __WIN32__ take a separate preprocessor branch
    (its contents are not visible in this view); the other branch returns the
    locale's character set. */
282 filename_encoding (void)
284 #if defined _WIN32 || defined __WIN32__
287 return locale_charset ();
/* Builds, in a fresh xmalloc() buffer, a null-terminated string made of the
   A_LEN bytes at A followed by the B_LEN bytes at B. */
292 xconcat2 (const char *a, size_t a_len,
293 const char *b, size_t b_len)
/* One extra byte for the terminator. */
295 char *s = xmalloc (a_len + b_len + 1);
296 memcpy (s, a, a_len);
297 memcpy (s + a_len, b, b_len);
298 s[a_len + b_len] = '\0';
302 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
303 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
304 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
305 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
306 HEAD and tries again, repeating as necessary until the concatenated result
307 fits or until HEAD_LEN reaches 0.
309 [*] Actually this function drops grapheme clusters instead of characters, so
310 that, e.g. a Unicode character followed by a combining accent character
311 is either completely included or completely excluded from HEAD_LEN. See
312 UAX #29 at http://unicode.org/reports/tr29/ for more information on grapheme clusters.
315 A null ENCODING is treated as UTF-8.
317 Sometimes this function has to actually construct the concatenated string to
318 measure its length. When this happens, it sets *RESULTP to that
319 null-terminated string, allocated with malloc(), for the caller to use if it
320 needs it. Otherwise, it sets *RESULTP to NULL.
322 Simple examples for encoding="UTF-8", max_len=6:
324 head="abc", tail="xyz" => 3
325 head="abcd", tail="xyz" => 3 ("d" dropped).
326 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
327 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
329 Examples for encoding="ISO-8859-1", max_len=6:
331 head="éèä", tail="xyz" => 6
332 (each letter in head is only 1 byte in ISO-8859-1 even though they
333 each take 2 bytes in UTF-8 encoding) */
336 utf8_encoding_concat__ (const char *head, size_t head_len,
337 const char *tail, size_t tail_len,
338 const char *encoding, size_t max_len,
/* UTF-8 target: the input byte counts are already the output byte counts, so
   nothing has to be recoded. */
344 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
346 if (head_len + tail_len <= max_len)
348 else if (tail_len >= max_len)
/* Walk HEAD one code point at a time while the offset still leaves room for
   TAIL, testing for a grapheme-cluster break between neighbours. */
358 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
360 ofs <= max_len - tail_len;
365 mblen = u8_mbtouc (&next,
366 CHAR_CAST (const uint8_t *, head + ofs),
368 if (uc_is_grapheme_break (prev, next))
/* Other encodings: measure the recoded length of the whole concatenation.
   With an empty TAIL, HEAD itself is measured and nothing is allocated. */
381 result = (tail_len > 0
382 ? xconcat2 (head, head_len, tail, tail_len)
383 : CONST_CAST (char *, head));
384 if (recode_string_len (encoding, "UTF-8", result,
385 head_len + tail_len) <= max_len)
/* Only a newly built string is handed back; when RESULT is just HEAD, null
   is stored instead. */
387 *resultp = result != head ? result : NULL;
392 bool correct_result = false;
/* Walk HEAD again; at each grapheme-cluster break, rebuild the first OFS
   bytes of HEAD plus TAIL in RESULT and re-measure the recoded length.
   CORRECT_RESULT records the outcome of the latest measurement. */
399 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
406 mblen = u8_mbtouc (&next,
407 CHAR_CAST (const uint8_t *, head + ofs),
409 if (uc_is_grapheme_break (prev, next))
413 memcpy (result, head, ofs);
414 memcpy (result + ofs, tail, tail_len);
415 result[ofs + tail_len] = '\0';
418 if (recode_string_len (encoding, "UTF-8", result,
419 ofs + tail_len) <= max_len)
421 correct_result = true;
425 correct_result = false;
444 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
445 null-terminated string owned by the caller. HEAD, TAIL, and the returned
446 string are all encoded in UTF-8. As many characters[*] from the beginning
447 of HEAD are included as will fit within MAX_LEN bytes supposing that the
448 resulting string were to be re-encoded in ENCODING. All of TAIL is always
449 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
451 [*] Actually this function drops grapheme clusters instead of characters, so
452 that, e.g. a Unicode character followed by a combining accent character
453 is either completely included or completely excluded from the returned
454 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
455 information on grapheme clusters.
457 A null ENCODING is treated as UTF-8.
459 Simple examples for encoding="UTF-8", max_len=6:
461 head="abc", tail="xyz" => "abcxyz"
462 head="abcd", tail="xyz" => "abcxyz"
463 head="abc", tail="uvwxyz" => "uvwxyz"
464 head="abc", tail="tuvwxyz" => "tuvwxyz"
466 Examples for encoding="ISO-8859-1", max_len=6:
468 head="éèä", tail="xyz" => "éèäxyz"
469 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
470 each take 2 bytes in UTF-8 encoding) */
473 utf8_encoding_concat (const char *head, const char *tail,
474 const char *encoding, size_t max_len)
476 size_t tail_len = strlen (tail);
/* The helper reports how many bytes of HEAD to keep and may also hand back
   a string it built along the way. */
480 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
481 encoding, max_len, &result);
/* When the helper produced no string (RESULT is null), the answer is
   assembled here from the first PREFIX_LEN bytes of HEAD plus TAIL. */
482 return (result != NULL
484 : xconcat2 (head, prefix_len, tail, tail_len));
487 /* Returns the length, in bytes, of the string that would be returned by
488 utf8_encoding_concat() if passed the same arguments, but the implementation
489 is often more efficient.  The value is the kept prefix of HEAD plus all of TAIL, both counted in UTF-8 bytes. */
491 utf8_encoding_concat_len (const char *head, const char *tail,
492 const char *encoding, size_t max_len)
494 size_t tail_len = strlen (tail);
/* Only the prefix length is needed from the helper here. */
498 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
499 encoding, max_len, &result);
501 return prefix_len + tail_len;
504 /* Returns an allocated, null-terminated string, owned by the caller,
505 containing as many characters[*] from the beginning of S that would fit
506 within MAX_LEN bytes if the returned string were to be re-encoded in
507 ENCODING. Both S and the returned string are encoded in UTF-8.
509 [*] Actually this function drops grapheme clusters instead of characters, so
510 that, e.g. a Unicode character followed by a combining accent character
511 is either completely included or completely excluded from the returned
512 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
513 information on grapheme clusters.
515 A null ENCODING is treated as UTF-8.
    Implemented as utf8_encoding_concat() with an empty TAIL. */
518 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
520 return utf8_encoding_concat (s, "", encoding, max_len);
523 /* Returns the length, in bytes, of the string that would be returned by
524 utf8_encoding_trunc() if passed the same arguments, but the implementation
525 is often more efficient.  Implemented as utf8_encoding_concat_len() with an empty TAIL. */
527 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
529 return utf8_encoding_concat_len (s, "", encoding, max_len);
532 /* Returns FILENAME converted from UTF-8 to the filename encoding.
533 On Windows the filename encoding is UTF-8; elsewhere it is based on the locale's character set (see filename_encoding()).
    The -1 length tells recode_string() that FILENAME is null-terminated.  The
    result comes from recode_string(), so the caller is responsible for
    freeing it. */
536 utf8_to_filename (const char *filename)
538 return recode_string (filename_encoding (), "UTF-8", filename, -1);
541 /* Returns FILENAME converted from the filename encoding to UTF-8.
542 On Windows the filename encoding is UTF-8; elsewhere it is based on the locale's character set (see filename_encoding()).
    The -1 length tells recode_string() that FILENAME is null-terminated.  The
    result comes from recode_string(), so the caller is responsible for
    freeing it. */
545 filename_to_utf8 (const char *filename)
547 return recode_string ("UTF-8", filename_encoding (), filename, -1);
/* Worker shared by recode_substring_pool() and recode_pedantically():
   converts TEXT from FROM-encoding to TO-encoding and stores the result,
   allocated from POOL with pool_malloc(), in *OUT.  FALLBACKCHAR is forwarded
   to try_recode(); the two callers pass '?' and 0 respectively. */
551 recode_substring_pool__ (const char *to, const char *from,
552 struct substring text, char fallbackchar,
553 struct pool *pool, struct substring *out)
556 struct converter *conv;
/* The default encoding stands in for TO and FROM (the guards are not visible
   in this view; presumably they test for null). */
559 to = default_encoding;
562 from = default_encoding;
564 conv = create_iconv (to, from, true);
/* Verbatim, null-terminated copy of TEXT.  Presumably this is the path taken
   when no converter is available -- TODO confirm. */
570 out->string = pool_malloc (pool, text.length + 1);
571 out->length = text.length;
572 memcpy (out->string, text.string, text.length);
573 out->string[out->length] = '\0';
/* Try successively doubled output buffers, starting one byte larger than the
   input.  The BUFSIZE > TEXT.LENGTH test ends the loop should the doubling
   ever wrap around. */
580 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
582 char *output = pool_malloc (pool, bufsize);
585 retval = try_recode (conv, fallbackchar, text.string, text.length,
/* RETVAL is used as the length of the converted data. */
589 *out = ss_buffer (output, retval);
592 pool_free (pool, output);
/* Only -E2BIG is singled out here as the case worth a larger buffer. */
594 if (retval != -E2BIG)
601 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
602 dynamically allocated string in TO-encoding. Any characters which cannot be
603 converted will be represented by '?'.
605 The returned string will be null-terminated and allocated on POOL with pool_malloc().
608 This function's behaviour differs from that of g_convert_with_fallback
609 provided by GLib. The GLib function will fail (returns NULL) if any part of
610 the input string is not valid in the declared input encoding. This function
611 however perseveres even in the presence of badly encoded input. */
613 recode_substring_pool (const char *to, const char *from,
614 struct substring text, struct pool *pool)
616 struct substring out;
/* '?' is the fallback character handed to the shared worker. */
618 recode_substring_pool__ (to, from, text, '?', pool, &out);
622 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
623 dynamically allocated string in TO-encoding. On success, returns 0, and the
624 converted null-terminated string, allocated from POOL with pool_malloc(), is
625 stored in *OUT. On failure, returns a positive errno value.
627 The function fails with an error if any part of the input string is not
628 valid in the declared input encoding. */
630 recode_pedantically (const char *to, const char *from,
631 struct substring text, struct pool *pool,
632 struct substring *out)
/* A fallback character of 0 is passed here, in contrast to the '?' used by
   recode_substring_pool(). */
636 error = recode_substring_pool__ (to, from, text, 0, pool, out);
/* Start-up initialization (the enclosing function's header is not visible in
   this view).  Adopt the locale settings from the environment. */
645 setlocale (LC_ALL, "");
/* Point gettext at PACKAGE's message catalogs, using the relocated locale
   directory, and make PACKAGE the default text domain. */
647 bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
649 textdomain (PACKAGE);
/* The default encoding must not have been set yet; it starts out as a copy
   of the locale's character set name. */
651 assert (default_encoding == NULL);
652 default_encoding = xstrdup (locale_charset ());
/* Returns the current default encoding name.  The pointer refers to the
   module's own storage, which set_default_encoding() frees when it installs
   a new value. */
658 get_default_encoding (void)
660 return default_encoding;
/* Replaces the default encoding with a private copy of ENC.
   NOTE(review): the old value is freed before ENC is copied, so passing the
   pointer obtained from get_default_encoding() would read freed memory --
   confirm that no caller does this. */
664 set_default_encoding (const char *enc)
666 free (default_encoding);
667 default_encoding = xstrdup (enc);
670 /* Return the ISO two letter code for the current LC_MESSAGES locale.
    The "C" locale name is tested for specially.  For other names a copy is
    made and searched for its first '_' (presumably to cut the name off
    there -- the rest of the function is not visible in this view). */
675 const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
676 if (0 == strcmp (localename, "C"))
678 char *ln = xstrdup (localename);
679 char *end = strchr (ln, '_');
686 /* Attempts to set the encoding from a locale name
687 returns true if successful.
688 This function does not (should not!) alter the current locale.
    It does switch LC_CTYPE temporarily while it works. */
691 set_encoding_from_locale (const char *loc)
/* Remember the current LC_CTYPE setting so that it can be put back. */
696 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
/* Record the character set that the "C" locale reports... */
698 setlocale (LC_CTYPE, "C");
699 c_encoding = xstrdup (locale_charset ());
/* ...and the one reported after switching to LOC. */
701 setlocale (LC_CTYPE, loc);
702 loc_encoding = xstrdup (locale_charset ());
/* The two are compared; what follows from a match is not visible in this
   view. */
705 if (0 == strcmp (loc_encoding, c_encoding))
/* Put the saved LC_CTYPE setting back. */
710 setlocale (LC_CTYPE, tmp);
/* Install LOC's encoding as the new default, taking ownership of the copy
   made above. */
716 free (default_encoding);
717 default_encoding = loc_encoding;
/* Shutdown (the enclosing function's header is not visible in this view):
   releases the cached converters and the default encoding. */
730 struct hmapx_node *node;
731 struct converter *cvtr;
733 HMAPX_FOR_EACH (cvtr, node, &map)
738 free (cvtr->fromcode);
/* Only descriptors that iconv_open() actually produced are closed. */
739 if (cvtr->conv != (iconv_t) -1)
740 iconv_close (cvtr->conv);
744 hmapx_destroy (&map);
/* Clearing the pointer after freeing it leaves the module in the state that
   the assert (default_encoding == NULL) elsewhere in this file expects. */
746 free (default_encoding);
747 default_encoding = NULL;
/* Probes ENC by trying to open an iconv conversion from ENC to UTF8.  The
   (iconv_t) -1 failure value is tested for below. */
753 valid_encoding (const char *enc)
755 iconv_t conv = iconv_open (UTF8, enc);
757 if (conv == (iconv_t) -1)
766 /* Return the system locale's idea of the
767 decimal separator character */
769 get_system_decimal (void)
/* Two approaches appear below; how one is selected is not visible in this
   view.  First: ask the locale database for the radix character. */
774 radix_char = nl_langinfo (RADIXCHAR)[0];
/* Second: format 2.5 with "%f" (presumably so that the separator can be read
   out of the formatted text). */
778 snprintf (buf, sizeof buf, "%f", 2.5);
/* Formats a short name for code point UC into the 16-byte BUFFER.  Code
   points from 0x20 through 0x7e are written as the character itself between
   ` and '; the other format is "U+" followed by at least four uppercase
   hexadecimal digits. */
787 uc_name (ucs4_t uc, char buffer[16])
789 if (uc >= 0x20 && uc < 0x7f)
790 snprintf (buffer, 16, "`%c'", uc);
792 snprintf (buffer, 16, "U+%04X", uc);
796 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
798 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
799 with lowercase and uppercase letters treated as equal, starting from BASIS. */
802 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
/* On-stack buffer offered to u8_casefold() for its output. */
804 uint8_t folded_buf[2048];
805 size_t folded_len = sizeof folded_buf;
809 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
810 NULL, UNINORM_NFKD, folded_buf, &folded_len);
811 if (folded_s != NULL)
/* Hash the case-folded bytes rather than the original ones. */
813 hash = hash_bytes (folded_s, folded_len, basis);
/* A result that is not FOLDED_BUF is handled separately (presumably freed;
   that statement is not visible in this view). */
814 if (folded_s != folded_buf)
/* Hash of the bytes as given (presumably the path taken when folding
   failed). */
821 hash = hash_bytes (s, n, basis);
827 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
828 uppercase letters treated as equal, starting from BASIS.  The terminator itself is not hashed. */
830 utf8_hash_case_string (const char *s, unsigned int basis)
832 return utf8_hash_case_bytes (s, strlen (s), basis);
835 /* Compares UTF-8 strings A and B case-insensitively.
836 Returns a negative value if A < B, zero if A == B, positive if A > B.
    Both strings must be null-terminated; their lengths are taken with
    strlen() and passed on to utf8_strncasecmp(). */
838 utf8_strcasecmp (const char *a, const char *b)
840 return utf8_strncasecmp (a, strlen (a), b, strlen (b));
843 /* Compares UTF-8 strings A (with length AN) and B (with length BN) case-insensitively.
845 Returns a negative value if A < B, zero if A == B, positive if A > B. */
847 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
/* u8_casecmp() stores the comparison result in RESULT; a nonzero return
   value from it selects the fallback below. */
851 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
852 CHAR_CAST (const uint8_t *, b), bn,
853 NULL, UNINORM_NFKD, &result))
/* Fallback: plain byte comparison over the common length... */
858 result = memcmp (a, b, MIN (an, bn));
/* ...with ordering by length (the guard on this statement is not visible in
   this view; presumably it applies when the common prefix is equal). */
860 result = an < bn ? -1 : an > bn;
/* Tests each of the LEN bytes at S with c_isdigit().  (The return statements
   are not visible in this view.) */
867 is_all_digits (const uint8_t *s, size_t len)
869 for (size_t i = 0; i < len; i++)
870 if (!c_isdigit (s[i]))
875 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
876 number, then they are compared numerically. Returns a negative value if A <
877 B, zero if A == B, positive if A > B.
    More precisely: the strings are case-folded, and if everything from the
    first differing byte to the end consists of digits in both of them, those
    two digit runs are compared as numbers. */
879 utf8_strverscasecmp (const char *a, const char *b)
/* Case-fold both strings, offering an on-stack stub buffer to each call. */
883 size_t a_len = sizeof a_stub;
884 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
885 UNINORM_NFKD, a_stub, &a_len);
889 size_t b_len = sizeof b_stub;
890 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
891 UNINORM_NFKD, b_stub, &b_len);
/* If either folding failed, fall back to a plain, case-sensitive strcmp(). */
894 if (!a_norm || !b_norm)
896 result = strcmp (a, b);
/* Look for the first byte at which the folded strings differ. */
900 size_t len = MIN (a_len, b_len);
901 for (size_t i = 0; i < len; i++)
902 if (a_norm[i] != b_norm[i])
904 /* If both strings end in digits, compare them numerically. */
905 if (is_all_digits (&a_norm[i], a_len - i)
906 && is_all_digits (&b_norm[i], b_len - i))
908 /* Start by stripping leading zeros, since those don't matter for
909 numerical comparison. */
911 for (ap = i; ap < a_len; ap++)
912 if (a_norm[ap] != '0')
914 for (bp = i; bp < b_len; bp++)
915 if (b_norm[bp] != '0')
918 /* The number with more digits, if there is one, is larger. */
919 size_t a_digits = a_len - ap;
920 size_t b_digits = b_len - bp;
921 if (a_digits != b_digits)
922 result = a_digits > b_digits ? 1 : -1;
/* Equal digit counts: bytewise order of the digits is numeric order. */
924 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
/* Not two digit runs: order by the differing byte. */
927 result = a_norm[i] > b_norm[i] ? 1 : -1;
/* Ordering by folded length (presumably reached when no differing byte was
   found within the common length). */
930 result = a_len < b_len ? -1 : a_len > b_len;
/* Folded strings that are not the stub buffers are handled separately
   (presumably freed; those statements are not visible in this view). */
933 if (a_norm != a_stub)
935 if (b_norm != b_stub)
/* Applies the case-mapping function F to the null-terminated UTF-8 string S.
   The terminator is included in the length given to F (strlen (s) + 1), so
   it is part of the mapped output. */
941 utf8_casemap (const char *s,
942 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
943 uint8_t *, size_t *))
/* No result buffer is supplied to F (the buffer argument is null), and SIZE
   receives the output length. */
948 result = CHAR_CAST (char *,
949 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
950 NULL, NULL, NULL, &size));
/* A plain copy of S (presumably used when F fails; the guard is not visible
   in this view). */
956 result = xstrdup (s);
/* Returns the result of utf8_casemap() on S with the uppercase mapping
   u8_toupper. */
962 utf8_to_upper (const char *s)
964 return utf8_casemap (s, u8_toupper);
/* Returns the result of utf8_casemap() on S with the lowercase mapping
   u8_tolower. */
968 utf8_to_lower (const char *s)
970 return utf8_casemap (s, u8_tolower);
/* Returns the result of utf8_casemap() on S with the titlecase mapping
   u8_totitle. */
974 utf8_to_title (const char *s)
976 return utf8_casemap (s, u8_totitle);
/* Fills in *E with properties of the encoding called NAME: the width of its
   code unit, its encodings of CR, LF and space, and whether it is ASCII- or
   EBCDIC-compatible.  The properties are found by recoding sample text with
   recode_substring_pool(). */
980 get_encoding_info (struct encoding_info *e, const char *name)
/* Sample text for the ASCII-compatibility test.  (The first line of the
   literal is not visible in this view.) */
982 const struct substring in = SS_LITERAL_INITIALIZER (
984 "!\"#$%&'()*+,-./0123456789:;<=>?@"
985 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
986 "abcdefghijklmnopqrstuvwxyz{|}~");
988 struct substring out, cr, lf, space;
991 memset (e, 0, sizeof *e);
/* Encode CR, LF and space in NAME; the null POOL gives heap-allocated
   results. */
993 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
994 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
995 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
/* All three must encode to the same length, and that length must not exceed
   MAX_UNIT. */
997 && cr.length <= MAX_UNIT
998 && cr.length == lf.length
999 && cr.length == space.length);
1002 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
/* Replace the recoded strings with the plain one-byte characters. */
1005 ss_dealloc (&space);
1006 ss_alloc_substring (&cr, ss_cstr ("\r"));
1007 ss_alloc_substring (&lf, ss_cstr ("\n"));
1008 ss_alloc_substring (&space, ss_cstr (" "));
/* The unit width is the length of the encoded CR. */
1011 e->unit = cr.length;
1012 memcpy (e->cr, cr.string, e->unit);
1013 memcpy (e->lf, lf.string, e->unit);
1014 memcpy (e->space, space.string, e->unit);
1018 ss_dealloc (&space);
/* ASCII-compatible means that decoding the sample text as NAME into UTF-8
   leaves it unchanged. */
1020 out = recode_substring_pool ("UTF-8", name, in, NULL);
1021 e->is_ascii_compatible = ss_equals (in, out);
/* EBCDIC-compatible means a single-byte, non-ASCII encoding in which the
   byte 'A' decodes to the single byte 0xc1. */
1024 if (!e->is_ascii_compatible && e->unit == 1)
1026 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1027 e->is_ebcdic_compatible = (out.length == 1
1028 && (uint8_t) out.string[0] == 0xc1);
1032 e->is_ebcdic_compatible = false;
/* Returns the is_ascii_compatible flag that get_encoding_info() computes for
   ENCODING.  Each call recomputes the whole encoding_info structure. */
1038 is_encoding_ascii_compatible (const char *encoding)
1040 struct encoding_info e;
1042 get_encoding_info (&e, encoding);
1043 return e.is_ascii_compatible;
/* Returns the is_ebcdic_compatible flag that get_encoding_info() computes for
   ENCODING.  Each call recomputes the whole encoding_info structure. */
1047 is_encoding_ebcdic_compatible (const char *encoding)
1049 struct encoding_info e;
1051 get_encoding_info (&e, encoding);
1052 return e.is_ebcdic_compatible;
1055 /* Returns true if iconv can convert ENCODING to and from UTF-8, judged by whether create_iconv() yields a converter for each direction.
     The WARN argument to create_iconv() is false in both calls. */
1058 is_encoding_supported (const char *encoding)
1060 return (create_iconv ("UTF-8", encoding, false)
1061 && create_iconv (encoding, "UTF-8", false));
1064 /* Returns true if E is the name of a UTF-8 encoding.
     The accepted spellings are "utf8" and "utf-8", with the three letters in
     either case.
1066 XXX Possibly we should test the encoding's properties rather than comparing its name as a string. */
1069 is_encoding_utf8 (const char *e)
1071 return ((e[0] == 'u' || e[0] == 'U')
1072 && (e[1] == 't' || e[1] == 'T')
1073 && (e[2] == 'f' || e[2] == 'F')
1074 && ((e[3] == '8' && e[4] == '\0')
1075 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
/* Array of encoding categories, grown by add_category(), and the number of
   entries in use. */
1078 static struct encoding_category *categories;
1079 static int n_categories;
/* Appends a category named CATEGORY to CATEGORIES.  The variable arguments
   are encoding names ending with a null sentinel; a name is tested for being
   "Auto" or accepted by is_encoding_supported().  *ALLOCATED_CATEGORIES is
   the caller-held capacity of CATEGORIES, updated when the array grows. */
1081 static void SENTINEL (0)
1082 add_category (size_t *allocated_categories, const char *category, ...)
1084 struct encoding_category *c;
/* At most 16 names can be collected; the assert below enforces this. */
1085 const char *encodings[16];
1089 /* Collect the encoding arguments, up to the null sentinel. */
1090 va_start (args, category);
1092 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1094 const char *encoding = encodings[n];
1095 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1098 assert (n < sizeof encodings / sizeof *encodings);
/* Grow the array when it is full. */
1104 if (n_categories >= *allocated_categories)
1105 categories = x2nrealloc (categories,
1106 allocated_categories, sizeof *categories);
/* CATEGORY and the encoding names are stored as the pointers passed in, not
   as copies; only the array of name pointers is allocated here. */
1108 c = &categories[n_categories++];
1109 c->category = category;
1110 c->encodings = xmalloc (n * sizeof *c->encodings);
1111 for (i = 0; i < n; i++)
1112 c->encodings[i] = encodings[i];
/* Builds the CATEGORIES table with one add_category() call per category.
   Category names other than "Unicode" are passed through _() for
   translation.  Both get_encoding_categories() and
   get_n_encoding_categories() call this first. */
1117 init_encoding_categories (void)
1127 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1128 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1129 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1131 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1132 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1133 "Windows-1257", NULL_SENTINEL);
1134 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1135 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1136 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1137 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1138 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1139 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1140 "EUC-TW", NULL_SENTINEL);
1141 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1142 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1143 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1144 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1145 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1147 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1148 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1149 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1150 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1151 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1153 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
/* NOTE(review): "MacDevangari" may be a misspelling of "MacDevanagari";
   check the name against what iconv accepts before changing it. */
1154 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1155 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1156 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1158 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1160 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1161 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1163 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1164 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1166 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1168 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1169 "Windows-1258", NULL_SENTINEL);
1170 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1171 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1174 /* Returns an array of "struct encoding_category" that contains only the
1175 categories and encodings that the system supports.  The number of elements is given by get_n_encoding_categories(). */
1176 struct encoding_category *
1177 get_encoding_categories (void)
/* Build the table before handing it out. */
1179 init_encoding_categories ();
1183 /* Returns the number of elements in the array returned by
1184 get_encoding_categories(). */
1186 get_n_encoding_categories (void)
/* Build the table before reporting its size. */
1188 init_encoding_categories ();
1189 return n_categories;