1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
3 2016, 2021 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "libpspp/i18n.h"
34 #include "libpspp/assertion.h"
35 #include "libpspp/compiler.h"
36 #include "libpspp/hmapx.h"
37 #include "libpspp/hash-functions.h"
38 #include "libpspp/misc.h"
39 #include "libpspp/pool.h"
40 #include "libpspp/str.h"
41 #include "libpspp/version.h"
43 #include "gl/c-ctype.h"
44 #include "gl/c-strcase.h"
45 #include "gl/localcharset.h"
46 #include <gl/localename.h>
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49 #include "gl/relocatable.h"
50 #include "gl/xstrndup.h"
53 #define _(msgid) gettext (msgid)
63 static char *default_encoding;
64 static struct hmapx map;
66 /* A wrapper around iconv_open */
67 static struct converter *
68 create_iconv (const char* tocode, const char* fromcode, bool warn)
71 struct hmapx_node *node;
72 struct converter *converter;
75 hash = hash_string (tocode, hash_string (fromcode, 0));
76 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
81 if (!strcmp (tocode, converter->tocode)
82 && !strcmp (fromcode, converter->fromcode))
86 converter = xmalloc (sizeof *converter);
87 converter->tocode = xstrdup (tocode);
88 converter->fromcode = xstrdup (fromcode);
89 converter->conv = iconv_open (tocode, fromcode);
90 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
91 /* I don't think it's safe to translate this string or to use messaging
92 as the converters have not yet been set up */
93 if (error && strcmp (tocode, fromcode))
98 "cannot create a converter for `%s' to `%s': %s\n",
99 fromcode, tocode, strerror (error));
101 free (converter->tocode);
102 free (converter->fromcode);
105 hmapx_insert (&map, NULL, hash);
109 /* Find out how many bytes there are in a null char in the target
111 iconv_t bconv = iconv_open (tocode, "ASCII");
112 if (bconv != (iconv_t) -1)
114 ICONV_CONST char inbuf[1] = "";
115 ICONV_CONST char *inptr = inbuf;
116 size_t inbytes = sizeof inbuf;
119 char *outptr = outbuf;
120 size_t outbytes = sizeof outbuf;
121 if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
122 converter->null_char_width = outptr - outbuf;
126 hmapx_insert (&map, converter, hash);
132 /* Converts the single byte C from encoding FROM to TO, returning the first
135 This function probably shouldn't be used at all, but some code still does
138 recode_byte (const char *to, const char *from, char c)
141 char *s = recode_string (to, from, &c, 1);
147 /* Similar to recode_string_pool, but allocates the returned value on the heap
148 instead of in a pool. It is the caller's responsibility to free the
151 recode_string (const char *to, const char *from,
152 const char *text, int length)
154 return recode_string_pool (to, from, text, length, NULL);
157 /* Returns the length, in bytes, of the string that a similar recode_string()
158 call would return. */
160 recode_string_len (const char *to, const char *from,
161 const char *text, int length)
163 char *s = recode_string (to, from, text, length);
164 size_t len = strlen (s);
169 /* Uses CVTR to convert the INBYTES starting at IN into the OUTBYTES starting
170 at OUT_, and appends a null terminator to the output.
172 Returns the output length if successful, -1 if the output buffer is too
175 try_recode (struct converter *cvtr, char fallbackchar,
176 const char *in, size_t inbytes,
177 char *out_, size_t outbytes)
182 int null_bytes = cvtr->null_char_width;
184 /* Put the converter into the initial shift state, in case there was any
185 state information left over from its last usage. */
186 iconv (cvtr->conv, NULL, 0, NULL, 0);
188 /* Do two rounds of iconv() calls:
190 - The first round does the bulk of the conversion using the
191 caller-supplied input data.
193 - The second round flushes any leftover output. This has a real effect
194 with input encodings that use combining diacritics, e.g. without the
195 second round the last character tends to get dropped when converting
196 from windows-1258 to other encodings.
198 for (i = 0; i < 2; i++)
200 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) ∈
201 size_t *inbytesp = i ? NULL : &inbytes;
203 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
207 if (outbytes < null_bytes + 1)
211 *out++ = fallbackchar;
212 for (j = 0 ; j < null_bytes ; ++j)
214 return out - 1 - out_;
221 *out++ = fallbackchar;
234 /* should never happen */
235 fprintf (stderr, "Character conversion error: %s\n",
242 if (outbytes <= null_bytes - 1)
245 for (i = 0 ; i < null_bytes ; ++i)
248 return out - 1 - out_;
251 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
252 dynamically allocated string in TO-encoding. Any characters which cannot be
253 converted will be represented by '?'.
255 LENGTH should be the length of the string or -1, if null terminated.
257 The returned string will be allocated on POOL.
259 This function's behaviour differs from that of g_convert_with_fallback
260 provided by GLib. The GLib function will fail (returns NULL) if any part of
261 the input string is not valid in the declared input encoding. This function
262 however perseveres even in the presence of badly encoded input. */
264 recode_string_pool (const char *to, const char *from,
265 const char *text, int length, struct pool *pool)
267 struct substring out;
273 length = strlen (text);
275 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
279 /* Returns the name of the encoding that should be used for file names.
281 This is meant to be the same encoding used by g_filename_from_uri() and
282 g_filename_to_uri() in GLib. */
284 filename_encoding (void)
286 #if defined _WIN32 || defined __WIN32__
289 return locale_charset ();
294 xconcat2 (const char *a, size_t a_len,
295 const char *b, size_t b_len)
297 char *s = xmalloc (a_len + b_len + 1);
298 memcpy (s, a, a_len);
299 memcpy (s + a_len, b, b_len);
300 s[a_len + b_len] = '\0';
304 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
305 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
306 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
307 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
308 HEAD and tries again, repeating as necessary until the concatenated result
309 fits or until HEAD_LEN reaches 0.
311 [*] Actually this function drops grapheme clusters instead of characters, so
312 that, e.g. a Unicode character followed by a combining accent character
313 is either completely included or completely excluded from HEAD_LEN. See
314 UAX #29 at http://unicode.org/reports/tr29/ for more information on
317 A null ENCODING is treated as UTF-8.
319 Sometimes this function has to actually construct the concatenated string to
320 measure its length. When this happens, it sets *RESULTP to that
321 null-terminated string, allocated with malloc(), for the caller to use if it
322 needs it. Otherwise, it sets *RESULTP to NULL.
324 Simple examples for encoding="UTF-8", max_len=6:
326 head="abc", tail="xyz" => 3
327 head="abcd", tail="xyz" => 3 ("d" dropped).
328 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
329 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
331 Examples for encoding="ISO-8859-1", max_len=6:
333 head="éèä", tail="xyz" => 6
334 (each letter in head is only 1 byte in ISO-8859-1 even though they
335 each take 2 bytes in UTF-8 encoding)
338 utf8_encoding_concat__ (const char *head, size_t head_len,
339 const char *tail, size_t tail_len,
340 const char *encoding, size_t max_len,
346 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
348 if (head_len + tail_len <= max_len)
350 else if (tail_len >= max_len)
360 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
362 ofs <= max_len - tail_len;
367 mblen = u8_mbtouc (&next,
368 CHAR_CAST (const uint8_t *, head + ofs),
370 if (uc_is_grapheme_break (prev, next))
383 result = (tail_len > 0
384 ? xconcat2 (head, head_len, tail, tail_len)
385 : CONST_CAST (char *, head));
386 if (recode_string_len (encoding, "UTF-8", result,
387 head_len + tail_len) <= max_len)
389 *resultp = result != head ? result : NULL;
394 bool correct_result = false;
401 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
408 mblen = u8_mbtouc (&next,
409 CHAR_CAST (const uint8_t *, head + ofs),
411 if (uc_is_grapheme_break (prev, next))
415 memcpy (result, head, ofs);
416 memcpy (result + ofs, tail, tail_len);
417 result[ofs + tail_len] = '\0';
420 if (recode_string_len (encoding, "UTF-8", result,
421 ofs + tail_len) <= max_len)
423 correct_result = true;
427 correct_result = false;
446 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
447 null-terminated string owned by the caller. HEAD, TAIL, and the returned
448 string are all encoded in UTF-8. As many characters[*] from the beginning
449 of HEAD are included as will fit within MAX_LEN bytes supposing that the
450 resulting string were to be re-encoded in ENCODING. All of TAIL is always
451 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
453 [*] Actually this function drops grapheme clusters instead of characters, so
454 that, e.g. a Unicode character followed by a combining accent character
455 is either completely included or completely excluded from the returned
456 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
457 information on grapheme clusters.
459 A null ENCODING is treated as UTF-8.
461 Simple examples for encoding="UTF-8", max_len=6:
463 head="abc", tail="xyz" => "abcxyz"
464 head="abcd", tail="xyz" => "abcxyz"
465 head="abc", tail="uvwxyz" => "uvwxyz"
466 head="abc", tail="tuvwxyz" => "tuvwxyz"
468 Examples for encoding="ISO-8859-1", max_len=6:
470 head="éèä", tail="xyz" => "éèäxyz"
471 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
472 each take 2 bytes in UTF-8 encoding)
475 utf8_encoding_concat (const char *head, const char *tail,
476 const char *encoding, size_t max_len)
478 size_t tail_len = strlen (tail);
482 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
483 encoding, max_len, &result);
484 return (result != NULL
486 : xconcat2 (head, prefix_len, tail, tail_len));
489 /* Returns the length, in bytes, of the string that would be returned by
490 utf8_encoding_concat() if passed the same arguments, but the implementation
491 is often more efficient. */
493 utf8_encoding_concat_len (const char *head, const char *tail,
494 const char *encoding, size_t max_len)
496 size_t tail_len = strlen (tail);
500 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
501 encoding, max_len, &result);
503 return prefix_len + tail_len;
506 /* Returns the number of display columns that would be occupied by the LENGTH
507 bytes of UTF-8 starting at S. */
509 utf8_count_columns (const char *s_, size_t length)
511 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
514 for (int ofs = 0; ofs < length; )
517 ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
520 int width = uc_width (uc, "UTF-8");
525 columns = ROUND_UP (columns + 1, 8);
530 /* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
531 display columns into the string. */
533 utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
535 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
539 for (ofs = 0; ofs < length && columns < n_columns; )
542 ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
545 int width = uc_width (uc, "UTF-8");
550 columns = ROUND_UP (columns + 1, 8);
555 /* Returns an allocated, null-terminated string, owned by the caller,
556 containing as many characters[*] from the beginning of S as would fit
557 within MAX_LEN bytes if the returned string were to be re-encoded in
558 ENCODING. Both S and the returned string are encoded in UTF-8.
560 [*] Actually this function drops grapheme clusters instead of characters, so
561 that, e.g. a Unicode character followed by a combining accent character
562 is either completely included or completely excluded from the returned
563 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
564 information on grapheme clusters.
566 A null ENCODING is treated as UTF-8.
569 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
571 return utf8_encoding_concat (s, "", encoding, max_len);
574 /* Returns the length, in bytes, of the string that would be returned by
575 utf8_encoding_trunc() if passed the same arguments, but the implementation
576 is often more efficient. */
578 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
580 return utf8_encoding_concat_len (s, "", encoding, max_len);
583 /* Returns FILENAME converted from UTF-8 to the filename encoding.
584 On Windows the filename encoding is UTF-8; elsewhere it is based on the
587 utf8_to_filename (const char *filename)
589 return recode_string (filename_encoding (), "UTF-8", filename, -1);
592 /* Returns FILENAME converted from the filename encoding to UTF-8.
593 On Windows the filename encoding is UTF-8; elsewhere it is based on the
596 filename_to_utf8 (const char *filename)
598 return recode_string ("UTF-8", filename_encoding (), filename, -1);
602 recode_substring_pool__ (const char *to, const char *from,
603 struct substring text, char fallbackchar,
604 struct pool *pool, struct substring *out)
607 struct converter *conv;
610 to = default_encoding;
613 from = default_encoding;
615 conv = create_iconv (to, from, true);
621 out->string = pool_malloc (pool, text.length + 1);
622 out->length = text.length;
623 memcpy (out->string, text.string, text.length);
624 out->string[out->length] = '\0';
631 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
633 char *output = pool_malloc (pool, bufsize);
636 retval = try_recode (conv, fallbackchar, text.string, text.length,
640 *out = ss_buffer (output, retval);
643 pool_free (pool, output);
645 if (retval != -E2BIG)
652 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
653 dynamically allocated string in TO-encoding. Any characters which cannot be
654 converted will be represented by '?'.
656 The returned string will be null-terminated and allocated on POOL with
659 This function's behaviour differs from that of g_convert_with_fallback
660 provided by GLib. The GLib function will fail (returns NULL) if any part of
661 the input string is not valid in the declared input encoding. This function
662 however perseveres even in the presence of badly encoded input. */
664 recode_substring_pool (const char *to, const char *from,
665 struct substring text, struct pool *pool)
667 struct substring out;
669 recode_substring_pool__ (to, from, text, '?', pool, &out);
673 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
674 dynamically allocated string in TO-encoding. On success, returns 0, and the
675 converted null-terminated string, allocated from POOL with pool_malloc(), is
676 stored in *OUT. On failure, returns a positive errno value.
678 The function fails with an error if any part of the input string is not
679 valid in the declared input encoding. */
681 recode_pedantically (const char *to, const char *from,
682 struct substring text, struct pool *pool,
683 struct substring *out)
687 error = recode_substring_pool__ (to, from, text, 0, pool, out);
696 setlocale (LC_ALL, "");
698 bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
700 textdomain (PACKAGE);
702 assert (default_encoding == NULL);
703 default_encoding = xstrdup (locale_charset ());
709 get_default_encoding (void)
711 return default_encoding;
715 set_default_encoding (const char *enc)
717 free (default_encoding);
718 default_encoding = xstrdup (enc);
721 /* Return the ISO two-letter code for the current LC_MESSAGES
726 const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
727 if (0 == strcmp (localename, "C"))
729 char *ln = xstrdup (localename);
730 char *end = strchr (ln, '_');
737 /* Attempts to set the encoding from a locale name.
738 Returns true if successful.
739 This function does not (should not!) alter the current locale.
742 set_encoding_from_locale (const char *loc)
747 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
749 setlocale (LC_CTYPE, "C");
750 c_encoding = xstrdup (locale_charset ());
752 setlocale (LC_CTYPE, loc);
753 loc_encoding = xstrdup (locale_charset ());
756 if (0 == strcmp (loc_encoding, c_encoding))
761 setlocale (LC_CTYPE, tmp);
767 free (default_encoding);
768 default_encoding = loc_encoding;
781 struct hmapx_node *node;
782 struct converter *cvtr;
784 HMAPX_FOR_EACH (cvtr, node, &map)
789 free (cvtr->fromcode);
790 if (cvtr->conv != (iconv_t) -1)
791 iconv_close (cvtr->conv);
795 hmapx_destroy (&map);
797 free (default_encoding);
798 default_encoding = NULL;
804 valid_encoding (const char *enc)
806 iconv_t conv = iconv_open (UTF8, enc);
808 if (conv == (iconv_t) -1)
817 /* Return the system locale's idea of the
818 decimal separator character */
820 get_system_decimal (void)
825 radix_char = nl_langinfo (RADIXCHAR)[0];
829 snprintf (buf, sizeof buf, "%f", 2.5);
838 uc_name (ucs4_t uc, char buffer[16])
840 if (uc >= 0x20 && uc < 0x7f)
841 snprintf (buffer, 16, "`%c'", uc);
843 snprintf (buffer, 16, "U+%04X", uc);
847 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
849 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
850 with lowercase and uppercase letters treated as equal, starting from
853 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
855 uint8_t folded_buf[2048];
856 size_t folded_len = sizeof folded_buf;
860 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
861 NULL, UNINORM_NFKD, folded_buf, &folded_len);
862 if (folded_s != NULL)
864 hash = hash_bytes (folded_s, folded_len, basis);
865 if (folded_s != folded_buf)
872 hash = hash_bytes (s, n, basis);
878 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
879 uppercase letters treated as equal, starting from BASIS. */
881 utf8_hash_case_string (const char *s, unsigned int basis)
883 return utf8_hash_case_bytes (s, strlen (s), basis);
886 /* Compares UTF-8 strings A and B case-insensitively.
887 Returns a negative value if A < B, zero if A == B, positive if A > B. */
889 utf8_strcasecmp (const char *a, const char *b)
891 return utf8_strncasecmp (a, strlen (a), b, strlen (b));
894 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
896 Returns a negative value if A < B, zero if A == B, positive if A > B. */
898 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
902 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
903 CHAR_CAST (const uint8_t *, b), bn,
904 NULL, UNINORM_NFKD, &result))
909 result = memcmp (a, b, MIN (an, bn));
911 result = an < bn ? -1 : an > bn;
918 is_all_digits (const uint8_t *s, size_t len)
920 for (size_t i = 0; i < len; i++)
921 if (!c_isdigit (s[i]))
926 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
927 number, then they are compared numerically. Returns a negative value if A <
928 B, zero if A == B, positive if A > B. */
930 utf8_strverscasecmp (const char *a, const char *b)
934 size_t a_len = sizeof a_stub;
935 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
936 UNINORM_NFKD, a_stub, &a_len);
940 size_t b_len = sizeof b_stub;
941 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
942 UNINORM_NFKD, b_stub, &b_len);
945 if (!a_norm || !b_norm)
947 result = strcmp (a, b);
951 size_t len = MIN (a_len, b_len);
952 for (size_t i = 0; i < len; i++)
953 if (a_norm[i] != b_norm[i])
955 /* If both strings end in digits, compare them numerically. */
956 if (is_all_digits (&a_norm[i], a_len - i)
957 && is_all_digits (&b_norm[i], b_len - i))
959 /* Start by stripping leading zeros, since those don't matter for
960 numerical comparison. */
962 for (ap = i; ap < a_len; ap++)
963 if (a_norm[ap] != '0')
965 for (bp = i; bp < b_len; bp++)
966 if (b_norm[bp] != '0')
969 /* The number with more digits, if there is one, is larger. */
970 size_t a_digits = a_len - ap;
971 size_t b_digits = b_len - bp;
972 if (a_digits != b_digits)
973 result = a_digits > b_digits ? 1 : -1;
975 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
978 result = a_norm[i] > b_norm[i] ? 1 : -1;
981 result = a_len < b_len ? -1 : a_len > b_len;
984 if (a_norm != a_stub)
986 if (b_norm != b_stub)
992 utf8_casemap (const char *s,
993 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
994 uint8_t *, size_t *))
999 result = CHAR_CAST (char *,
1000 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
1001 NULL, NULL, NULL, &size));
1004 if (errno == ENOMEM)
1007 result = xstrdup (s);
1013 utf8_to_upper (const char *s)
1015 return utf8_casemap (s, u8_toupper);
1019 utf8_to_lower (const char *s)
1021 return utf8_casemap (s, u8_tolower);
1025 utf8_to_title (const char *s)
1027 return utf8_casemap (s, u8_totitle);
1031 get_encoding_info (struct encoding_info *e, const char *name)
1033 const struct substring in = SS_LITERAL_INITIALIZER (
1035 "!\"#$%&'()*+,-./0123456789:;<=>?@"
1036 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
1037 "abcdefghijklmnopqrstuvwxyz{|}~");
1039 struct substring out, cr, lf, space;
1042 memset (e, 0, sizeof *e);
1044 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
1045 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
1046 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
1047 ok = (cr.length >= 1
1048 && cr.length <= MAX_UNIT
1049 && cr.length == lf.length
1050 && cr.length == space.length);
1053 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1056 ss_dealloc (&space);
1057 ss_alloc_substring (&cr, ss_cstr ("\r"));
1058 ss_alloc_substring (&lf, ss_cstr ("\n"));
1059 ss_alloc_substring (&space, ss_cstr (" "));
1062 e->unit = cr.length;
1063 memcpy (e->cr, cr.string, e->unit);
1064 memcpy (e->lf, lf.string, e->unit);
1065 memcpy (e->space, space.string, e->unit);
1069 ss_dealloc (&space);
1071 out = recode_substring_pool ("UTF-8", name, in, NULL);
1072 e->is_ascii_compatible = ss_equals (in, out);
1075 if (!e->is_ascii_compatible && e->unit == 1)
1077 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1078 e->is_ebcdic_compatible = (out.length == 1
1079 && (uint8_t) out.string[0] == 0xc1);
1083 e->is_ebcdic_compatible = false;
1089 is_encoding_ascii_compatible (const char *encoding)
1091 struct encoding_info e;
1093 get_encoding_info (&e, encoding);
1094 return e.is_ascii_compatible;
1098 is_encoding_ebcdic_compatible (const char *encoding)
1100 struct encoding_info e;
1102 get_encoding_info (&e, encoding);
1103 return e.is_ebcdic_compatible;
1106 /* Returns true if iconv can convert ENCODING to and from UTF-8,
1109 is_encoding_supported (const char *encoding)
1111 return (create_iconv ("UTF-8", encoding, false)
1112 && create_iconv (encoding, "UTF-8", false));
1115 /* Returns true if E is the name of a UTF-8 encoding.
1117 XXX Possibly we should test not E as a string but its properties via
1120 is_encoding_utf8 (const char *e)
1122 return ((e[0] == 'u' || e[0] == 'U')
1123 && (e[1] == 't' || e[1] == 'T')
1124 && (e[2] == 'f' || e[2] == 'F')
1125 && ((e[3] == '8' && e[4] == '\0')
1126 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
1129 static struct encoding_category *categories;
1130 static int n_categories;
1132 static void SENTINEL (0)
1133 add_category (size_t *allocated_categories, const char *category, ...)
1135 struct encoding_category *c;
1136 const char *encodings[16];
1140 /* Count encoding arguments. */
1141 va_start (args, category);
1143 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1145 const char *encoding = encodings[n];
1146 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1149 assert (n < sizeof encodings / sizeof *encodings);
1155 if (n_categories >= *allocated_categories)
1156 categories = x2nrealloc (categories,
1157 allocated_categories, sizeof *categories);
1159 c = &categories[n_categories++];
1160 c->category = category;
1161 c->encodings = xmalloc (n * sizeof *c->encodings);
1162 for (i = 0; i < n; i++)
1163 c->encodings[i] = encodings[i];
1168 init_encoding_categories (void)
1178 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1179 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1180 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1182 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1183 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1184 "Windows-1257", NULL_SENTINEL);
1185 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1186 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1187 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1188 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1189 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1190 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1191 "EUC-TW", NULL_SENTINEL);
1192 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1193 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1194 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1195 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1196 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1198 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1199 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1200 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1201 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1202 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1204 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1205 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1206 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1207 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1209 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1211 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1212 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1214 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1215 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1217 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1219 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1220 "Windows-1258", NULL_SENTINEL);
1221 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1222 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1225 /* Returns an array of "struct encoding_category" that contains only the
1226 categories and encodings that the system supports. */
1227 struct encoding_category *
1228 get_encoding_categories (void)
1230 init_encoding_categories ();
1234 /* Returns the number of elements in the array returned by
1235 get_encoding_categories(). */
1237 get_n_encoding_categories (void)
1239 init_encoding_categories ();
1240 return n_categories;