1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
3 2016, 2021 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "libpspp/i18n.h"
34 #include "libpspp/assertion.h"
35 #include "libpspp/compiler.h"
36 #include "libpspp/hmapx.h"
37 #include "libpspp/hash-functions.h"
38 #include "libpspp/misc.h"
39 #include "libpspp/pool.h"
40 #include "libpspp/str.h"
41 #include "libpspp/version.h"
43 #include "gl/c-ctype.h"
44 #include "gl/c-strcase.h"
45 #include "gl/localcharset.h"
46 #include <gl/localename.h>
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49 #include "gl/relocatable.h"
50 #include "gl/xstrndup.h"
53 #define _(msgid) gettext (msgid)
63 static char *default_encoding;
64 static struct hmapx map;
66 /* A wrapper around iconv_open */
67 static struct converter *
68 create_iconv (const char* tocode, const char* fromcode, bool warn)
71 struct hmapx_node *node;
72 struct converter *converter;
75 hash = hash_string (tocode, hash_string (fromcode, 0));
76 HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
81 if (!strcmp (tocode, converter->tocode)
82 && !strcmp (fromcode, converter->fromcode))
86 converter = xmalloc (sizeof *converter);
87 converter->tocode = xstrdup (tocode);
88 converter->fromcode = xstrdup (fromcode);
89 converter->conv = iconv_open (tocode, fromcode);
90 int error = converter->conv == (iconv_t) ~0 ? errno : 0;
91 /* I don't think it's safe to translate this string or to use messaging
92 as the converters have not yet been set up */
93 if (error && strcmp (tocode, fromcode))
98 "cannot create a converter for `%s' to `%s': %s\n",
99 fromcode, tocode, strerror (error));
101 free (converter->tocode);
102 free (converter->fromcode);
105 hmapx_insert (&map, NULL, hash);
109 /* Find out how many bytes there are in a null char in the target
111 iconv_t bconv = iconv_open (tocode, "ASCII");
112 if (bconv != (iconv_t) -1)
114 ICONV_CONST char inbuf[1] = "";
115 ICONV_CONST char *inptr = inbuf;
116 size_t inbytes = sizeof inbuf;
119 char *outptr = outbuf;
120 size_t outbytes = sizeof outbuf;
121 if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
122 converter->null_char_width = outptr - outbuf;
126 hmapx_insert (&map, converter, hash);
132 /* Converts the single byte C from encoding FROM to TO, returning the first
135 This function probably shouldn't be used at all, but some code still does
138 recode_byte (const char *to, const char *from, char c)
141 char *s = recode_string (to, from, &c, 1);
147 /* Similar to recode_string_pool, but allocates the returned value on the heap
148 instead of in a pool. It is the caller's responsibility to free the
151 recode_string (const char *to, const char *from,
152 const char *text, int length)
154 return recode_string_pool (to, from, text, length, NULL);
157 /* Returns the length, in bytes, of the string that a similar recode_string()
158 call would return. */
160 recode_string_len (const char *to, const char *from,
161 const char *text, int length)
163 char *s = recode_string (to, from, text, length);
164 size_t len = strlen (s);
169 /* Uses CVTR to convert the INBYTES starting at IN into the OUTBYTES starting
170 at OUT_, and appends a null terminator to the output.
172 Returns the output length if successful, -1 if the output buffer is too
175 try_recode (struct converter *cvtr, char fallbackchar,
176 const char *in, size_t inbytes,
177 char *out_, size_t outbytes)
182 int null_bytes = cvtr->null_char_width;
184 /* Put the converter into the initial shift state, in case there was any
185 state information left over from its last usage. */
186 iconv (cvtr->conv, NULL, 0, NULL, 0);
188 /* Do two rounds of iconv() calls:
190 - The first round does the bulk of the conversion using the
191 caller-supplied input data.
193 - The second round flushes any leftover output. This has a real effect
194 with input encodings that use combining diacritics, e.g. without the
195 second round the last character tends to get dropped when converting
196 from windows-1258 to other encodings.
198 for (i = 0; i < 2; i++)
200 ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in;
201 size_t *inbytesp = i ? NULL : &inbytes;
203 while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1)
207 if (outbytes < null_bytes + 1)
211 *out++ = fallbackchar;
212 for (j = 0 ; j < null_bytes ; ++j)
214 return out - 1 - out_;
221 *out++ = fallbackchar;
234 /* should never happen */
235 fprintf (stderr, "Character conversion error: %s\n",
242 if (outbytes <= null_bytes - 1)
245 for (i = 0 ; i < null_bytes ; ++i)
248 return out - 1 - out_;
251 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
252 dynamically allocated string in TO-encoding. Any characters which cannot be
253 converted will be represented by '?'.
255 LENGTH should be the length of the string, or -1 if it is null-terminated.
257 The returned string will be allocated on POOL.
259 This function's behaviour differs from that of g_convert_with_fallback
260 provided by GLib. The GLib function will fail (returns NULL) if any part of
261 the input string is not valid in the declared input encoding. This function
262 however perseveres even in the presence of badly encoded input. */
264 recode_string_pool (const char *to, const char *from,
265 const char *text, int length, struct pool *pool)
267 struct substring out;
273 length = strlen (text);
275 out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
279 /* Returns the name of the encoding that should be used for file names.
281 This is meant to be the same encoding used by g_filename_from_uri() and
282 g_filename_to_uri() in GLib. */
284 filename_encoding (void)
286 #if defined _WIN32 || defined __WIN32__
289 return locale_charset ();
294 xconcat2 (const char *a, size_t a_len,
295 const char *b, size_t b_len)
297 char *s = xmalloc (a_len + b_len + 1);
298 memcpy (s, a, a_len);
299 memcpy (s + a_len, b, b_len);
300 s[a_len + b_len] = '\0';
304 /* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
305 TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
306 ENCODING. If the re-encoded result is no more than MAX_LEN bytes long, then
307 it returns HEAD_LEN. Otherwise, it drops one character[*] from the end of
308 HEAD and tries again, repeating as necessary until the concatenated result
309 fits or until HEAD_LEN reaches 0.
311 [*] Actually this function drops grapheme clusters instead of characters, so
312 that, e.g. a Unicode character followed by a combining accent character
313 is either completely included or completely excluded from HEAD_LEN. See
314 UAX #29 at http://unicode.org/reports/tr29/ for more information on
317 A null ENCODING is treated as UTF-8.
319 Sometimes this function has to actually construct the concatenated string to
320 measure its length. When this happens, it sets *RESULTP to that
321 null-terminated string, allocated with malloc(), for the caller to use if it
322 needs it. Otherwise, it sets *RESULTP to NULL.
324 Simple examples for encoding="UTF-8", max_len=6:
326 head="abc", tail="xyz" => 3
327 head="abcd", tail="xyz" => 3 ("d" dropped).
328 head="abc", tail="uvwxyz" => 0 ("abc" dropped).
329 head="abc", tail="tuvwxyz" => 0 ("abc" dropped).
331 Examples for encoding="ISO-8859-1", max_len=6:
333 head="éèä", tail="xyz" => 6
334 (each letter in head is only 1 byte in ISO-8859-1 even though they
335 each take 2 bytes in UTF-8 encoding)
338 utf8_encoding_concat__ (const char *head, size_t head_len,
339 const char *tail, size_t tail_len,
340 const char *encoding, size_t max_len,
346 else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
348 if (head_len + tail_len <= max_len)
350 else if (tail_len >= max_len)
360 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
362 ofs <= max_len - tail_len;
367 mblen = u8_mbtouc (&next,
368 CHAR_CAST (const uint8_t *, head + ofs),
370 if (uc_is_grapheme_break (prev, next))
383 result = (tail_len > 0
384 ? xconcat2 (head, head_len, tail, tail_len)
385 : CONST_CAST (char *, head));
386 if (recode_string_len (encoding, "UTF-8", result,
387 head_len + tail_len) <= max_len)
389 *resultp = result != head ? result : NULL;
394 bool correct_result = false;
401 for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
408 mblen = u8_mbtouc (&next,
409 CHAR_CAST (const uint8_t *, head + ofs),
411 if (uc_is_grapheme_break (prev, next))
415 memcpy (result, head, ofs);
416 memcpy (result + ofs, tail, tail_len);
417 result[ofs + tail_len] = '\0';
420 if (recode_string_len (encoding, "UTF-8", result,
421 ofs + tail_len) <= max_len)
423 correct_result = true;
427 correct_result = false;
446 /* Concatenates a prefix of HEAD with all of TAIL and returns the result as a
447 null-terminated string owned by the caller. HEAD, TAIL, and the returned
448 string are all encoded in UTF-8. As many characters[*] from the beginning
449 of HEAD are included as will fit within MAX_LEN bytes supposing that the
450 resulting string were to be re-encoded in ENCODING. All of TAIL is always
451 included, even if TAIL by itself is longer than MAX_LEN in ENCODING.
453 [*] Actually this function drops grapheme clusters instead of characters, so
454 that, e.g. a Unicode character followed by a combining accent character
455 is either completely included or completely excluded from the returned
456 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
457 information on grapheme clusters.
459 A null ENCODING is treated as UTF-8.
461 Simple examples for encoding="UTF-8", max_len=6:
463 head="abc", tail="xyz" => "abcxyz"
464 head="abcd", tail="xyz" => "abcxyz"
465 head="abc", tail="uvwxyz" => "uvwxyz"
466 head="abc", tail="tuvwxyz" => "tuvwxyz"
468 Examples for encoding="ISO-8859-1", max_len=6:
470 head="éèä", tail="xyz" => "éèäxyz"
471 (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
472 each take 2 bytes in UTF-8 encoding)
475 utf8_encoding_concat (const char *head, const char *tail,
476 const char *encoding, size_t max_len)
478 size_t tail_len = strlen (tail);
482 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
483 encoding, max_len, &result);
484 return (result != NULL
486 : xconcat2 (head, prefix_len, tail, tail_len));
489 /* Returns the length, in bytes, of the string that would be returned by
490 utf8_encoding_concat() if passed the same arguments, but the implementation
491 is often more efficient. */
493 utf8_encoding_concat_len (const char *head, const char *tail,
494 const char *encoding, size_t max_len)
496 size_t tail_len = strlen (tail);
500 prefix_len = utf8_encoding_concat__ (head, strlen (head), tail, tail_len,
501 encoding, max_len, &result);
503 return prefix_len + tail_len;
506 /* Returns the number of display columns that would be occupied by the LENGTH
507 bytes of UTF-8 starting at S. */
509 utf8_count_columns (const char *s_, size_t length)
511 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
514 for (int ofs = 0; ofs < length; )
517 ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
520 int width = uc_width (uc, "UTF-8");
525 columns = ROUND_UP (columns + 1, 8);
530 /* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
531 display columns into the string. */
533 utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
535 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
539 for (ofs = 0; ofs < length && columns < n_columns; )
542 ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
545 int width = uc_width (uc, "UTF-8");
550 columns = ROUND_UP (columns + 1, 8);
555 /* Returns an allocated, null-terminated string, owned by the caller,
556 containing as many characters[*] from the beginning of S that would fit
557 within MAX_LEN bytes if the returned string were to be re-encoded in
558 ENCODING. Both S and the returned string are encoded in UTF-8.
560 [*] Actually this function drops grapheme clusters instead of characters, so
561 that, e.g. a Unicode character followed by a combining accent character
562 is either completely included or completely excluded from the returned
563 string. See UAX #29 at http://unicode.org/reports/tr29/ for more
564 information on grapheme clusters.
566 A null ENCODING is treated as UTF-8.
569 utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
571 return utf8_encoding_concat (s, "", encoding, max_len);
574 /* Returns the length, in bytes, of the string that would be returned by
575 utf8_encoding_trunc() if passed the same arguments, but the implementation
576 is often more efficient. */
578 utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
580 return utf8_encoding_concat_len (s, "", encoding, max_len);
583 /* Returns FILENAME converted from UTF-8 to the filename encoding.
584 On Windows the filename encoding is UTF-8; elsewhere it is based on the
587 utf8_to_filename (const char *filename)
589 return recode_string (filename_encoding (), "UTF-8", filename, -1);
592 /* Returns FILENAME converted from the filename encoding to UTF-8.
593 On Windows the filename encoding is UTF-8; elsewhere it is based on the
596 filename_to_utf8 (const char *filename)
598 return recode_string ("UTF-8", filename_encoding (), filename, -1);
602 recode_substring_pool__ (const char *to, const char *from,
603 struct substring text, char fallbackchar,
604 struct pool *pool, struct substring *out)
607 struct converter *conv;
610 to = default_encoding;
613 from = default_encoding;
615 conv = create_iconv (to, from, true);
621 out->string = pool_malloc (pool, text.length + 1);
622 out->length = text.length;
623 memcpy (out->string, text.string, text.length);
624 out->string[out->length] = '\0';
631 for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2)
633 char *output = pool_malloc (pool, bufsize);
636 retval = try_recode (conv, fallbackchar, text.string, text.length,
640 *out = ss_buffer (output, retval);
643 pool_free (pool, output);
645 if (retval != -E2BIG)
652 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
653 dynamically allocated string in TO-encoding. Any characters which cannot be
654 converted will be represented by '?'.
656 The returned string will be null-terminated and allocated on POOL with
659 This function's behaviour differs from that of g_convert_with_fallback
660 provided by GLib. The GLib function will fail (returns NULL) if any part of
661 the input string is not valid in the declared input encoding. This function
662 however perseveres even in the presence of badly encoded input. */
664 recode_substring_pool (const char *to, const char *from,
665 struct substring text, struct pool *pool)
667 struct substring out;
669 recode_substring_pool__ (to, from, text, '?', pool, &out);
673 /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
674 dynamically allocated string in TO-encoding. On success, returns 0, and the
675 converted null-terminated string, allocated from POOL with pool_malloc(), is
676 stored in *OUT. On failure, returns a positive errno value.
678 The function fails with an error if any part of the input string is not
679 valid in the declared input encoding. */
681 recode_pedantically (const char *to, const char *from,
682 struct substring text, struct pool *pool,
683 struct substring *out)
687 error = recode_substring_pool__ (to, from, text, 0, pool, out);
696 setlocale (LC_ALL, "");
698 bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
700 textdomain (PACKAGE);
702 assert (default_encoding == NULL);
703 default_encoding = xstrdup (locale_charset ());
709 get_default_encoding (void)
711 return default_encoding;
715 set_default_encoding (const char *enc)
717 free (default_encoding);
718 default_encoding = xstrdup (enc);
721 /* Return the ISO two-letter code for the current LC_MESSAGES
726 const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
727 if (0 == strcmp (localename, "C"))
729 char *ln = xstrdup (localename);
730 char *end = strchr (ln, '_');
737 /* Attempts to set the encoding from a locale name.
738 Returns true if successful.
739 This function does not (should not!) alter the current locale.
742 set_encoding_from_locale (const char *loc)
747 char *tmp = xstrdup (setlocale (LC_CTYPE, NULL));
749 setlocale (LC_CTYPE, "C");
750 c_encoding = xstrdup (locale_charset ());
752 setlocale (LC_CTYPE, loc);
753 loc_encoding = xstrdup (locale_charset ());
756 if (0 == strcmp (loc_encoding, c_encoding))
761 setlocale (LC_CTYPE, tmp);
767 free (default_encoding);
768 default_encoding = loc_encoding;
781 struct hmapx_node *node;
782 struct converter *cvtr;
784 HMAPX_FOR_EACH (cvtr, node, &map)
789 free (cvtr->fromcode);
790 if (cvtr->conv != (iconv_t) -1)
791 iconv_close (cvtr->conv);
795 hmapx_destroy (&map);
797 free (default_encoding);
798 default_encoding = NULL;
804 valid_encoding (const char *enc)
806 iconv_t conv = iconv_open (UTF8, enc);
808 if (conv == (iconv_t) -1)
817 /* Return the system locale's idea of the
818 decimal separator character */
820 get_system_decimal (void)
825 radix_char = nl_langinfo (RADIXCHAR)[0];
829 snprintf (buf, sizeof buf, "%f", 2.5);
838 uc_name (ucs4_t uc, char buffer[16])
840 if (uc >= 0x20 && uc < 0x7f)
841 snprintf (buffer, 16, "`%c'", uc);
843 snprintf (buffer, 16, "U+%04X", uc);
847 /* UTF-8 functions that deal with uppercase/lowercase distinctions. */
849 /* Returns a hash value for the N bytes of UTF-8 encoded data starting at S,
850 with lowercase and uppercase letters treated as equal, starting from
853 utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis)
855 uint8_t folded_buf[2048];
856 size_t folded_len = sizeof folded_buf;
860 folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n,
861 NULL, UNINORM_NFKD, folded_buf, &folded_len);
862 if (folded_s != NULL)
864 hash = hash_bytes (folded_s, folded_len, basis);
865 if (folded_s != folded_buf)
872 hash = hash_bytes (s, n, basis);
878 /* Returns a hash value for null-terminated UTF-8 string S, with lowercase and
879 uppercase letters treated as equal, starting from BASIS. */
881 utf8_hash_case_string (const char *s, unsigned int basis)
883 return utf8_hash_case_substring (ss_cstr (s), basis);
886 /* Returns a hash value for UTF-8 string S, with lowercase and uppercase
887 letters treated as equal, starting from BASIS. */
889 utf8_hash_case_substring (struct substring s, unsigned int basis)
891 return utf8_hash_case_bytes (s.string, s.length, basis);
894 /* Compares UTF-8 strings A and B case-insensitively.
895 Returns a negative value if A < B, zero if A == B, positive if A > B. */
897 utf8_strcasecmp (const char *a, const char *b)
899 return utf8_sscasecmp (ss_cstr (a), ss_cstr (b));
903 utf8_sscasecmp (struct substring a, struct substring b)
905 return utf8_strncasecmp (a.string, a.length, b.string, b.length);
908 /* Compares UTF-8 strings A (with length AN) and B (with length BN)
910 Returns a negative value if A < B, zero if A == B, positive if A > B. */
912 utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn)
916 if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an,
917 CHAR_CAST (const uint8_t *, b), bn,
918 NULL, UNINORM_NFKD, &result))
923 result = memcmp (a, b, MIN (an, bn));
925 result = an < bn ? -1 : an > bn;
932 is_all_digits (const uint8_t *s, size_t len)
934 for (size_t i = 0; i < len; i++)
935 if (!c_isdigit (s[i]))
940 /* Compares UTF-8 strings A and B case-insensitively. If the strings end in a
941 number, then they are compared numerically. Returns a negative value if A <
942 B, zero if A == B, positive if A > B. */
944 utf8_strverscasecmp (const char *a, const char *b)
948 size_t a_len = sizeof a_stub;
949 uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL,
950 UNINORM_NFKD, a_stub, &a_len);
954 size_t b_len = sizeof b_stub;
955 uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL,
956 UNINORM_NFKD, b_stub, &b_len);
959 if (!a_norm || !b_norm)
961 result = strcmp (a, b);
965 size_t len = MIN (a_len, b_len);
966 for (size_t i = 0; i < len; i++)
967 if (a_norm[i] != b_norm[i])
969 /* If both strings end in digits, compare them numerically. */
970 if (is_all_digits (&a_norm[i], a_len - i)
971 && is_all_digits (&b_norm[i], b_len - i))
973 /* Start by stripping leading zeros, since those don't matter for
974 numerical comparison. */
976 for (ap = i; ap < a_len; ap++)
977 if (a_norm[ap] != '0')
979 for (bp = i; bp < b_len; bp++)
980 if (b_norm[bp] != '0')
983 /* The number with more digits, if there is one, is larger. */
984 size_t a_digits = a_len - ap;
985 size_t b_digits = b_len - bp;
986 if (a_digits != b_digits)
987 result = a_digits > b_digits ? 1 : -1;
989 result = memcmp (&a_norm[ap], &b_norm[bp], a_digits);
992 result = a_norm[i] > b_norm[i] ? 1 : -1;
995 result = a_len < b_len ? -1 : a_len > b_len;
998 if (a_norm != a_stub)
1000 if (b_norm != b_stub)
1006 utf8_casemap (const char *s,
1007 uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t,
1008 uint8_t *, size_t *))
1013 result = CHAR_CAST (char *,
1014 f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1,
1015 NULL, NULL, NULL, &size));
1018 if (errno == ENOMEM)
1021 result = xstrdup (s);
1027 utf8_to_upper (const char *s)
1029 return utf8_casemap (s, u8_toupper);
1033 utf8_to_lower (const char *s)
1035 return utf8_casemap (s, u8_tolower);
1039 utf8_to_title (const char *s)
1041 return utf8_casemap (s, u8_totitle);
1045 get_encoding_info (struct encoding_info *e, const char *name)
1047 const struct substring in = SS_LITERAL_INITIALIZER (
1049 "!\"#$%&'()*+,-./0123456789:;<=>?@"
1050 "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
1051 "abcdefghijklmnopqrstuvwxyz{|}~");
1053 struct substring out, cr, lf, space;
1056 memset (e, 0, sizeof *e);
1058 cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
1059 lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
1060 space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
1061 ok = (cr.length >= 1
1062 && cr.length <= MAX_UNIT
1063 && cr.length == lf.length
1064 && cr.length == space.length);
1067 fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
1070 ss_dealloc (&space);
1071 ss_alloc_substring (&cr, ss_cstr ("\r"));
1072 ss_alloc_substring (&lf, ss_cstr ("\n"));
1073 ss_alloc_substring (&space, ss_cstr (" "));
1076 e->unit = cr.length;
1077 memcpy (e->cr, cr.string, e->unit);
1078 memcpy (e->lf, lf.string, e->unit);
1079 memcpy (e->space, space.string, e->unit);
1083 ss_dealloc (&space);
1085 out = recode_substring_pool ("UTF-8", name, in, NULL);
1086 e->is_ascii_compatible = ss_equals (in, out);
1089 if (!e->is_ascii_compatible && e->unit == 1)
1091 out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
1092 e->is_ebcdic_compatible = (out.length == 1
1093 && (uint8_t) out.string[0] == 0xc1);
1097 e->is_ebcdic_compatible = false;
1103 is_encoding_ascii_compatible (const char *encoding)
1105 struct encoding_info e;
1107 get_encoding_info (&e, encoding);
1108 return e.is_ascii_compatible;
1112 is_encoding_ebcdic_compatible (const char *encoding)
1114 struct encoding_info e;
1116 get_encoding_info (&e, encoding);
1117 return e.is_ebcdic_compatible;
1120 /* Returns true if iconv can convert ENCODING to and from UTF-8,
1123 is_encoding_supported (const char *encoding)
1125 return (create_iconv ("UTF-8", encoding, false)
1126 && create_iconv (encoding, "UTF-8", false));
1129 /* Returns true if E is the name of a UTF-8 encoding.
1131 XXX Possibly we should test not E as a string but its properties via
1134 is_encoding_utf8 (const char *e)
1136 return ((e[0] == 'u' || e[0] == 'U')
1137 && (e[1] == 't' || e[1] == 'T')
1138 && (e[2] == 'f' || e[2] == 'F')
1139 && ((e[3] == '8' && e[4] == '\0')
1140 || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
1143 static struct encoding_category *categories;
1144 static int n_categories;
1146 static void SENTINEL (0)
1147 add_category (size_t *allocated_categories, const char *category, ...)
1149 struct encoding_category *c;
1150 const char *encodings[16];
1154 /* Count encoding arguments. */
1155 va_start (args, category);
1157 while ((encodings[n] = va_arg (args, const char *)) != NULL)
1159 const char *encoding = encodings[n];
1160 if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding))
1163 assert (n < sizeof encodings / sizeof *encodings);
1169 if (n_categories >= *allocated_categories)
1170 categories = x2nrealloc (categories,
1171 allocated_categories, sizeof *categories);
1173 c = &categories[n_categories++];
1174 c->category = category;
1175 c->encodings = xmalloc (n * sizeof *c->encodings);
1176 for (i = 0; i < n; i++)
1177 c->encodings[i] = encodings[i];
1182 init_encoding_categories (void)
1192 add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1193 "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL);
1194 add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256",
1196 add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL);
1197 add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4",
1198 "Windows-1257", NULL_SENTINEL);
1199 add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL);
1200 add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2",
1201 "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL);
1202 add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK",
1203 "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL);
1204 add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS",
1205 "EUC-TW", NULL_SENTINEL);
1206 add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL);
1207 add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111",
1208 "KOI8-R", "MacCyrillic", NULL_SENTINEL);
1209 add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL);
1210 add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian",
1212 add_category (&alloc, _("Georgian"), "GEOSTD8", NULL_SENTINEL);
1213 add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL);
1214 add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL);
1215 add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL);
1216 add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255",
1218 add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL);
1219 add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL);
1220 add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL);
1221 add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS",
1223 add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC",
1225 add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL);
1226 add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian",
1228 add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL);
1229 add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874",
1231 add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254",
1233 add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS",
1234 "Windows-1258", NULL_SENTINEL);
1235 add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15",
1236 "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL);
1239 /* Returns an array of "struct encoding_category" that contains only the
1240 categories and encodings that the system supports. */
1241 struct encoding_category *
1242 get_encoding_categories (void)
1244 init_encoding_categories ();
1248 /* Returns the number of elements in the array returned by
1249 get_encoding_categories(). */
1251 get_n_encoding_categories (void)
1253 init_encoding_categories ();
1254 return n_categories;