unistd \
unictype/property-id-continue \
unictype/property-id-start \
+ unigbrk/uc-is-grapheme-break \
unistr/u8-cpy \
unistr/u8-mbtouc \
unistr/u8-strlen \
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <unigbrk.h>
#include "libpspp/assertion.h"
#include "libpspp/hmapx.h"
#include "libpspp/str.h"
#include "libpspp/version.h"
+#include "gl/c-strcase.h"
#include "gl/localcharset.h"
#include "gl/xalloc.h"
#include "gl/relocatable.h"
return out.string;
}
+/* Reports which character encoding file names are expected to use.
+
+   On Windows this is always UTF-8; on every other platform it is whatever
+   locale_charset() reports.  The choice is intended to agree with the
+   encoding used by GLib's g_filename_from_uri() and g_filename_to_uri(). */
+static const char *
+filename_encoding (void)
+{
+#if !defined _WIN32 && !defined __WIN32__
+  return locale_charset ();
+#else
+  return "UTF-8";
+#endif
+}
+
+/* Returns a newly allocated, null-terminated string made of the A_LEN bytes
+   at A followed by the B_LEN bytes at B.  The buffer comes from xmalloc() and
+   is handed to the caller. */
+static char *
+xconcat2 (const char *a, size_t a_len,
+          const char *b, size_t b_len)
+{
+  size_t total = a_len + b_len;
+  char *joined = xmalloc (total + 1);
+  char *p = joined;
+
+  memcpy (p, a, a_len);
+  p += a_len;
+  memcpy (p, b, b_len);
+  p[b_len] = '\0';
+
+  return joined;
+}
+
+/* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
+   TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
+   ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
+   it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
+   HEAD and tries again, repeating as necessary until the concatenated result
+   fits or until HEAD_LEN reaches 0.
+
+   [*] Actually this function drops grapheme clusters instead of characters, so
+       that, e.g. a Unicode character followed by a combining accent character
+       is either completely included or completely excluded from HEAD_LEN.  See
+       UAX #29 at http://unicode.org/reports/tr29/ for more information on
+       grapheme clusters.
+
+   A null ENCODING is treated as UTF-8.
+
+   The return value is the number of bytes at the start of HEAD to keep.  It
+   is 0 if HEAD is empty or if not even one grapheme cluster of HEAD fits.
+
+   Sometimes this function has to actually construct the concatenated string to
+   measure its length.  When this happens and the string left in the buffer is
+   exactly the kept prefix of HEAD followed by TAIL, it sets *RESULTP to that
+   null-terminated string, allocated with malloc(), for the caller to use if it
+   needs it.  Otherwise, it sets *RESULTP to NULL.  (When TAIL_LEN is 0 nothing
+   is ever allocated, so *RESULTP is always NULL.)
+
+   Simple examples for encoding="UTF-8", max_len=6:
+
+       head="abc",  tail="xyz"     => 3
+       head="abcd", tail="xyz"     => 3 ("d" dropped).
+       head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
+       head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
+
+   Examples for encoding="ISO-8859-1", max_len=6:
+
+       head="éèä",  tail="xyz"     => 6
+         (each letter in head is only 1 byte in ISO-8859-1 even though they
+          each take 2 bytes in UTF-8 encoding)
+*/
+static size_t
+utf8_encoding_concat__ (const char *head, size_t head_len,
+                        const char *tail, size_t tail_len,
+                        const char *encoding, size_t max_len,
+                        char **resultp)
+{
+  *resultp = NULL;
+  if (head_len == 0)
+    return 0;
+  else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
+    {
+      /* Target encoding is UTF-8 itself, so byte counts can be compared
+         directly without recoding anything. */
+      if (head_len + tail_len <= max_len)
+        return head_len;
+      else if (tail_len >= max_len)
+        return 0;
+      else
+        {
+          size_t copy_len;
+          ucs4_t prev;          /* Must be ucs4_t: u8_mbtouc() stores into it. */
+          size_t ofs;
+          int mblen;
+
+          /* Here head_len > max_len - tail_len >= 1, so every u8_mbtouc()
+             call below is given at least one byte of HEAD. */
+          copy_len = 0;
+          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
+                                head_len);
+               ofs <= max_len - tail_len;
+               ofs += mblen)
+            {
+              ucs4_t next;
+
+              mblen = u8_mbtouc (&next,
+                                 CHAR_CAST (const uint8_t *, head + ofs),
+                                 head_len - ofs);
+
+              /* OFS is a legal cut point only at a grapheme cluster
+                 boundary. */
+              if (uc_is_grapheme_break (prev, next))
+                copy_len = ofs;
+
+              prev = next;
+            }
+
+          return copy_len;
+        }
+    }
+  else
+    {
+      char *result;
+
+      /* With an empty tail the "concatenation" is HEAD itself, so measure it
+         in place instead of copying it. */
+      result = (tail_len > 0
+                ? xconcat2 (head, head_len, tail, tail_len)
+                : CONST_CAST (char *, head));
+      if (recode_string_len (encoding, "UTF-8", result,
+                             head_len + tail_len) <= max_len)
+        {
+          *resultp = result != head ? result : NULL;
+          return head_len;
+        }
+      else
+        {
+          /* True only while RESULT holds the COPY_LEN-byte prefix of HEAD
+             followed by TAIL. */
+          bool correct_result = false;
+          size_t copy_len;
+          ucs4_t prev;          /* Must be ucs4_t: u8_mbtouc() stores into it. */
+          size_t ofs;
+          int mblen;
+
+          /* Try each grapheme cluster boundary strictly inside HEAD.  The
+             boundary at HEAD_LEN itself was already measured above and found
+             too long, and stopping before it keeps u8_mbtouc() from being
+             called with zero bytes available. */
+          copy_len = 0;
+          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
+                                head_len);
+               ofs < head_len;
+               ofs += mblen)
+            {
+              ucs4_t next;
+
+              mblen = u8_mbtouc (&next,
+                                 CHAR_CAST (const uint8_t *, head + ofs),
+                                 head_len - ofs);
+              if (uc_is_grapheme_break (prev, next))
+                {
+                  /* Rebuild the candidate "prefix + TAIL" in the scratch
+                     buffer.  With an empty tail RESULT aliases HEAD and the
+                     explicit length passed below selects the prefix. */
+                  if (result != head)
+                    {
+                      memcpy (result, head, ofs);
+                      memcpy (result + ofs, tail, tail_len);
+                      result[ofs + tail_len] = '\0';
+                    }
+
+                  if (recode_string_len (encoding, "UTF-8", result,
+                                         ofs + tail_len) <= max_len)
+                    {
+                      correct_result = true;
+                      copy_len = ofs;
+                    }
+                  else
+                    correct_result = false;
+                }
+
+              prev = next;
+            }
+
+          /* Hand the scratch buffer to the caller only if it still holds the
+             answer; otherwise release it. */
+          if (result != head)
+            {
+              if (correct_result)
+                *resultp = result;
+              else
+                free (result);
+            }
+
+          return copy_len;
+        }
+    }
+}
+
+/* Builds and returns a new null-terminated string, owned by the caller, that
+   consists of a prefix of HEAD followed by the whole of TAIL.  HEAD, TAIL, and
+   the returned string are all UTF-8.  The prefix is as many characters[*] from
+   the start of HEAD as still let the result fit in MAX_LEN bytes once it is
+   re-encoded in ENCODING.  TAIL is never shortened, even when TAIL alone is
+   already longer than MAX_LEN in ENCODING.
+
+   [*] The unit actually dropped is a grapheme cluster, not a single
+       character, so that, e.g. a Unicode character followed by a combining
+       accent character is kept or dropped as a whole.  See UAX #29 at
+       http://unicode.org/reports/tr29/ for more information on grapheme
+       clusters.
+
+   A null ENCODING is treated as UTF-8.
+
+   Simple examples for encoding="UTF-8", max_len=6:
+
+       head="abc",  tail="xyz"     => "abcxyz"
+       head="abcd", tail="xyz"     => "abcxyz"
+       head="abc",  tail="uvwxyz"  => "uvwxyz"
+       head="abc",  tail="tuvwxyz" => "tuvwxyz"
+
+   Examples for encoding="ISO-8859-1", max_len=6:
+
+       head="éèä",  tail="xyz"    => "éèäxyz"
+         (each letter in HEAD is only 1 byte in ISO-8859-1 even though they
+          each take 2 bytes in UTF-8 encoding)
+*/
+char *
+utf8_encoding_concat (const char *head, const char *tail,
+                      const char *encoding, size_t max_len)
+{
+  size_t head_len = strlen (head);
+  size_t tail_len = strlen (tail);
+  char *prebuilt;
+  size_t keep;
+
+  keep = utf8_encoding_concat__ (head, head_len, tail, tail_len,
+                                 encoding, max_len, &prebuilt);
+
+  /* The helper may already have assembled the answer while measuring it. */
+  if (prebuilt != NULL)
+    return prebuilt;
+
+  return xconcat2 (head, keep, tail, tail_len);
+}
+
+/* Computes the number of bytes in the string that utf8_encoding_concat() would
+   return for the same arguments.  No string is handed back to the caller, and
+   the work involved is often less than actually building it. */
+size_t
+utf8_encoding_concat_len (const char *head, const char *tail,
+                          const char *encoding, size_t max_len)
+{
+  size_t head_len = strlen (head);
+  size_t tail_len = strlen (tail);
+  char *scratch;
+  size_t keep;
+
+  keep = utf8_encoding_concat__ (head, head_len, tail, tail_len,
+                                 encoding, max_len, &scratch);
+
+  /* Only the length is wanted; discard any string the helper built. */
+  free (scratch);
+
+  return keep + tail_len;
+}
+
+/* Truncates UTF-8 string S so that it would occupy at most MAX_LEN bytes if
+   re-encoded in ENCODING, keeping as many characters[*] from the start of S
+   as fit.  The result is a newly allocated, null-terminated UTF-8 string
+   owned by the caller.
+
+   [*] The unit actually dropped is a grapheme cluster, not a single
+       character, so that, e.g. a Unicode character followed by a combining
+       accent character is kept or dropped as a whole.  See UAX #29 at
+       http://unicode.org/reports/tr29/ for more information on grapheme
+       clusters.
+
+   A null ENCODING is treated as UTF-8.
+*/
+char *
+utf8_encoding_trunc (const char *s, const char *encoding, size_t max_len)
+{
+  /* Truncation is just concatenation with an empty tail. */
+  const char *empty_tail = "";
+
+  return utf8_encoding_concat (s, empty_tail, encoding, max_len);
+}
+
+/* Computes the number of bytes in the string that utf8_encoding_trunc() would
+   return for the same arguments, often with less work than building that
+   string. */
+size_t
+utf8_encoding_trunc_len (const char *s, const char *encoding, size_t max_len)
+{
+  /* Measuring a truncation is measuring a concatenation with an empty tail. */
+  const char *empty_tail = "";
+
+  return utf8_encoding_concat_len (s, empty_tail, encoding, max_len);
+}
+
+/* Recodes FILENAME, given in UTF-8, into the encoding used for file names and
+   returns the result of recode_string().  The file name encoding is UTF-8 on
+   Windows and is derived from the current locale everywhere else. */
+char *
+utf8_to_filename (const char *filename)
+{
+  const char *target = filename_encoding ();
+
+  return recode_string (target, "UTF-8", filename, -1);
+}
+
+/* Recodes FILENAME, given in the encoding used for file names, into UTF-8 and
+   returns the result of recode_string().  The file name encoding is UTF-8 on
+   Windows and is derived from the current locale everywhere else. */
+char *
+filename_to_utf8 (const char *filename)
+{
+  const char *source = filename_encoding ();
+
+  return recode_string ("UTF-8", source, filename, -1);
+}
+
/* Converts the string TEXT, which should be encoded in FROM-encoding, to a
dynamically allocated string in TO-encoding. Any characters which cannot be
converted will be represented by '?'.
hmapx_init (&map);
}
-
const char *
get_default_encoding (void)
{
size_t recode_string_len (const char *to, const char *from,
const char *text, int len);
+char *utf8_encoding_trunc (const char *, const char *encoding,
+ size_t max_len);
+size_t utf8_encoding_trunc_len (const char *, const char *encoding,
+ size_t max_len);
+
+char *utf8_encoding_concat (const char *head, const char *tail,
+ const char *encoding, size_t max_len);
+size_t utf8_encoding_concat_len (const char *head, const char *tail,
+ const char *encoding, size_t max_len);
+
+char *utf8_to_filename (const char *filename);
+char *filename_to_utf8 (const char *filename);
+
bool valid_encoding (const char *enc);
char get_system_decimal (void);
int
main (int argc, char *argv[]) /* Test driver: runs the "recode" or "concat" subcommand named by argv[1]. */
{
- char *s;
+ i18n_init ();
+
+ if (argc == 5 && !strcmp (argv[1], "recode")) /* i18n-test recode FROM TO STRING */
+ {
+ const char *from = argv[2];
+ const char *to = argv[3];
+ const char *string = argv[4];
+ char *result = recode_string (to, from, string, -1);
+ puts (result);
+ assert (strlen (result) == recode_string_len (to, from, string, -1)); /* length-only variant must agree */
+ free (result);
+ }
+ else if (argc == 6 && !strcmp (argv[1], "concat")) /* i18n-test concat HEAD TAIL ENCODING MAX_LEN */
+ {
+ const char *head = argv[2];
+ const char *tail = argv[3];
+ const char *encoding = argv[4];
+ int max_len = atoi (argv[5]); /* NOTE(review): atoi() reports no errors; a negative value converts to a huge size_t below */
+ char *result;
- if (argc != 4)
+ result = utf8_encoding_concat (head, tail, encoding, max_len);
+ puts (result);
+
+ assert (strlen (result)
+ == utf8_encoding_concat_len (head, tail, encoding, max_len)); /* length-only variant must agree */
+
+ if (tail[0] == '\0') /* empty TAIL: the trunc functions must give the same answer as concat */
+ {
+ char *result2 = utf8_encoding_trunc (head, encoding, max_len);
+ assert (!strcmp (result, result2));
+ assert (strlen (result2)
+ == utf8_encoding_trunc_len (head, encoding, max_len));
+ free (result2);
+ }
+
+ free (result);
+ }
+ else /* unrecognized command line: print usage for both subcommands and fail */
{
- fprintf (stderr,
- "usage: %s FROM TO STRING\n"
- "where FROM is the source encoding,\n"
- " TO is the target encoding,\n"
- " and STRING is the text to recode.\n",
- argv[0]);
+ fprintf (stderr, "\
+usage: %s recode FROM TO STRING\n\
+where FROM is the source encoding,\n\
+ TO is the target encoding,\n\
+ and STRING is the text to recode.\n\
+\n\
+usage: %s concat HEAD TAIL ENCODING MAX_LEN\n\
+where HEAD is the first string to concatenate\n\
+ TAIL is the second string to concatenate\n\
+ ENCODING is the encoding in which to measure the result's length\n\
+ MAX_LEN is the maximum length of the result in ENCODING.\n",
+ argv[0], argv[0]);
return EXIT_FAILURE;
}
- i18n_init ();
- s = recode_string (argv[2], argv[1], argv[3], -1);
- puts (s);
- assert (strlen (s) == recode_string_len (argv[2], argv[1], argv[3], -1));
- free (s);
+ i18n_done (); /* reached only after a successful subcommand; the usage path returns earlier */
return 0;
}
-AT_BANNER([i18n routines])
+AT_BANNER([i18n recoding])
-# CHECK_I18N([TITLE], [FROM-CODING], [TO-CODING], [FROM-TEXT], [TO-TEXT])
+m4_divert_push([PREPARE_TESTS])
+dnl supports_encodings ENCODING...
+dnl
+dnl Returns 0 if this host is expected to know every ENCODING, otherwise 77,
+dnl the exit status that makes the enclosing check skip the test.  Prints
+dnl nothing on stdout, so callers need not ignore its output.
+supports_encodings () {
+  case "$host" in
+    *-*-linux* | *-*-*-gnu*)
+      dnl GNU/Linux always has the encodings we want.  We can't ask
+      dnl config.charset about them because it has a special case here
+      dnl too and won't tell us.
+      return 0
+      ;;
+    *)
+      dnl Look up each requested encoding in turn.  The match text is
+      dnl discarded so that a successful lookup leaves stdout empty.
+      for encoding in "$@"; do
+        $SHELL $top_srcdir/gl/config.charset "$host" | grep "$encoding" >/dev/null || return 77
+      done
+      ;;
+  esac
+}
+m4_divert_pop([PREPARE_TESTS])
+
+# CHECK_I18N_RECODE([TITLE], [FROM-CODING], [TO-CODING],
+#                   [FROM-TEXT], [TO-TEXT])
#
# Converts FROM-TEXT from FROM-CODING to TO-CODING and checks that the result
-# is TO-TEXT. The "printf" program is applied to both FROM-TEXT and TO-TEXT
-# to allow for backslash-escapes. (Be aware that hex escapes are not portable;
-# use octal escapes instead.)
-m4_define([CHECK_I18N],
+# is TO-TEXT.  The "printf" program is applied to both FROM-TEXT and TO-TEXT to
+# allow for backslash-escapes.  (Hex escapes are not portable; use octal
+# escapes instead.)  The test is set up under the name "convert TITLE" and
+# runs the "recode" subcommand of i18n-test.
+m4_define([CHECK_I18N_RECODE],
[AT_SETUP([convert $1])
AT_KEYWORDS([i18n])
dnl Skip the test if this host doesn't know the source and target encodings.
- AT_CHECK(
- [case "$host" in
- *-*-linux* | *-*-*-gnu*)
- dnl GNU/Linux always has the encodings we want. We can't ask
- dnl config.charset about them because it has a special case here
- dnl too and won't tell us.
- ;;
- *)
- $SHELL $top_srcdir/gl/config.charset "$host" | grep '$2' || exit 77
- $SHELL $top_srcdir/gl/config.charset "$host" | grep '$3' || exit 77
- ;;
- esac
- ], [0], [ignore])
- AT_CHECK_UNQUOTED([i18n-test '$2' '$3' `printf '$4'`], [0], [`printf '$5'`
+ AT_CHECK([supports_encodings '$2' '$3'])
+ AT_CHECK_UNQUOTED([i18n-test recode '$2' '$3' `printf '$4'`], [0], [`printf '$5'`
])
AT_CLEANUP])
-CHECK_I18N([reflexively], [ASCII], [ASCII], [abc], [abc])
-CHECK_I18N([without any change], [ASCII], [UTF-8], [abc], [abc])
+CHECK_I18N_RECODE([reflexively], [ASCII], [ASCII], [abc], [abc])
+CHECK_I18N_RECODE([without any change], [ASCII], [UTF-8], [abc], [abc])
-CHECK_I18N([from ISO-8859-1 to UTF-8], [ISO-8859-1], [UTF-8],
- [\242], [\302\242])
-CHECK_I18N([from UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
- [\302\242], [\242])
+CHECK_I18N_RECODE([from ISO-8859-1 to UTF-8], [ISO-8859-1], [UTF-8],
+ [\242], [\302\242])
+CHECK_I18N_RECODE([from UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
+ [\302\242], [\242])
# 0xc0 == 0300 is invalid in UTF-8
-CHECK_I18N([invalid UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
- [xy\300z], [xy?z])
+CHECK_I18N_RECODE([invalid UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
+ [xy\300z], [xy?z])
# 0xc2 == 0302 is the first byte of a 2-byte UTF-8 sequence
-CHECK_I18N([truncated UTF-8 to ISO-8559-1], [UTF-8], [ISO-8859-1],
- [xy\302], [xy?])
+CHECK_I18N_RECODE([truncated UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
+                  [xy\302], [xy?])
dnl The input to this test is 7 bytes long and the expected output is 9 bytes.
dnl So it should exercise the E2BIG case
-CHECK_I18N([from ISO-8859-1 to UTF-8 with overflow], [ISO-8859-1], [UTF-8],
- [Tsch\374\337!], [Tsch\303\274\303\237!])
+CHECK_I18N_RECODE([from ISO-8859-1 to UTF-8 with overflow],
+ [ISO-8859-1], [UTF-8],
+ [Tsch\374\337!], [Tsch\303\274\303\237!])
+\f
+AT_BANNER([i18n concatenation])
+
+# CHECK_I18N_CONCAT([HEAD], [TAIL], [ENCODING], [MAX-LEN], [ANSWER])
+#
+# Concatenates HEAD and TAIL, omitting as many characters from HEAD as needed
+# to make the result come out to no more than MAX-LEN bytes if it was expressed
+# in ENCODING, and checks that the answer matches ANSWER.  HEAD, TAIL, and
+# ANSWER are all in UTF-8.  The "printf" program is applied to HEAD, TAIL, and
+# ANSWER to allow for backslash-escapes.  (Hex escapes are not portable; use
+# octal escapes instead.)
+#
+# The test title mentions only HEAD when TAIL is empty.  The test runs the
+# "concat" subcommand of i18n-test.
+m4_define([CHECK_I18N_CONCAT],
+ [AT_SETUP([m4_if([$2], [], [truncate "$1" to $4 bytes in $3],
+ [truncate "$1" + "$2" to $4 bytes in $3])])
+ AT_KEYWORDS([i18n])
+
+ dnl Skip the test if this host doesn't know the encoding.
+ AT_CHECK([supports_encodings '$3'])
+ AT_CHECK_UNQUOTED(
+ [i18n-test concat "`printf '$1'`" "`printf '$2'`" '$3' '$4'], [0],
+ [`printf '$5'`
+])
+ AT_CLEANUP])
+
+CHECK_I18N_CONCAT([abc], [], [UTF-8], [6], [abc])
+CHECK_I18N_CONCAT([], [xyz], [UTF-8], [6], [xyz])
+CHECK_I18N_CONCAT([], [], [UTF-8], [6], [])
+CHECK_I18N_CONCAT([abcdefghij], [], [UTF-8], [6], [abcdef])
+CHECK_I18N_CONCAT([], [tuvwxyz], [UTF-8], [6], [tuvwxyz])
+
+CHECK_I18N_CONCAT([abc], [xyz], [UTF-8], [6], [abcxyz])
+CHECK_I18N_CONCAT([abcd], [xyz], [UTF-8], [6], [abcxyz])
+CHECK_I18N_CONCAT([abc], [uvwxyz], [UTF-8], [6], [uvwxyz])
+
+# x in a box ( x⃞ ) is U+0078, U+20DE, 4 bytes in UTF-8, and one grapheme
+# cluster.
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [0], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [1], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [2], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [3], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [4], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [5], [x\342\203\236y])
+# éèä is only 3 bytes in ISO-8859-1.
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [0], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [1], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [2], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [3], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [4],
+ [\303\251xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [5],
+ [\303\251\303\250xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [6],
+ [\303\251\303\250\303\244xyz])