+#include <unigbrk.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/hmapx.h"
+#include "libpspp/hash-functions.h"
+#include "libpspp/pool.h"
+#include "libpspp/str.h"
+#include "libpspp/version.h"
+
+#include "gl/c-strcase.h"
+#include "gl/localcharset.h"
+#include "gl/xalloc.h"
+#include "gl/relocatable.h"
+#include "gl/xstrndup.h"
+
+struct converter   /* One cached conversion descriptor; instances are stored in MAP. */
+ {
+ char *tocode;   /* Name of the target encoding (a private copy made with xstrdup). */
+ char *fromcode;   /* Name of the source encoding (a private copy made with xstrdup). */
+ iconv_t conv;   /* Result of iconv_open (tocode, fromcode); (iconv_t) -1 if it failed. */
+ };
+
+static char *default_encoding;   /* NOTE(review): presumably the default encoding name; its users are outside this excerpt -- confirm. */
+static struct hmapx map;   /* Cache of struct converter objects, hashed on the (tocode, fromcode) pair by create_iconv(). */
+
+/* A caching wrapper around iconv_open().
+
+   Returns a conversion descriptor that converts from encoding FROMCODE to
+   encoding TOCODE.  The first request for a given pair opens the descriptor
+   and stores it in MAP; later requests for the same pair return the stored
+   descriptor without calling iconv_open() again.
+
+   If iconv_open() fails, the (iconv_t) -1 result is cached and returned like
+   any other, so callers must check for it.  In that case a warning is also
+   written to stderr, unless TOCODE and FROMCODE are the same string.  Because
+   the failure is cached, the warning is issued at most once per pair. */
+static iconv_t
+create_iconv (const char* tocode, const char* fromcode)
+{
+  size_t hash;
+  struct hmapx_node *node;
+  struct converter *converter;
+  int open_errno;
+
+  assert (tocode);
+  assert (fromcode);
+
+  hash = hash_string (tocode, hash_string (fromcode, 0));
+  HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
+    if (!strcmp (tocode, converter->tocode)
+        && !strcmp (fromcode, converter->fromcode))
+      return converter->conv;
+
+  converter = xmalloc (sizeof *converter);
+  converter->tocode = xstrdup (tocode);
+  converter->fromcode = xstrdup (fromcode);
+  converter->conv = iconv_open (tocode, fromcode);
+
+  /* Save errno before anything else runs: hmapx_insert() below may allocate
+     memory and overwrite the value that iconv_open() set. */
+  open_errno = errno;
+
+  hmapx_insert (&map, converter, hash);
+
+  /* I don't think it's safe to translate this string or to use messaging
+     as the converters have not yet been set up */
+  if ((iconv_t) -1 == converter->conv && 0 != strcmp (tocode, fromcode))
+    fprintf (stderr,
+             "Warning: "
+             "cannot create a converter for `%s' to `%s': %s\n",
+             fromcode, tocode, strerror (open_errno));
+
+  return converter->conv;
+}
+
+/* Recodes the one-byte string consisting of C from encoding FROM to encoding
+   TO and returns the first byte of the converted text; any further output
+   bytes are discarded.
+
+   This function probably shouldn't be used at all, but some code still does
+   use it. */
+char
+recode_byte (const char *to, const char *from, char c)
+{
+  char *converted = recode_string (to, from, &c, 1);
+  const char first = converted[0];
+
+  free (converted);
+  return first;
+}
+
+/* Heap-allocating variant of recode_string_pool(): forwards TO, FROM, TEXT
+   and LENGTH to it with a null pool.  The caller owns the returned string and
+   must free it. */
+char *
+recode_string (const char *to, const char *from,
+               const char *text, int length)
+{
+  char *converted = recode_string_pool (to, from, text, length, NULL);
+
+  return converted;
+}
+
+/* Returns the length, in bytes, of the string that a similar recode_string()
+   call would return.  The length is measured with strlen(), so it does not
+   count the null terminator.
+
+   Returns 0 if recode_string() yields a null pointer, which is what it does
+   for a null TEXT. */
+size_t
+recode_string_len (const char *to, const char *from,
+                   const char *text, int length)
+{
+  char *s = recode_string (to, from, text, length);
+  size_t len;
+
+  /* Nothing was converted; calling strlen() on a null pointer would be
+     undefined behavior. */
+  if (s == NULL)
+    return 0;
+
+  len = strlen (s);
+  free (s);
+  return len;
+}
+
+/* Runs the INBYTES bytes starting at IP through CONV, storing the converted
+   text in the OUTBYTES-byte buffer starting at OP_ and null-terminating it.
+
+   Input that iconv() rejects with EILSEQ is replaced by '?' and skipped one
+   byte at a time.  If iconv() reports EINVAL, a final '?' is appended and
+   conversion stops there.
+
+   Returns the number of bytes stored, not counting the null terminator, or
+   -1 if the output buffer is too small. */
+static ssize_t
+try_recode (iconv_t conv,
+            const char *ip, size_t inbytes,
+            char *op_, size_t outbytes)
+{
+  /* FIXME: Need to ensure that this char is valid in the target encoding */
+  const char replacement = '?';
+  char *out = op_;
+
+  /* Return the converter to its initial shift state, discarding anything
+     left over from an earlier conversion. */
+  iconv (conv, NULL, 0, NULL, 0);
+
+  while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes,
+                &out, &outbytes) == (size_t) -1)
+    {
+      const int err = errno;
+
+      if (err == E2BIG)
+        return -1;
+      else if (err == EINVAL)
+        {
+          /* Room is needed for both the replacement and the terminator. */
+          if (outbytes < 2)
+            return -1;
+          *out++ = replacement;
+          *out = '\0';
+          return out - op_;
+        }
+      else if (err == EILSEQ)
+        {
+          if (outbytes == 0)
+            return -1;
+
+          /* Emit one replacement, step over one input byte, and retry. */
+          *out++ = replacement;
+          outbytes--;
+          ip++;
+          inbytes--;
+        }
+      else
+        {
+          /* should never happen */
+          fprintf (stderr, "Character conversion error: %s\n", strerror (err));
+          NOT_REACHED ();
+        }
+    }
+
+  /* Everything was converted, but the terminator still needs a byte. */
+  if (outbytes == 0)
+    return -1;
+
+  *out = '\0';
+  return out - op_;
+}
+
+/* Recodes TEXT, which should be encoded in FROM-encoding, into a newly
+   allocated string in TO-encoding.  Any characters which cannot be converted
+   will be represented by '?'.
+
+   LENGTH is the number of bytes in TEXT, or -1 if TEXT is null-terminated and
+   its length should be measured with strlen().
+
+   The returned string will be allocated on POOL.  A null TEXT yields a null
+   result.
+
+   Unlike GLib's g_convert_with_fallback, which fails (returns NULL) if any
+   part of the input is not valid in the declared input encoding, this
+   function perseveres even in the presence of badly encoded input. */
+char *
+recode_string_pool (const char *to, const char *from,
+                    const char *text, int length, struct pool *pool)
+{
+  if (text != NULL)
+    {
+      const int n_bytes = length != -1 ? length : (int) strlen (text);
+
+      return recode_substring_pool (to, from, ss_buffer (text, n_bytes),
+                                    pool).string;
+    }
+
+  return NULL;
+}
+
+/* Returns the name of the encoding to use for file names: "UTF-8" when built
+   for Windows, otherwise whatever locale_charset() reports.
+
+   This is meant to be the same encoding used by g_filename_from_uri() and
+   g_filename_to_uri() in GLib. */
+static const char *
+filename_encoding (void)
+{
+#if !(defined _WIN32 || defined __WIN32__)
+  return locale_charset ();
+#else
+  return "UTF-8";
+#endif
+}
+
+/* Returns a null-terminated string, allocated with xmalloc(), that holds the
+   A_LEN bytes at A followed by the B_LEN bytes at B. */
+static char *
+xconcat2 (const char *a, size_t a_len,
+          const char *b, size_t b_len)
+{
+  const size_t total = a_len + b_len;
+  char *joined = xmalloc (total + 1);
+  char *second_half = joined + a_len;
+
+  memcpy (joined, a, a_len);
+  memcpy (second_half, b, b_len);
+  joined[total] = '\0';
+
+  return joined;
+}
+
+/* Conceptually, this function concatenates HEAD_LEN-byte string HEAD and
+   TAIL_LEN-byte string TAIL, both encoded in UTF-8, then converts them to
+   ENCODING.  If the re-encoded result is no more than MAX_LEN bytes long, then
+   it returns HEAD_LEN.  Otherwise, it drops one character[*] from the end of
+   HEAD and tries again, repeating as necessary until the concatenated result
+   fits or until HEAD_LEN reaches 0.
+
+   [*] Actually this function drops grapheme clusters instead of characters, so
+       that, e.g. a Unicode character followed by a combining accent character
+       is either completely included or completely excluded from HEAD_LEN.  See
+       UAX #29 at http://unicode.org/reports/tr29/ for more information on
+       grapheme clusters.
+
+   A null ENCODING is treated as UTF-8.
+
+   Sometimes this function has to actually construct the concatenated string to
+   measure its length.  When this happens, it sets *RESULTP to that
+   null-terminated string, allocated with malloc(), for the caller to use if it
+   needs it.  Otherwise, it sets *RESULTP to NULL.
+
+   Simple examples for encoding="UTF-8", max_len=6:
+
+       head="abc",  tail="xyz"     => 3
+       head="abcd", tail="xyz"     => 3 ("d" dropped).
+       head="abc",  tail="uvwxyz"  => 0 ("abc" dropped).
+       head="abc",  tail="tuvwxyz" => 0 ("abc" dropped).
+
+   Examples for encoding="ISO-8859-1", max_len=6:
+
+       head="éèä",  tail="xyz"     => 6
+         (each letter in head is only 1 byte in ISO-8859-1 even though they
+          each take 2 bytes in UTF-8 encoding)
+*/
+static size_t
+utf8_encoding_concat__ (const char *head, size_t head_len,
+                        const char *tail, size_t tail_len,
+                        const char *encoding, size_t max_len,
+                        char **resultp)
+{
+  *resultp = NULL;
+  if (head_len == 0)
+    return 0;
+  else if (encoding == NULL || !c_strcasecmp (encoding, "UTF-8"))
+    {
+      /* No recoding is needed, so byte counts can be compared directly. */
+      if (head_len + tail_len <= max_len)
+        return head_len;
+      else if (tail_len >= max_len)
+        return 0;
+      else
+        {
+          size_t copy_len;
+          /* Must be ucs4_t, not size_t: u8_mbtouc() stores through a
+             ucs4_t pointer. */
+          ucs4_t prev;
+          size_t ofs;
+          int mblen;
+
+          /* Walk HEAD one character at a time, remembering the last grapheme
+             cluster boundary that still leaves room for TAIL.  Here
+             max_len - tail_len is less than head_len, so OFS always stays
+             inside HEAD. */
+          copy_len = 0;
+          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
+                                head_len);
+               ofs <= max_len - tail_len;
+               ofs += mblen)
+            {
+              ucs4_t next;
+
+              mblen = u8_mbtouc (&next,
+                                 CHAR_CAST (const uint8_t *, head + ofs),
+                                 head_len - ofs);
+              if (uc_is_grapheme_break (prev, next))
+                copy_len = ofs;
+
+              prev = next;
+            }
+
+          return copy_len;
+        }
+    }
+  else
+    {
+      char *result;
+
+      /* With an empty TAIL the concatenation is just HEAD, so no copy is
+         made and RESULT aliases HEAD; it is then never written or freed. */
+      result = (tail_len > 0
+                ? xconcat2 (head, head_len, tail, tail_len)
+                : CONST_CAST (char *, head));
+      if (recode_string_len (encoding, "UTF-8", result,
+                             head_len + tail_len) <= max_len)
+        {
+          *resultp = result != head ? result : NULL;
+          return head_len;
+        }
+      else
+        {
+          /* True only while RESULT holds the concatenation that corresponds
+             to COPY_LEN, i.e. the most recently tested candidate fit. */
+          bool correct_result = false;
+          size_t copy_len;
+          /* Must be ucs4_t, not size_t: u8_mbtouc() stores through a
+             ucs4_t pointer. */
+          ucs4_t prev;
+          size_t ofs;
+          int mblen;
+
+          /* NOTE(review): the last iteration can run with ofs == head_len,
+             which calls u8_mbtouc() on head + head_len with a length of 0.
+             Confirm that HEAD is always followed by a readable byte. */
+          copy_len = 0;
+          for (ofs = u8_mbtouc (&prev, CHAR_CAST (const uint8_t *, head),
+                                head_len);
+               ofs <= head_len;
+               ofs += mblen)
+            {
+              ucs4_t next;
+
+              mblen = u8_mbtouc (&next,
+                                 CHAR_CAST (const uint8_t *, head + ofs),
+                                 head_len - ofs);
+              if (uc_is_grapheme_break (prev, next))
+                {
+                  /* Rebuild the candidate: the first OFS bytes of HEAD,
+                     then TAIL. */
+                  if (result != head)
+                    {
+                      memcpy (result, head, ofs);
+                      memcpy (result + ofs, tail, tail_len);
+                      result[ofs + tail_len] = '\0';
+                    }
+
+                  if (recode_string_len (encoding, "UTF-8", result,
+                                         ofs + tail_len) <= max_len)
+                    {
+                      correct_result = true;
+                      copy_len = ofs;
+                    }
+                  else
+                    correct_result = false;
+                }
+
+              prev = next;
+            }
+
+          /* Hand the buffer to the caller only if it matches COPY_LEN;
+             otherwise release it. */
+          if (result != head)
+            {
+              if (correct_result)
+                *resultp = result;
+              else
+                free (result);
+            }
+
+          return copy_len;
+        }
+    }
+}