From a2f806b656f4f5f0693292a814dc992d5f083b46 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 10 Feb 2014 20:59:49 -0800 Subject: [PATCH] i18n: Correctly recode windows-1258 and others with combining diacritics. Without this change, recoding a string from windows-1258 often drops the last character. --- src/libpspp/i18n.c | 90 ++++++++++++++++++++++++++----------------- tests/libpspp/i18n.at | 5 +++ 2 files changed, 60 insertions(+), 35 deletions(-) diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index cdcf57003b..10b3927f9e 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -149,52 +149,72 @@ recode_string_len (const char *to, const char *from, small. */ static ssize_t try_recode (iconv_t conv, - const char *ip, size_t inbytes, - char *op_, size_t outbytes) + const char *in, size_t inbytes, + char *out_, size_t outbytes) { /* FIXME: Need to ensure that this char is valid in the target encoding */ const char fallbackchar = '?'; - char *op = op_; + char *out = out_; + int i; /* Put the converter into the initial shift state, in case there was any state information left over from its last usage. 
*/ iconv (conv, NULL, 0, NULL, 0); - while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes, - &op, &outbytes) == -1) - switch (errno) - { - case EINVAL: - if (outbytes < 2) - return -1; - *op++ = fallbackchar; - *op = '\0'; - return op - op_; - - case EILSEQ: - if (outbytes == 0) - return -1; - *op++ = fallbackchar; - outbytes--; - ip++; - inbytes--; - break; - - case E2BIG: - return -1; - - default: - /* should never happen */ - fprintf (stderr, "Character conversion error: %s\n", strerror (errno)); - NOT_REACHED (); - break; - } + /* Do two rounds of iconv() calls: + + - The first round does the bulk of the conversion using the + caller-supplied input data. + + - The second round flushes any leftover output. This has a real effect + with input encodings that use combining diacritics, e.g. without the + second round the last character tends to get dropped when converting + from windows-1258 to other encodings. + */ + for (i = 0; i < 2; i++) + { + ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in; + size_t *inbytesp = i ? 
NULL : &inbytes; + + while (iconv (conv, inp, inbytesp, &out, &outbytes) == -1) + switch (errno) + { + case EINVAL: + if (outbytes < 2) + return -1; + *out++ = fallbackchar; + *out = '\0'; + return out - out_; + + case EILSEQ: + if (outbytes == 0) + return -1; + *out++ = fallbackchar; + outbytes--; + if (inp) + { + in++; + inbytes--; + } + break; + + case E2BIG: + return -1; + + default: + /* should never happen */ + fprintf (stderr, "Character conversion error: %s\n", + strerror (errno)); + NOT_REACHED (); + break; + } + } if (outbytes == 0) return -1; - *op = '\0'; - return op - op_; + *out = '\0'; + return out - out_; } /* Converts the string TEXT, which should be encoded in FROM-encoding, to a diff --git a/tests/libpspp/i18n.at b/tests/libpspp/i18n.at index 5f4bb65961..974e76f590 100644 --- a/tests/libpspp/i18n.at +++ b/tests/libpspp/i18n.at @@ -32,6 +32,11 @@ CHECK_I18N_RECODE([invalid UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1], CHECK_I18N_RECODE([truncated UTF-8 to ISO-8559-1], [UTF-8], [ISO-8859-1], [xy\302], [xy?]) +# Checks for a bug that caused the last character to be dropped in conversions +# from encodings that have combining diacritics (e.g. windows-1258). +CHECK_I18N_RECODE([dropped final character in windows-1258], [windows-1258], + [UTF8], [aeiou], [aeiou]) + dnl The input to this test is 7 bytes long and the expected output is 9 bytes. dnl So it should exercise the E2BIG case CHECK_I18N_RECODE([from ISO-8859-1 to UTF-8 with overflow], -- 2.30.2