+2010-11-13 Bruno Haible <bruno@clisp.org>
+
+ unistr/u8-mbtouc: Improve handling of ill-formed UTF-8 input.
+ * lib/unistr/u8-mbtouc.c (u8_mbtouc): For an invalid or incomplete
+ multibyte character, return the number of bytes that belong together,
+ rather than always 1 (invalid) or the remaining length n (incomplete).
+ * lib/unistr/u8-mbtouc-unsafe.c (u8_mbtouc_unsafe): Likewise.
+ * lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Likewise.
+ * lib/unistr/u8-mbtouc-unsafe-aux.c (u8_mbtouc_unsafe_aux): Likewise.
+ * lib/unistr/u8-mbsnlen.c (u8_mbsnlen): Use u8_mbtouc to determine the
+ number of bytes of an invalid character.
+ * tests/unistr/test-u8-mbtouc.c (test_safe_function): New function.
+ (main): Invoke it.
+ * tests/unistr/test-u8-mbtouc.h (test_function): Update two test results.
+ * tests/unistr/test-u8-mbsnlen.c (main): Test various kinds of
+ malformed byte sequences.
+ * modules/unistr/u8-mbtouc (configure.ac): Bump version number.
+ * modules/unistr/u8-mbtouc-unsafe (configure.ac): Likewise.
+ * modules/unistr/u8-mbsnlen (configure.ac): Likewise.
+ Reported by Ben Pfaff and Paolo Bonzini.
+
2010-11-13 Bruno Haible <bruno@clisp.org>
openat: Work around glibc bug with fchownat() and empty file names.
characters++;
if (count == -2)
break;
- if (count <= 0)
+ if (count < 0)
+ count = u8_mbtouc (&uc, s, n);
+ else if (count == 0)
count = 1;
s += count;
n -= count;
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ return 1;
}
}
else if (c < 0xf0)
{
if (n >= 3)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (c >= 0xe1 || s[1] >= 0xa0)
- && (c != 0xed || s[1] < 0xa0))
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x0f) << 12)
- | ((unsigned int) (s[1] ^ 0x80) << 6)
- | (unsigned int) (s[2] ^ 0x80);
- return 3;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
+ {
+ *puc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (s[1] ^ 0x80) << 6)
+ | (unsigned int) (s[2] ^ 0x80);
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
}
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else
+ return 2;
}
}
else if (c < 0xf8)
{
if (n >= 4)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40
- && (c >= 0xf1 || s[1] >= 0x90)
+ if ((s[1] ^ 0x80) < 0x40)
+ {
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xf1 || s[1] >= 0x90)
#if 1
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
+ && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
#endif
- )
- {
- *puc = ((unsigned int) (c & 0x07) << 18)
- | ((unsigned int) (s[1] ^ 0x80) << 12)
- | ((unsigned int) (s[2] ^ 0x80) << 6)
- | (unsigned int) (s[3] ^ 0x80);
- return 4;
+ )
+ {
+ *puc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (s[1] ^ 0x80) << 12)
+ | ((unsigned int) (s[2] ^ 0x80) << 6)
+ | (unsigned int) (s[3] ^ 0x80);
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
}
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
+ return 2;
+ else
+ return 3;
}
}
#if 0
{
if (n >= 5)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (c >= 0xf9 || s[1] >= 0x88))
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x03) << 24)
- | ((unsigned int) (s[1] ^ 0x80) << 18)
- | ((unsigned int) (s[2] ^ 0x80) << 12)
- | ((unsigned int) (s[3] ^ 0x80) << 6)
- | (unsigned int) (s[4] ^ 0x80);
- return 5;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xf9 || s[1] >= 0x88)
+ {
+ *puc = ((unsigned int) (c & 0x03) << 24)
+ | ((unsigned int) (s[1] ^ 0x80) << 18)
+ | ((unsigned int) (s[2] ^ 0x80) << 12)
+ | ((unsigned int) (s[3] ^ 0x80) << 6)
+ | (unsigned int) (s[4] ^ 0x80);
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
}
{
if (n >= 6)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (s[5] ^ 0x80) < 0x40
- && (c >= 0xfd || s[1] >= 0x84))
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x01) << 30)
- | ((unsigned int) (s[1] ^ 0x80) << 24)
- | ((unsigned int) (s[2] ^ 0x80) << 18)
- | ((unsigned int) (s[3] ^ 0x80) << 12)
- | ((unsigned int) (s[4] ^ 0x80) << 6)
- | (unsigned int) (s[5] ^ 0x80);
- return 6;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if ((s[5] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xfd || s[1] >= 0x84)
+ {
+ *puc = ((unsigned int) (c & 0x01) << 30)
+ | ((unsigned int) (s[1] ^ 0x80) << 24)
+ | ((unsigned int) (s[2] ^ 0x80) << 18)
+ | ((unsigned int) (s[3] ^ 0x80) << 12)
+ | ((unsigned int) (s[4] ^ 0x80) << 6)
+ | (unsigned int) (s[5] ^ 0x80);
+ return 6;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 6;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
}
| (unsigned int) (s[1] ^ 0x80);
return 2;
}
+#if CONFIG_UNICODE_SAFETY
/* invalid multibyte character */
+#endif
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ return 1;
}
}
else if (c < 0xf0)
if (n >= 3)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (c >= 0xe1 || s[1] >= 0xa0)
- && (c != 0xed || s[1] < 0xa0))
-#endif
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x0f) << 12)
- | ((unsigned int) (s[1] ^ 0x80) << 6)
- | (unsigned int) (s[2] ^ 0x80);
- return 3;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
+#endif
+ {
+ *puc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (s[1] ^ 0x80) << 6)
+ | (unsigned int) (s[2] ^ 0x80);
+ return 3;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else
+ return 2;
}
}
else if (c < 0xf8)
if (n >= 4)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40
- && (c >= 0xf1 || s[1] >= 0x90)
+ if ((s[1] ^ 0x80) < 0x40)
+ {
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xf1 || s[1] >= 0x90)
#if 1
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
+ && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
#endif
- )
+ )
#endif
- {
- *puc = ((unsigned int) (c & 0x07) << 18)
- | ((unsigned int) (s[1] ^ 0x80) << 12)
- | ((unsigned int) (s[2] ^ 0x80) << 6)
- | (unsigned int) (s[3] ^ 0x80);
- return 4;
+ {
+ *puc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (s[1] ^ 0x80) << 12)
+ | ((unsigned int) (s[2] ^ 0x80) << 6)
+ | (unsigned int) (s[3] ^ 0x80);
+ return 4;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
+ return 2;
+ else
+ return 3;
}
}
#if 0
if (n >= 5)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (c >= 0xf9 || s[1] >= 0x88))
-#endif
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x03) << 24)
- | ((unsigned int) (s[1] ^ 0x80) << 18)
- | ((unsigned int) (s[2] ^ 0x80) << 12)
- | ((unsigned int) (s[3] ^ 0x80) << 6)
- | (unsigned int) (s[4] ^ 0x80);
- return 5;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xf9 || s[1] >= 0x88)
+#endif
+ {
+ *puc = ((unsigned int) (c & 0x03) << 24)
+ | ((unsigned int) (s[1] ^ 0x80) << 18)
+ | ((unsigned int) (s[2] ^ 0x80) << 12)
+ | ((unsigned int) (s[3] ^ 0x80) << 6)
+ | (unsigned int) (s[4] ^ 0x80);
+ return 5;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
if (n >= 6)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (s[5] ^ 0x80) < 0x40
- && (c >= 0xfd || s[1] >= 0x84))
-#endif
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x01) << 30)
- | ((unsigned int) (s[1] ^ 0x80) << 24)
- | ((unsigned int) (s[2] ^ 0x80) << 18)
- | ((unsigned int) (s[3] ^ 0x80) << 12)
- | ((unsigned int) (s[4] ^ 0x80) << 6)
- | (unsigned int) (s[5] ^ 0x80);
- return 6;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if ((s[5] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xfd || s[1] >= 0x84)
+#endif
+ {
+ *puc = ((unsigned int) (c & 0x01) << 30)
+ | ((unsigned int) (s[1] ^ 0x80) << 24)
+ | ((unsigned int) (s[2] ^ 0x80) << 18)
+ | ((unsigned int) (s[3] ^ 0x80) << 12)
+ | ((unsigned int) (s[4] ^ 0x80) << 6)
+ | (unsigned int) (s[5] ^ 0x80);
+ return 6;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 6;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
| (unsigned int) (s[1] ^ 0x80);
return 2;
}
+#if CONFIG_UNICODE_SAFETY
/* invalid multibyte character */
+#endif
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ return 1;
}
}
else if (c < 0xf0)
if (n >= 3)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (c >= 0xe1 || s[1] >= 0xa0)
- && (c != 0xed || s[1] < 0xa0))
-#endif
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x0f) << 12)
- | ((unsigned int) (s[1] ^ 0x80) << 6)
- | (unsigned int) (s[2] ^ 0x80);
- return 3;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
+#endif
+ {
+ *puc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (s[1] ^ 0x80) << 6)
+ | (unsigned int) (s[2] ^ 0x80);
+ return 3;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else
+ return 2;
}
}
else if (c < 0xf8)
if (n >= 4)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40
- && (c >= 0xf1 || s[1] >= 0x90)
+ if ((s[1] ^ 0x80) < 0x40)
+ {
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xf1 || s[1] >= 0x90)
#if 1
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
+ && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
#endif
- )
+ )
#endif
- {
- *puc = ((unsigned int) (c & 0x07) << 18)
- | ((unsigned int) (s[1] ^ 0x80) << 12)
- | ((unsigned int) (s[2] ^ 0x80) << 6)
- | (unsigned int) (s[3] ^ 0x80);
- return 4;
+ {
+ *puc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (s[1] ^ 0x80) << 12)
+ | ((unsigned int) (s[2] ^ 0x80) << 6)
+ | (unsigned int) (s[3] ^ 0x80);
+ return 4;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
+ return 2;
+ else
+ return 3;
}
}
#if 0
if (n >= 5)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (c >= 0xf9 || s[1] >= 0x88))
-#endif
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x03) << 24)
- | ((unsigned int) (s[1] ^ 0x80) << 18)
- | ((unsigned int) (s[2] ^ 0x80) << 12)
- | ((unsigned int) (s[3] ^ 0x80) << 6)
- | (unsigned int) (s[4] ^ 0x80);
- return 5;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xf9 || s[1] >= 0x88)
+#endif
+ {
+ *puc = ((unsigned int) (c & 0x03) << 24)
+ | ((unsigned int) (s[1] ^ 0x80) << 18)
+ | ((unsigned int) (s[2] ^ 0x80) << 12)
+ | ((unsigned int) (s[3] ^ 0x80) << 6)
+ | (unsigned int) (s[4] ^ 0x80);
+ return 5;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
if (n >= 6)
{
#if CONFIG_UNICODE_SAFETY
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (s[5] ^ 0x80) < 0x40
- && (c >= 0xfd || s[1] >= 0x84))
-#endif
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x01) << 30)
- | ((unsigned int) (s[1] ^ 0x80) << 24)
- | ((unsigned int) (s[2] ^ 0x80) << 18)
- | ((unsigned int) (s[3] ^ 0x80) << 12)
- | ((unsigned int) (s[4] ^ 0x80) << 6)
- | (unsigned int) (s[5] ^ 0x80);
- return 6;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if ((s[5] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xfd || s[1] >= 0x84)
+#endif
+ {
+ *puc = ((unsigned int) (c & 0x01) << 30)
+ | ((unsigned int) (s[1] ^ 0x80) << 24)
+ | ((unsigned int) (s[2] ^ 0x80) << 18)
+ | ((unsigned int) (s[3] ^ 0x80) << 12)
+ | ((unsigned int) (s[4] ^ 0x80) << 6)
+ | (unsigned int) (s[5] ^ 0x80);
+ return 6;
+ }
+#if CONFIG_UNICODE_SAFETY
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 6;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
+#endif
}
else
{
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ return 1;
}
}
else if (c < 0xf0)
{
if (n >= 3)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (c >= 0xe1 || s[1] >= 0xa0)
- && (c != 0xed || s[1] < 0xa0))
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x0f) << 12)
- | ((unsigned int) (s[1] ^ 0x80) << 6)
- | (unsigned int) (s[2] ^ 0x80);
- return 3;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0))
+ {
+ *puc = ((unsigned int) (c & 0x0f) << 12)
+ | ((unsigned int) (s[1] ^ 0x80) << 6)
+ | (unsigned int) (s[2] ^ 0x80);
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
}
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else
+ return 2;
}
}
else if (c < 0xf8)
{
if (n >= 4)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40
- && (c >= 0xf1 || s[1] >= 0x90)
+ if ((s[1] ^ 0x80) < 0x40)
+ {
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((c >= 0xf1 || s[1] >= 0x90)
#if 1
- && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
+ && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))
#endif
- )
- {
- *puc = ((unsigned int) (c & 0x07) << 18)
- | ((unsigned int) (s[1] ^ 0x80) << 12)
- | ((unsigned int) (s[2] ^ 0x80) << 6)
- | (unsigned int) (s[3] ^ 0x80);
- return 4;
+ )
+ {
+ *puc = ((unsigned int) (c & 0x07) << 18)
+ | ((unsigned int) (s[1] ^ 0x80) << 12)
+ | ((unsigned int) (s[2] ^ 0x80) << 6)
+ | (unsigned int) (s[3] ^ 0x80);
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 2;
}
/* invalid multibyte character */
}
{
/* incomplete multibyte character */
*puc = 0xfffd;
- return n;
+ if (n == 1 || (s[1] ^ 0x80) >= 0x40)
+ return 1;
+ else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
+ return 2;
+ else
+ return 3;
}
}
#if 0
{
if (n >= 5)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (c >= 0xf9 || s[1] >= 0x88))
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x03) << 24)
- | ((unsigned int) (s[1] ^ 0x80) << 18)
- | ((unsigned int) (s[2] ^ 0x80) << 12)
- | ((unsigned int) (s[3] ^ 0x80) << 6)
- | (unsigned int) (s[4] ^ 0x80);
- return 5;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xf9 || s[1] >= 0x88)
+ {
+ *puc = ((unsigned int) (c & 0x03) << 24)
+ | ((unsigned int) (s[1] ^ 0x80) << 18)
+ | ((unsigned int) (s[2] ^ 0x80) << 12)
+ | ((unsigned int) (s[3] ^ 0x80) << 6)
+ | (unsigned int) (s[4] ^ 0x80);
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
}
{
if (n >= 6)
{
- if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (s[5] ^ 0x80) < 0x40
- && (c >= 0xfd || s[1] >= 0x84))
+ if ((s[1] ^ 0x80) < 0x40)
{
- *puc = ((unsigned int) (c & 0x01) << 30)
- | ((unsigned int) (s[1] ^ 0x80) << 24)
- | ((unsigned int) (s[2] ^ 0x80) << 18)
- | ((unsigned int) (s[3] ^ 0x80) << 12)
- | ((unsigned int) (s[4] ^ 0x80) << 6)
- | (unsigned int) (s[5] ^ 0x80);
- return 6;
+ if ((s[2] ^ 0x80) < 0x40)
+ {
+ if ((s[3] ^ 0x80) < 0x40)
+ {
+ if ((s[4] ^ 0x80) < 0x40)
+ {
+ if ((s[5] ^ 0x80) < 0x40)
+ {
+ if (c >= 0xfd || s[1] >= 0x84)
+ {
+ *puc = ((unsigned int) (c & 0x01) << 30)
+ | ((unsigned int) (s[1] ^ 0x80) << 24)
+ | ((unsigned int) (s[2] ^ 0x80) << 18)
+ | ((unsigned int) (s[3] ^ 0x80) << 12)
+ | ((unsigned int) (s[4] ^ 0x80) << 6)
+ | (unsigned int) (s[5] ^ 0x80);
+ return 6;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 6;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 5;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 4;
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 3;
+ }
+ /* invalid multibyte character */
+ return 2;
}
/* invalid multibyte character */
}
unistr/u8-mbtoucr
configure.ac:
-gl_LIBUNISTRING_MODULE([0.9.3], [unistr/u8-mbsnlen])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mbsnlen])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U8_MBSNLEN
configure.ac:
gl_MODULE_INDICATOR([unistr/u8-mbtouc])
-gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-mbtouc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mbtouc])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U8_MBTOUC
configure.ac:
gl_MODULE_INDICATOR([unistr/u8-mbtouc-unsafe])
-gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-mbtouc-unsafe])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mbtouc-unsafe])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U8_MBTOUC_UNSAFE
}
}
+ /* Test behaviour required by ISO 10646-1, sections R.7 and 2.3c, namely,
+ that a "malformed sequence" is interpreted in the same way as
+ "a character that is outside the adopted subset".
+ Reference:
+ Markus Kuhn: UTF-8 decoder capability and stress test
+ <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
+ <http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html>
+ */
+ /* 3.1. Test that each unexpected continuation byte is signalled as a
+ malformed sequence of its own. */
+ {
+ static const uint8_t input[] = { '"', 0x80, 0xBF, 0x80, 0xBF, '"' };
+ ASSERT (u8_mbsnlen (input, 6) == 6);
+ }
+ /* 3.2. Lonely start characters. */
+ {
+ ucs4_t c;
+ uint8_t input[2];
+
+ for (c = 0xC0; c <= 0xFF; c++)
+ {
+ input[0] = c;
+ input[1] = ' ';
+
+ ASSERT (u8_mbsnlen (input, 2) == 2);
+ }
+ }
+ /* 3.3. Sequences with last continuation byte missing. */
+ /* 3.3.1. 2-byte sequence with last byte missing (U+0000). */
+ {
+ static const uint8_t input[] = { '"', 0xC0, '"' };
+ ASSERT (u8_mbsnlen (input, 3) == 3);
+ }
+ /* 3.3.6. 2-byte sequence with last byte missing (U-000007FF). */
+ {
+ static const uint8_t input[] = { '"', 0xDF, '"' };
+ ASSERT (u8_mbsnlen (input, 3) == 3);
+ }
+ /* 3.3.2. 3-byte sequence with last byte missing (U+0000). */
+ {
+ static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+ ASSERT (u8_mbsnlen (input, 4) == 3);
+ }
+ /* 3.3.7. 3-byte sequence with last byte missing (U-0000FFFF). */
+ {
+ static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' };
+ ASSERT (u8_mbsnlen (input, 4) == 3);
+ }
+ /* 3.3.3. 4-byte sequence with last byte missing (U+0000). */
+ {
+ static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+ ASSERT (u8_mbsnlen (input, 5) == 3);
+ }
+ /* 3.3.8. 4-byte sequence with last byte missing (U-001FFFFF). */
+ {
+ static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+ ASSERT (u8_mbsnlen (input, 5) == 3);
+ }
+
return 0;
}
#include "test-u8-mbtouc.h"
+static void
+test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
+{
+ ucs4_t uc;
+ int ret;
+
+ /* Test behaviour required by ISO 10646-1, sections R.7 and 2.3c, namely,
+ that a "malformed sequence" is interpreted in the same way as
+ "a character that is outside the adopted subset".
+ Reference:
+ Markus Kuhn: UTF-8 decoder capability and stress test
+ <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
+ <http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html>
+ */
+ /* 3.1. Test that each unexpected continuation byte is signalled as a
+ malformed sequence of its own. */
+ {
+ static const uint8_t input[] = { '"', 0x80, 0xBF, 0x80, 0xBF, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 6);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 5);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 2, 4);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 3, 3);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 4, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 5, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+ /* 3.2. Lonely start characters. */
+ {
+ ucs4_t c;
+ uint8_t input[2];
+
+ for (c = 0xC0; c <= 0xFF; c++)
+ {
+ input[0] = c;
+ input[1] = ' ';
+
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ }
+ }
+ /* 3.3. Sequences with last continuation byte missing. */
+ /* 3.3.1. 2-byte sequence with last byte missing (U+0000). */
+ {
+ static const uint8_t input[] = { '"', 0xC0, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 3);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 2, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+ /* 3.3.6. 2-byte sequence with last byte missing (U-000007FF). */
+ {
+ static const uint8_t input[] = { '"', 0xDF, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 3);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 2);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 2, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+ /* 3.3.2. 3-byte sequence with last byte missing (U+0000). */
+ {
+ static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 4);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 3);
+ ASSERT (ret == 2);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 3, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+ /* 3.3.7. 3-byte sequence with last byte missing (U-0000FFFF). */
+ {
+ static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 4);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 3);
+ ASSERT (ret == 2);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 3, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+ /* 3.3.3. 4-byte sequence with last byte missing (U+0000). */
+ {
+ static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 5);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 4);
+ ASSERT (ret == 3);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 4, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+ /* 3.3.8. 4-byte sequence with last byte missing (U-001FFFFF). */
+ {
+ static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input, 5);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 1, 4);
+ ASSERT (ret == 3);
+ ASSERT (uc == 0xFFFD);
+ uc = 0xBADFACE;
+ ret = my_u8_mbtouc (&uc, input + 4, 1);
+ ASSERT (ret == 1);
+ ASSERT (uc == 0x0022);
+ }
+}
+
int
main ()
{
test_function (u8_mbtouc);
+ test_safe_function (u8_mbtouc);
return 0;
}
static const uint8_t input[] = { 0xF3, 0xD0, 0xBF };
uc = 0xBADFACE;
ret = my_u8_mbtouc (&uc, input, 3);
- ASSERT (ret == 1 || ret == 3);
+ ASSERT (ret == 1);
ASSERT (uc == 0xFFFD);
}
{
static const uint8_t input[] = { 0xF3, 0x8F, 0xD0 };
uc = 0xBADFACE;
ret = my_u8_mbtouc (&uc, input, 3);
- ASSERT (ret == 1 || ret == 3);
+ ASSERT (ret == 2);
ASSERT (uc == 0xFFFD);
}
}