/* dump2: Tolerate UTF-8, not just ASCII, in strings.

   UTF-8 validation code introduced by the patch of that title (Ben Pfaff,
   2015-08-07).  The patch adds u8-mbtouc.h and u8-mbtouc.c, links
   u8-mbtouc.o into dump2, and changes get_fixed_string() in dump2.c to
   reject a string when all_utf8() -- instead of all_ascii() -- returns
   false.  get_fixed_string() itself is only partly visible in the patch and
   is therefore not reproduced here.  */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* ---- u8-mbtouc.h ---- */

/* A Unicode code point.  */
typedef uint32_t ucs4_t;

int u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n);

/* ---- u8-mbtouc.c ---- */

/* Look at first character in UTF-8 string.
   Copyright (C) 1999-2002, 2006-2007, 2009-2015 Free Software Foundation, Inc.
   Written by Bruno Haible, 2001.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Decodes the first character of the UTF-8 sequence S, of which N bytes are
   available, and stores its code point in *PUC.

   Returns the number of bytes that make up the character, from 1 to 4.  If
   S starts with an invalid or incomplete sequence, stores 0xfffd in *PUC
   and returns the number of bytes to skip, which is at least 1 and never
   more than N.

   S[0] is read before N is examined, so N must be at least 1.

   Throughout, "(b ^ 0x80) < 0x40" is true exactly when B lies in
   0x80...0xbf, i.e. when B is a continuation byte.  */
int
u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
{
  uint8_t c = *s;

  if (c < 0x80)
    {
      /* Single-byte (ASCII) character.  */
      *puc = c;
      return 1;
    }
  else if (c >= 0xc2)
    {
      /* Lead bytes 0x80...0xc1 never reach this point; they fall through
         to the final "invalid" return.  */
      if (c < 0xe0)
        {
          /* Lead byte of a 2-byte sequence.  */
          if (n >= 2)
            {
              if ((s[1] ^ 0x80) < 0x40)
                {
                  *puc = ((unsigned int) (c & 0x1f) << 6)
                         | (unsigned int) (s[1] ^ 0x80);
                  return 2;
                }
              /* invalid multibyte character */
            }
          else
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return 1;
            }
        }
      else if (c < 0xf0)
        {
          /* Lead byte of a 3-byte sequence.  */
          if (n >= 3)
            {
              if ((s[1] ^ 0x80) < 0x40)
                {
                  if ((s[2] ^ 0x80) < 0x40)
                    {
                      /* The first test rejects overlong encodings
                         (0xe0 followed by 0x80...0x9f); the second rejects
                         0xed followed by 0xa0...0xbf, which would decode
                         to the surrogate range 0xd800...0xdfff.  */
                      if ((c >= 0xe1 || s[1] >= 0xa0)
                          && (c != 0xed || s[1] < 0xa0))
                        {
                          *puc = ((unsigned int) (c & 0x0f) << 12)
                                 | ((unsigned int) (s[1] ^ 0x80) << 6)
                                 | (unsigned int) (s[2] ^ 0x80);
                          return 3;
                        }
                      /* invalid multibyte character */
                      *puc = 0xfffd;
                      return 3;
                    }
                  /* invalid multibyte character */
                  *puc = 0xfffd;
                  return 2;
                }
              /* invalid multibyte character */
            }
          else
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
                return 1;
              else
                return 2;
            }
        }
      else if (c < 0xf8)
        {
          /* Lead byte of a 4-byte sequence.  */
          if (n >= 4)
            {
              if ((s[1] ^ 0x80) < 0x40)
                {
                  if ((s[2] ^ 0x80) < 0x40)
                    {
                      if ((s[3] ^ 0x80) < 0x40)
                        {
                          /* The first test rejects overlong encodings
                             (0xf0 followed by 0x80...0x8f); the second
                             rejects anything that would decode to a value
                             above 0x10ffff.  */
                          if ((c >= 0xf1 || s[1] >= 0x90)
                              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
                            {
                              *puc = ((unsigned int) (c & 0x07) << 18)
                                     | ((unsigned int) (s[1] ^ 0x80) << 12)
                                     | ((unsigned int) (s[2] ^ 0x80) << 6)
                                     | (unsigned int) (s[3] ^ 0x80);
                              return 4;
                            }
                          /* invalid multibyte character */
                          *puc = 0xfffd;
                          return 4;
                        }
                      /* invalid multibyte character */
                      *puc = 0xfffd;
                      return 3;
                    }
                  /* invalid multibyte character */
                  *puc = 0xfffd;
                  return 2;
                }
              /* invalid multibyte character */
            }
          else
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
                return 1;
              else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
                return 2;
              else
                return 3;
            }
        }
    }
  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}

/* ---- dump2.c ---- */

/* Returns true if the null-terminated string P consists entirely of
   characters that u8_mbtouc() decodes successfully and that are neither
   control characters below 32, DEL (127), nor U+FFFD.

   u8_mbtouc() reports every invalid or incomplete sequence as 0xfffd, so
   malformed UTF-8 yields false; so does a correctly encoded U+FFFD.  An
   empty string yields true.  */
static bool
all_utf8 (const uint8_t *p)
{
  size_t len = strlen ((const char *) p);
  size_t ofs = 0;

  while (ofs < len)
    {
      ucs4_t uc;

      /* len - ofs is at least 1 here, as u8_mbtouc() requires.  */
      ofs += (size_t) u8_mbtouc (&uc, p + ofs, len - ofs);
      if (uc < 32 || uc == 127 || uc == 0xfffd)
        return false;
    }
  return true;
}