From: Bruno Haible
Date: Mon, 8 Jan 2007 20:37:38 +0000 (+0000)
Subject: New module 'utf8-ucs4-safe'.
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ba2ed1073bc563a58132ebc74f56e06b936024b0;p=pspp

New module 'utf8-ucs4-safe'.
---

diff --git a/ChangeLog b/ChangeLog
index 447f99183d..9a18e1ebd9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2006-12-25  Bruno Haible
+
+	* modules/utf8-ucs4-safe: New file.
+	* lib/utf8-ucs4-safe.h: New file.
+	* lib/unistr/utf8-ucs4-safe.c: New file.
+
 2007-01-08  Bruno Haible
 
 	* modules/utf8-ucs4 (Files, lib_SOURCES): Add unistr/utf8-ucs4.c.
diff --git a/lib/unistr/utf8-ucs4-safe.c b/lib/unistr/utf8-ucs4-safe.c
new file mode 100644
index 0000000000..fc6a906f44
--- /dev/null
+++ b/lib/unistr/utf8-ucs4-safe.c
@@ -0,0 +1,156 @@
+/* Conversion UTF-8 to UCS-4.
+   Copyright (C) 2001-2002, 2006 Free Software Foundation, Inc.
+   Written by Bruno Haible , 2001.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Library General Public License as published
+   by the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+   USA.  */
+
+#include <config.h>
+
+/* Specification.
*/
+#include "utf8-ucs4-safe.h"
+
+/* Out-of-line part of u8_mbtouc_safe: decode the first character of the
+   UTF-8 string S, of which N > 0 units are available, store it in *PUC and
+   return the number of units consumed.
+
+   A well-formed character yields its code point and its length (1 to 4).
+   Otherwise *PUC is set to U+FFFD and the return value is
+     - N, when all N available units form a proper prefix of a well-formed
+       character (incomplete character at the end of the input);
+     - 1, when the sequence is invalid: stray continuation unit, overlong
+       form, surrogate, value above U+10FFFF, or a lead unit >= 0xf5.
+   A unit is only ever consumed after it has been checked to belong to the
+   character started by S[0], so a truncated or broken sequence never
+   swallows the character that follows it.  */
+int
+u8_mbtouc_safe_aux (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c = *s;
+
+  if (c < 0x80)
+    {
+      /* One unit.  The inline wrapper u8_mbtouc_safe handles this case
+         itself; it is repeated here so that a direct call does not turn a
+         valid ASCII character into U+FFFD.  */
+      *puc = c;
+      return 1;
+    }
+  else if (c >= 0xc2)
+    {
+      /* 0x80..0xbf are stray continuation units and 0xc0, 0xc1 could only
+         start overlong forms; they all fall through to the error exit.  */
+      if (c < 0xe0)
+        {
+          /* Two units.  */
+          if (n >= 2 && (s[1] ^ 0x80) < 0x40)
+            {
+              *puc = ((unsigned int) (c & 0x1f) << 6)
+                     | (unsigned int) (s[1] ^ 0x80);
+              return 2;
+            }
+          /* n == 1: incomplete; otherwise invalid.  Either way exactly one
+             unit is consumed by the error exit below.  */
+        }
+      else if (c < 0xf0)
+        {
+          /* Three units.  The second unit must be a continuation unit that
+             yields neither an overlong form (0xe0 followed by < 0xa0) nor a
+             surrogate (0xed followed by >= 0xa0).  */
+          if (n >= 2
+              && (s[1] ^ 0x80) < 0x40
+              && (c >= 0xe1 || s[1] >= 0xa0)
+              && (c != 0xed || s[1] < 0xa0))
+            {
+              if (n < 3)
+                {
+                  /* incomplete multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
+                }
+              if ((s[2] ^ 0x80) < 0x40)
+                {
+                  *puc = ((unsigned int) (c & 0x0f) << 12)
+                         | ((unsigned int) (s[1] ^ 0x80) << 6)
+                         | (unsigned int) (s[2] ^ 0x80);
+                  return 3;
+                }
+              /* invalid multibyte character */
+            }
+        }
+      else if (c <= 0xf4)
+        {
+          /* Four units.  The second unit must be a continuation unit that
+             yields neither an overlong form (0xf0 followed by < 0x90) nor a
+             value above U+10FFFF (0xf4 followed by >= 0x90).  */
+          if (n >= 2
+              && (s[1] ^ 0x80) < 0x40
+              && (c >= 0xf1 || s[1] >= 0x90)
+              && (c < 0xf4 || s[1] < 0x90))
+            {
+              if (n < 3)
+                {
+                  /* incomplete multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
+                }
+              if ((s[2] ^ 0x80) < 0x40)
+                {
+                  if (n < 4)
+                    {
+                      /* incomplete multibyte character */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+                  if ((s[3] ^ 0x80) < 0x40)
+                    {
+                      *puc = ((unsigned int) (c & 0x07) << 18)
+                             | ((unsigned int) (s[1] ^ 0x80) << 12)
+                             | ((unsigned int) (s[2] ^ 0x80) << 6)
+                             | (unsigned int) (s[3] ^ 0x80);
+                      return 4;
+                    }
+                }
+              /* invalid multibyte character */
+            }
+        }
+      /* Lead units 0xf5..0xff (including the historical 5- and 6-unit
+         forms) cannot start a well-formed character.  */
+    }
+  /* invalid multibyte character, or a lone lead unit with n == 1 */
+  *puc = 0xfffd;
+  return 1;
+}
diff --git a/lib/utf8-ucs4-safe.h b/lib/utf8-ucs4-safe.h
new file mode 100644
index 0000000000..ebf864b39b
--- /dev/null
+++ b/lib/utf8-ucs4-safe.h
@@ -0,0 +1,45 @@
+/* Conversion UTF-8 to UCS-4.
+   Copyright (C) 2001-2002, 2005-2007 Free Software Foundation, Inc.
+   Written by Bruno Haible , 2001.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Library General Public License as published
+   by the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+   USA.  */
+
+#ifndef _UTF8_UCS4_SAFE_H
+#define _UTF8_UCS4_SAFE_H
+
+#include <stddef.h>
+#include "unitypes.h"
+
+extern int u8_mbtouc_safe_aux (ucs4_t *puc, const uint8_t *s, size_t n);
+
+/* Return the length (number of units) of the first character in S, putting
+   its 'ucs4_t' representation in *PUC.
+   The number of available units, N, must be > 0.
*/
+static inline int
+u8_mbtouc_safe (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  const uint8_t lead = s[0];
+
+  /* Every unit with the high bit set is handed to the out-of-line decoder,
+     so that only the one-unit fast path is expanded at each call site.  */
+  if (lead >= 0x80)
+    return u8_mbtouc_safe_aux (puc, s, n);
+
+  /* One-unit character: its code point is the unit itself.  */
+  *puc = lead;
+  return 1;
+}
+
+#endif /* _UTF8_UCS4_SAFE_H */
diff --git a/modules/utf8-ucs4-safe b/modules/utf8-ucs4-safe
new file mode 100644
index 0000000000..8c4dd3254a
--- /dev/null
+++ b/modules/utf8-ucs4-safe
@@ -0,0 +1,25 @@
+Description:
+Conversion UTF-8 to UCS-4.
+
+Files:
+lib/utf8-ucs4-safe.h
+lib/unistr/utf8-ucs4-safe.c
+m4/utf-ucs4.m4
+
+Depends-on:
+
+configure.ac:
+gl_UTF_UCS4
+
+Makefile.am:
+lib_SOURCES += utf8-ucs4-safe.h unistr/utf8-ucs4-safe.c
+
+Include:
+"utf8-ucs4-safe.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+