From 1c6891348a71ba48a6419941865904aabf71e824 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Mon, 29 Jun 2009 23:38:05 +0200 Subject: [PATCH] New module 'unicase/u8-prefix-context'. --- ChangeLog | 6 ++ lib/unicase/context.h | 63 +++++++++++++++++++++ lib/unicase/u-prefix-context.h | 92 +++++++++++++++++++++++++++++++ lib/unicase/u8-prefix-context.c | 33 +++++++++++ modules/unicase/u8-prefix-context | 30 ++++++++++ 5 files changed, 224 insertions(+) create mode 100644 lib/unicase/context.h create mode 100644 lib/unicase/u-prefix-context.h create mode 100644 lib/unicase/u8-prefix-context.c create mode 100644 modules/unicase/u8-prefix-context diff --git a/ChangeLog b/ChangeLog index addbe271a3..c5edde69e3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2009-06-29 Bruno Haible + New module 'unicase/u8-prefix-context'. + * lib/unicase/u8-prefix-context.c: New file. + * lib/unicase/u-prefix-context.h: New file. + * lib/unicase/context.h: New file. + * modules/unicase/u8-prefix-context: New file. + New module 'unicase/empty-prefix-context'. * lib/unicase/empty-prefix-context.c: New file. * modules/unicase/empty-prefix-context: New file. diff --git a/lib/unicase/context.h b/lib/unicase/context.h new file mode 100644 index 0000000000..c51a5bd436 --- /dev/null +++ b/lib/unicase/context.h @@ -0,0 +1,63 @@ +/* Case-mapping contexts of UTF-8/UTF-16/UTF-32 substring. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + + +/* The context of a prefix string combines the information of the "Before C" + conditions of the Unicode Standard, + <http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf>, section 3.13, + table 3-14 "Context Specification for Casing". + + casing_prefix_context_t contains the following fields: + + // Helper for evaluating the FINAL_SIGMA condition: + // Last character that was not case-ignorable. + ucs4_t last_char_except_ignorable; + + // Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: + // Last character that was of combining class 230 ("Above") or 0. + ucs4_t last_char_normal_or_above; + + Three bits would be sufficient to carry the context information, but + that would require to invoke uc_is_cased and uc_is_property_soft_dotted + ahead of time, more often than actually needed. */ + + +/* The context of a suffix string combines the information of the "After C" + conditions of the Unicode Standard, + <http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf>, section 3.13, + table 3-14 "Context Specification for Casing". + + casing_suffix_context_t contains the following fields: + + // For evaluating the FINAL_SIGMA condition: + // Bit 0 is set if the suffix starts with a sequence consisting of a + // case-ignorable sequence and then a cased letter. + // + // For evaluating the MORE_ABOVE condition: + // Bit 1 is set if the suffix contains a character of combining class + // 230 (Above) with no character of combining class 0 or 230 (Above) + // before it. + // + // For evaluating the BEFORE_DOT condition: + // Bit 2 is set if the suffix contains a COMBINING DOT ABOVE (U+0307) + // with no character of combining class 0 or 230 (Above) before it. 
+ // + uint32_t bits; + */ +#define SCC_FINAL_SIGMA_MASK 1 +#define SCC_MORE_ABOVE_MASK 2 +#define SCC_BEFORE_DOT_MASK 4 diff --git a/lib/unicase/u-prefix-context.h b/lib/unicase/u-prefix-context.h new file mode 100644 index 0000000000..e58b4114e0 --- /dev/null +++ b/lib/unicase/u-prefix-context.h @@ -0,0 +1,92 @@ +/* Case-mapping context of prefix UTF-8/UTF-16/UTF-32 string. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +casing_prefix_context_t +FUNC1 (const UNIT *s, size_t n) +{ + return FUNC2 (s, n, unicase_empty_prefix_context); +} + +casing_prefix_context_t +FUNC2 (const UNIT *s, size_t n, casing_prefix_context_t a_context) +{ +#if 0 + /* Forward iteration. Slow for long strings. */ + casing_prefix_context_t context = a_context; + const UNIT *s_end = s + n; + + while (s < s_end) + { + ucs4_t uc; + int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); + + if (!uc_is_case_ignorable (uc)) + context.last_char_except_ignorable = uc; + + { + int ccc = uc_combining_class (uc); + if (ccc == UC_CCC_A || ccc == UC_CCC_NR) + context.last_char_normal_or_above = uc; + } + + s += count; + } + + return context; +#else + /* Iterate backwards, only as far as needed. 
*/ + casing_prefix_context_t context; + ucs4_t last_char_except_ignorable = (ucs4_t)(-1); + ucs4_t last_char_normal_or_above = (ucs4_t)(-1); + const UNIT *p = s + n; + + for (;;) + { + ucs4_t uc; + p = U_PREV (&uc, p, s); + if (p == NULL) + break; + + if (last_char_except_ignorable == (ucs4_t)(-1)) + { + if (!uc_is_case_ignorable (uc)) + last_char_except_ignorable = uc; + } + + if (last_char_normal_or_above == (ucs4_t)(-1)) + { + int ccc = uc_combining_class (uc); + if (ccc == UC_CCC_A || ccc == UC_CCC_NR) + last_char_normal_or_above = uc; + } + + if (last_char_except_ignorable != (ucs4_t)(-1) + && last_char_normal_or_above != (ucs4_t)(-1)) + break; + } + context.last_char_except_ignorable = + (last_char_except_ignorable != (ucs4_t)(-1) + ? last_char_except_ignorable + : a_context.last_char_except_ignorable); + context.last_char_normal_or_above = + (last_char_normal_or_above != (ucs4_t)(-1) + ? last_char_normal_or_above + : a_context.last_char_normal_or_above); + + return context; +#endif +} diff --git a/lib/unicase/u8-prefix-context.c b/lib/unicase/u8-prefix-context.c new file mode 100644 index 0000000000..c7a3f60943 --- /dev/null +++ b/lib/unicase/u8-prefix-context.c @@ -0,0 +1,33 @@ +/* Case-mapping context of prefix UTF-8 string. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include <config.h> + +/* Specification. */ +#include "unicase.h" + +#include "unictype.h" +#include "unistr.h" +#include "caseprop.h" +#include "context.h" + +#define FUNC1 u8_casing_prefix_context +#define FUNC2 u8_casing_prefixes_context +#define UNIT uint8_t +#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe +#define U_PREV u8_prev +#include "u-prefix-context.h" diff --git a/modules/unicase/u8-prefix-context b/modules/unicase/u8-prefix-context new file mode 100644 index 0000000000..267bfe6fb7 --- /dev/null +++ b/modules/unicase/u8-prefix-context @@ -0,0 +1,30 @@ +Description: +Case-mapping context of prefix UTF-8 string. + +Files: +lib/unicase/u8-prefix-context.c +lib/unicase/u-prefix-context.h +lib/unicase/context.h + +Depends-on: +unicase/base +unicase/empty-prefix-context +unicase/ignorable +unictype/combining-class +unistr/u8-mbtouc-unsafe +unistr/u8-prev + +configure.ac: + +Makefile.am: +lib_SOURCES += unicase/u8-prefix-context.c + +Include: +"unicase.h" + +License: +LGPL + +Maintainer: +Bruno Haible + -- 2.30.2