+2009-06-30 Bruno Haible <bruno@clisp.org>
+
+ Reduce the number of uc_is_cased calls.
+ * lib/unicase.h (casing_suffix_context_t): Add
+ 'first_char_except_ignorable' field.
+ * lib/unicase/context.h (SCC_FINAL_SIGMA_MASK): Remove macro.
+ (SCC_MORE_ABOVE_MASK, SCC_BEFORE_DOT_MASK): Update.
+ * lib/unicase/empty-suffix-context.c (unicase_empty_suffix_context):
+ Update initializer.
+ * lib/unicase/u-casemap.h (FUNC): Don't invoke uc_is_cased on
+ case-ignorable characters.
+ * lib/unicase/u-ct-totitle.h (FUNC): Likewise.
+ * lib/unicase/u-suffix-context.h (FUNC2): Don't call uc_is_cased here.
+ * modules/unicase/u8-suffix-context (Depends-on): Remove unicase/cased.
+ * modules/unicase/u16-suffix-context (Depends-on): Likewise.
+ * modules/unicase/u32-suffix-context (Depends-on): Likewise.
+
2009-06-30 Bruno Haible <bruno@clisp.org>
Tests for module 'unicase/ignorable'.
typedef struct casing_suffix_context
{
/* These fields are private, undocumented. */
+ uint32_t first_char_except_ignorable;
uint32_t bits;
- uint32_t unused_bits;
}
casing_suffix_context_t;
/* The case-mapping context of the empty suffix string. */
casing_suffix_context_t contains the following fields:
// For evaluating the FINAL_SIGMA condition:
- // Bit 0 is set if the suffix starts with a sequence consisting of a
- // case-ignorable sequence and then a cased letter.
- //
+ // First character of the suffix that is not case-ignorable.
+ ucs4_t first_char_except_ignorable;
+
// For evaluating the MORE_ABOVE condition:
- // Bit 1 is set if the suffix contains a character of combining class
+ // Bit 0 is set if the suffix contains a character of combining class
// 230 (Above) with no character of combining class 0 or 230 (Above)
// before it.
//
// For evaluating the BEFORE_DOT condition:
- // Bit 2 is set if the suffix contains a COMBINING DOT ABOVE (U+0307)
+ // Bit 1 is set if the suffix contains a COMBINING DOT ABOVE (U+0307)
// with no character of combining class 0 or 230 (Above) before it.
//
uint32_t bits;
- */
-#define SCC_FINAL_SIGMA_MASK 1
-#define SCC_MORE_ABOVE_MASK 2
-#define SCC_BEFORE_DOT_MASK 4
+
+ Three bits would be sufficient to carry the context information, but
+ that would require invoking uc_is_cased ahead of time, more often than
+ actually needed. */
+#define SCC_MORE_ABOVE_MASK 1
+#define SCC_BEFORE_DOT_MASK 2
const casing_suffix_context_t unicase_empty_suffix_context =
{
- 0 /* bits */,
- 0 /* unused_bits */
+ 0xFFFD /* first_char_except_ignorable */,
+ 0 /* bits */
};
{
ucs4_t uc2;
int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
- if (uc_is_cased (uc2))
+ /* Our uc_is_case_ignorable function is
+ known to return false for all cased
+ characters. So we can call
+ uc_is_case_ignorable first. */
+ if (!uc_is_case_ignorable (uc2))
{
- applies = false;
+ applies = ! uc_is_cased (uc2);
break;
}
- if (!uc_is_case_ignorable (uc2))
- break;
s2 += count2;
}
else
{
- applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
+ applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
break;
}
}
{
ucs4_t uc2;
int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
- if (uc_is_cased (uc2))
+ /* Our uc_is_case_ignorable function is
+ known to return false for all cased
+ characters. So we can call
+ uc_is_case_ignorable first. */
+ if (!uc_is_case_ignorable (uc2))
{
- applies = false;
+ applies = ! uc_is_cased (uc2);
break;
}
- if (!uc_is_case_ignorable (uc2))
- break;
s2 += count2;
}
else
{
- applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
+ applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
break;
}
}
/* Evaluate all three conditions in a single pass through the string S.
The three variables are -1 as long as the value of the condition has
not been determined. */
- int scc_FINAL_SIGMA = -1;
+ ucs4_t first_char_except_ignorable = (ucs4_t)(-1);
int scc_MORE_ABOVE = -1;
int scc_BEFORE_DOT = -1;
const UNIT *s_end = s + n;
ucs4_t uc;
int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
- if (scc_FINAL_SIGMA < 0)
+ if (first_char_except_ignorable == (ucs4_t)(-1))
{
- if (uc_is_cased (uc))
- scc_FINAL_SIGMA = SCC_FINAL_SIGMA_MASK;
- else if (!uc_is_case_ignorable (uc))
- scc_FINAL_SIGMA = 0;
+ if (!uc_is_case_ignorable (uc))
+ first_char_except_ignorable = uc;
}
if (scc_MORE_ABOVE < 0)
}
}
- if ((scc_FINAL_SIGMA | scc_MORE_ABOVE | scc_BEFORE_DOT) >= 0)
+ if (first_char_except_ignorable != (ucs4_t)(-1)
+ && (scc_MORE_ABOVE | scc_BEFORE_DOT) >= 0)
/* All conditions have been determined. */
break;
/* For those conditions that have not been determined so far, use the
value from the argument context. */
+ context.first_char_except_ignorable =
+ (first_char_except_ignorable != (ucs4_t)(-1)
+ ? first_char_except_ignorable
+ : a_context.first_char_except_ignorable);
context.bits =
- (scc_FINAL_SIGMA >= 0
- ? scc_FINAL_SIGMA
- : a_context.bits & SCC_FINAL_SIGMA_MASK)
- | (scc_MORE_ABOVE >= 0
- ? scc_MORE_ABOVE
- : a_context.bits & SCC_MORE_ABOVE_MASK)
+ (scc_MORE_ABOVE >= 0
+ ? scc_MORE_ABOVE
+ : a_context.bits & SCC_MORE_ABOVE_MASK)
| (scc_BEFORE_DOT >= 0
? scc_BEFORE_DOT
: a_context.bits & SCC_BEFORE_DOT_MASK);
Depends-on:
unicase/base
unicase/empty-prefix-context
-unicase/cased
unicase/ignorable
unictype/combining-class
unistr/u16-mbtouc-unsafe
Depends-on:
unicase/base
unicase/empty-prefix-context
-unicase/cased
unicase/ignorable
unictype/combining-class
unistr/u32-mbtouc-unsafe
Depends-on:
unicase/base
unicase/empty-prefix-context
-unicase/cased
unicase/ignorable
unictype/combining-class
unistr/u8-mbtouc-unsafe